/*********************************************************************************************************************
 # Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.                                      #
 #                                                                                                                   #
 # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance   #
 # with the License. A copy of the License is located at                                                             #
 #                                                                                                                   #
 #     http://www.apache.org/licenses/LICENSE-2.0                                                                    #
 #                                                                                                                   #
 # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES #
 # OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions    #
 # and limitations under the License.                                                                                #
 *********************************************************************************************************************/
package com.demo.consumer;

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.text.SimpleDateFormat;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kinesis.KinesisInitialPositions;
import org.apache.spark.streaming.kinesis.KinesisInputDStream;

import scala.reflect.ClassTag$;

import com.amazonaws.auth.DefaultAWSCredentialsProviderChain;
import com.amazonaws.client.builder.AwsClientBuilder.EndpointConfiguration;
import com.amazonaws.services.kinesis.AmazonKinesisClientBuilder;
import com.amazonaws.services.kinesis.AmazonKinesis;

import com.demo.model.Record;

public class KinesisConsumer {

    private static final Logger LOGGER = LoggerFactory.getLogger(KinesisConsumer.class);
    private static final String DELIMITER = ",";

    public static void main(String[] args) throws Exception {
        // Check that all required args were passed in
        if (args.length != 4) {
            System.err.println(
                "Usage: KinesisConsumer <app-name> <stream-name> <region-name> <output-location>\n\n"
                + "    <app-name> is the name of the app, used to track the read data in DynamoDB\n"
                + "    <stream-name> is the name of the Kinesis stream\n"
                + "    <region-name> is the region where the Kinesis stream is created\n"
                + "    <output-location> is the S3 bucket where the data should be stored\n");
            System.exit(1);
        }
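
        // As a hypothetical example (the app, stream, and bucket names below are illustrative,
        // not part of this repo), the job might be launched with spark-submit like this:
        //
        //   spark-submit --class com.demo.consumer.KinesisConsumer \
        //       demo-consumer.jar demo-app demo-stream us-east-1 demo-output-bucket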

        // Populate the appropriate variables from the given args
        String kinesisAppName = args[0];
        String streamName = args[1];
        String regionName = args[2];
        String outputLocation = args[3];
        String endpointURL = "https://kinesis." + regionName + ".amazonaws.com";
        LOGGER.info("EndpointURL is " + endpointURL);

        // Date format used to build the time-based S3 output partition folder, e.g. 2020/01/15/10/30
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy/MM/dd/HH/mm");

        // Create a Kinesis client in order to determine the number of shards for the given stream
        AmazonKinesisClientBuilder clientBuilder = AmazonKinesisClientBuilder.standard()
            .withEndpointConfiguration(new EndpointConfiguration(endpointURL, regionName))
            .withCredentials(DefaultAWSCredentialsProviderChain.getInstance());
        AmazonKinesis kinesis = clientBuilder.build();
        int numShards = kinesis.describeStream(streamName).getStreamDescription().getShards().size();
        // Read each shard with its own DStream so the shards are consumed in parallel
        int numStreams = numShards;
        LOGGER.info("Number of shards is " + numShards);

        // Spark Streaming batch interval
        Duration batchInterval = Durations.minutes(1);

        // Kinesis checkpoint interval. Same as batchInterval for this example.
        Duration kinesisCheckpointInterval = batchInterval;

        // Set up the Spark config and StreamingContext
        SparkConf sparkConfig = new SparkConf().setAppName(kinesisAppName);
        JavaStreamingContext jssc = new JavaStreamingContext(sparkConfig, batchInterval);

        List<JavaDStream<byte[]>> streamsList = new ArrayList<>(numStreams);
        for (int i = 0; i < numStreams; i++) {
            streamsList.add(JavaDStream.fromDStream(
                KinesisInputDStream.builder()
                    .streamingContext(jssc)
                    .checkpointAppName(kinesisAppName)
                    .streamName(streamName)
                    .endpointUrl(endpointURL)
                    .regionName(regionName)
                    .initialPosition(new KinesisInitialPositions.Latest())
                    .checkpointInterval(kinesisCheckpointInterval)
                    .storageLevel(StorageLevel.MEMORY_AND_DISK_2())
                    .build(),
                ClassTag$.MODULE$.apply(byte[].class)));
        }

        JavaDStream<byte[]> unionStreams;
        if (streamsList.size() > 1) {
            // Union all the streams if there is more than one stream
            LOGGER.info("Stream size is greater than 1");
            unionStreams = jssc.union(streamsList.get(0), streamsList.subList(1, streamsList.size()));
        } else {
            // Otherwise, just use the single stream
            LOGGER.info("Stream size is equal to 1");
            unionStreams = streamsList.get(0);
        }

        // Convert each byte[] record payload to a String and split it on newlines,
        // since one Kinesis record may carry several newline-separated CSV lines
        JavaDStream<String> items = unionStreams.flatMap(new FlatMapFunction<byte[], String>() {
            @Override
            public Iterator<String> call(byte[] line) {
                String s = new String(line, StandardCharsets.UTF_8);
                return Arrays.asList(s.split("\n")).iterator();
            }
        });

        // Convert the RDDs of the items DStream to DataFrames and run a SQL query on each batch
        items.window(Durations.minutes(1)).foreachRDD((rdd, time) -> {
            LOGGER.info("========= Time is " + time + " =========");
            if (rdd.count() > 0) {
                String outPartitionFolder = sdf.format(time.milliseconds());
                SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());

                // Convert JavaRDD<String> to JavaRDD<Record>; each line is expected to be CSV of
                // the form zipcode,productName,price,timestamp
                JavaRDD<Record> rowRDD = rdd.map(line -> {
                    String[] parts = line.split(DELIMITER);
                    Record record = new Record();
                    record.setZipcode(Integer.parseInt(parts[0]));
                    record.setProductName(parts[1]);
                    record.setPrice(Integer.parseInt(parts[2]));
                    record.setTimestamp(parts[3]);
                    return record;
                });

                // Create a temporary view over the DataFrame so it can be queried with SQL
                Dataset<Row> recordsDataFrame = spark.createDataFrame(rowRDD, Record.class);
                recordsDataFrame.createOrReplaceTempView("records");
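
                // The same filter could also be written with the DataFrame API instead of SQL;
                // a sketch, assuming a static import of org.apache.spark.sql.functions.col:
                //
                //   Dataset<Row> products = recordsDataFrame
                //       .filter(col("price").between(40, 50))
                //       .select("productName", "price");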

                // Query the view using SQL and save the results to S3.
                // The coalesce call ensures that only one file is generated in outPartitionFolder.
                // For larger datasets, you might use javaRDD().saveAsTextFile(...) without coalesce
                // so that multiple files are written in parallel.
                Dataset<Row> products = spark.sql(
                    "SELECT productName, price FROM records WHERE price >= 40 AND price <= 50");
                products.javaRDD().coalesce(1).saveAsTextFile(
                    "s3://" + outputLocation + "/historical/" + outPartitionFolder, GzipCodec.class);
            }
        });

        // Start the streaming context and await termination
        jssc.start();
        jssc.awaitTermination();
    }
}

/**
 * Lazily instantiated, thread-safe singleton holder for the SparkSession, so that
 * foreachRDD reuses one session across micro-batches instead of building a new one.
 */
class JavaSparkSessionSingleton {

    private static volatile SparkSession instance = null;

    private JavaSparkSessionSingleton() {
    }

    public static SparkSession getInstance(SparkConf sparkConf) {
        // Double-checked locking: synchronize only on the slow path, before first creation
        if (instance == null) {
            synchronized (JavaSparkSessionSingleton.class) {
                if (instance == null) {
                    instance = SparkSession.builder().config(sparkConf).getOrCreate();
                }
            }
        }
        return instance;
    }
}
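
// For reference, com.demo.model.Record is not shown in this file. From the setters called in
// main() and the call to spark.createDataFrame(rowRDD, Record.class), it is assumed to be a
// plain Java bean along these lines (a sketch, not the actual class from this repo):
//
//   public class Record implements java.io.Serializable {
//       private int zipcode;
//       private String productName;
//       private int price;
//       private String timestamp;
//       // ...plus a standard getter and setter for each field
//   }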