package es.aconde.structured;

import java.util.concurrent.TimeoutException;
import org.apache.log4j.Level;
import org.apache.log4j.LogManager;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.StreamingQueryException;
import static org.apache.spark.sql.functions.*;
import static org.apache.spark.sql.avro.functions.*;

/**
 * Structured Streaming demo that reads an Avro-encoded Kafka topic as input
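 * <p>
 * Assumes a Kafka broker on localhost:9092 serving the topic "mytopic" with
 * messages serialized according to USER_SCHEMA, e.g. the record
 * {@code {"str1": "a", "str2": "b", "int1": 1, "packet_info": {"demo": "x"}}}.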
 *
 * @author Angel Conde
 */
public class StructuredDemo {

    private static final String USER_SCHEMA = "{"
            + "\"type\":\"record\","
            + "\"name\":\"myrecord\","
            + "\"fields\":["
            + "  { \"name\":\"str1\", \"type\":\"string\" },"
            + "  { \"name\":\"str2\", \"type\":\"string\" },"
            + "  { \"name\":\"int1\", \"type\":\"int\" },"
            + "  { \"name\": \"packet_info\", \"type\": {"
            + "        \"type\": \"record\","
            + "        \"name\": \"packet_data\","
            + "        \"fields\": ["
            + "              { \"name\": \"demo\", \"type\": \"string\" }"
            + "          ]}}"
            + "]}";

    public static void main(String[] args) throws StreamingQueryException, TimeoutException {
        //set log4j programmatically
        LogManager.getLogger("org.apache.spark").setLevel(Level.WARN);
        LogManager.getLogger("org.apache.kafka").setLevel(Level.WARN);
        LogManager.getLogger("akka").setLevel(Level.ERROR);
        //on Windows we may need to configure winutils if HADOOP_HOME is not set
        //System.setProperty("hadoop.home.dir", "c:/app/hadoop");
        //configure Spark
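        //local[*] runs Spark in-process using all available cores; binding the
        //driver to localhost avoids lookup issues on multi-homed machines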
        SparkConf conf = new SparkConf()
                .setAppName("kafka-structured")
                .set("spark.driver.bindAddress", "localhost")
                .setMaster("local[*]");

        //initialize spark session
        SparkSession sparkSession = SparkSession
                .builder()
                .config(conf)
                .getOrCreate();

        //reduce the number of shuffle partitions (default 200) for this small local demo
        sparkSession.conf().set("spark.sql.shuffle.partitions", "3");

        //data stream from kafka
        Dataset<Row> ds1 = sparkSession
                .readStream()
                .format("kafka")
                .option("kafka.bootstrap.servers", "localhost:9092")
                .option("subscribe", "mytopic")
                .option("startingOffsets", "earliest")
                .load();
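        //"earliest" makes the first run replay the topic from the beginning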
        //print the Kafka source schema: key and value (both binary), topic,
        //partition, offset, timestamp and timestampType
        ds1.printSchema();
        
        //deserialize the Avro payload in the Kafka value column and flatten
        //the nested record
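        //note: from_avro expects raw Avro bytes; messages produced through
        //Confluent's Schema Registry carry a 5-byte header that would need to
        //be stripped before from_avro can parse them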
        Dataset<Row> ds2 = ds1
                .select(from_avro(col("value"), USER_SCHEMA).as("rows"))
                .select("rows.*");

        //print the dataframe schema derived from the Avro schema :)
        ds2.printSchema();

        //count events per str1; "complete" output mode re-emits the whole
        //aggregation table to the console on every trigger
        StreamingQuery query1 = ds2
                .groupBy("str1")
                .count()
                .writeStream()
                .queryName("Test query")
                .outputMode("complete")
                .format("console")
                .start();

        //block until the query is stopped or fails
        query1.awaitTermination();

    }

}