package org.infinispan.spark.examples.twitter

import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
import org.infinispan.spark.examples.twitter.Sample.{getSparkConf, usage}
import org.infinispan.spark.rdd.InfinispanRDD

/**
  * This demo will group tweets by country and print the top 20 countries, using Spark SQL support.
  *
  * @author gustavonalle
  */
object SQLAggregationScala {

   /** Maximum number of countries printed by the demo. */
   private val TopCountries = 20

   /**
     * Entry point of the demo.
     *
     * Validates the command line, then runs the aggregation against the Infinispan
     * server whose host is given as the first argument.
     *
     * @param args command line arguments; `args(0)` is used as the Infinispan host
     */
   def main(args: Array[String]): Unit = {
      if (args.length < 1) {
         // NOTE(review): usage(...) presumably prints help and terminates the JVM - confirm.
         // The else branch guarantees args(0) is never read on an empty array either way.
         usage("SQLAggregationScala")
      } else {
         run(args(0))
      }
   }

   /**
     * Reads the tweets stored in Infinispan, exposes them to Spark SQL as the `tweets`
     * temp view, and prints the countries with the most tweets, one row per line.
     *
     * @param infinispanHost host passed to `Sample.getConnectorConf` to build the connector configuration
     */
   private def run(infinispanHost: String): Unit = {
      // Reduce the log level in the driver
      Logger.getLogger("org").setLevel(Level.WARN)

      // Create Spark Context
      val conf = getSparkConf("spark-infinispan-rdd-aggregation-scala")
      val sc = new SparkContext(conf)

      try {
         // Populate infinispan properties
         val config = Sample.getConnectorConf(infinispanHost)

         // Create RDD from infinispan data
         val infinispanRDD = new InfinispanRDD[Long, Tweet](sc, config)

         // Create a SparkSession on top of the existing context, build a data frame from
         // the cache values and register it as a temp view
         val valuesRDD = infinispanRDD.values
         val sparkSession = SparkSession.builder().config(conf).getOrCreate()
         val dataFrame = sparkSession.createDataFrame(valuesRDD, classOf[Tweet])
         dataFrame.createOrReplaceTempView("tweets")

         // Run the query and print the results. take(n) fetches only the first n rows,
         // instead of collecting every group to the driver and truncating afterwards.
         sparkSession.sql("SELECT country, count(*) as c from tweets WHERE country != 'N/A' GROUP BY country ORDER BY c desc")
           .take(TopCountries).foreach(println)
      } finally {
         // Always release the Spark resources, even when the job fails
         sc.stop()
      }
   }

}