package com.cloudera.sa.taxi360.sql.kudu

import com.cloudera.sa.taxi360.model.NyTaxiYellowTripBuilder
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

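/**
 * Reads NYC yellow-taxi trips from a Kudu table and rewrites them as a
 * nested Parquet table in Hive/HDFS: one row per vender_id, with that
 * vendor's trips collected into an array of structs.
 *
 * Args: <runLocal> <kuduMaster> <kuduTaxiTripTableName> <hdfsTaxiNestedTableName>
 */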
object KuduToNestedHDFS {
  def main(args: Array[String]): Unit = {

    if (args.length < 4) {
      println("Args: <runLocal> " +
        "<kuduMaster> " +
        "<kuduTaxiTripTableName> " +
        "<hdfsTaxiNestedTableName>")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduTaxiTripTableName = args(2)
    val hdfsTaxiNestedTableName = args(3)

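    // Local mode (first arg "l") turns off compression to make debugging
    // easier; in cluster mode the master and resources come from spark-submit.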
    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "KuduToNestedHDFS", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("KuduToNestedHDFS")
      new SparkContext(sparkConfig)
    }

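    // HiveContext is needed for the metastore-backed DDL and insert below.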
    val hiveContext = new HiveContext(sc)

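    // Options for the Kudu datasource: the source table and the Kudu master address(es).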
    val kuduOptions = Map(
      "kudu.table" -> kuduTaxiTripTableName,
      "kudu.master" -> kuduMaster)

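    // Load the Kudu table through the datasource API and expose it to Spark SQL.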
    hiveContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load
      .registerTempTable("ny_taxi_trip_tmp")

    val kuduDataDf = hiveContext.sql("select * from ny_taxi_trip_tmp")

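    // Group trips by vendor and collapse each group into a nested Row of
    // (vender_id, array<struct<...>>). Field order must match the DDL below.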
    val newNestedDf = kuduDataDf.map(r => {
      val pojo = NyTaxiYellowTripBuilder.build(r)
      (pojo.vender_id, pojo)
    }).groupByKey().map(grp => {
      Row(grp._1, grp._2.map(p => {
        Row(p.passenger_count,
          p.payment_type,
          p.total_amount,
          p.fare_amount)
      }).toSeq)
    })

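    // One row per vendor with trips nested as array<struct<...>>, stored as
    // Parquet so the nested structure is preserved on disk.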
    hiveContext.sql("create table " + hdfsTaxiNestedTableName + "( " +
      " vender_id string," +
      " trip array<struct< " +
      "   passenger_count: INT," +
      "   payment_type: STRING, " +
      "   total_amount: DOUBLE, " +
      "   fare_amount: DOUBLE " +
      "  >>" +
      " ) stored as parquet")

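    // A zero-row select is a cheap way to get the table's schema without
    // hand-building a StructType.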
    val emptyDf = hiveContext.sql(s"select * from $hdfsTaxiNestedTableName limit 0")

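    // Attach that schema to the nested RDD[Row] and stage it as a temp table.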
    hiveContext.createDataFrame(newNestedDf, emptyDf.schema).registerTempTable("tmpNested")

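    // Append the nested rows into the Parquet-backed table.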
    hiveContext.sql("insert into " + hdfsTaxiNestedTableName + " select * from tmpNested")

    sc.stop()
  }
}