package com.cloudera.sa.examples.tablestats import com.cloudera.sa.examples.tablestats.model.{FirstPassStatsModel} import org.apache.spark._ import org.apache.spark.sql.DataFrame import scala.collection.mutable /** * Created by ted.malaska on 6/27/15. */ object TableStatsSinglePathMain { def main(args: Array[String]): Unit = { if (args.length == 0) { println("TableStatsSinglePathMain <inputPath>") return } val inputPath = args(0) val runLocal = (args.length == 2 && args(1).equals("L")) var sc:SparkContext = null if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") sc = new SparkContext(sparkConfig) } val sqlContext = new org.apache.spark.sql.SQLContext(sc) //Part A var df = sqlContext.parquetFile(inputPath) //Part B val firstPassStats = getFirstPassStat( df) //Part E println(firstPassStats) //Part F sc.stop() } def getFirstPassStat(df: DataFrame): FirstPassStatsModel = { val schema = df.schema //Part B.1 val columnValueCounts = df.flatMap(r => (0 until schema.length).map { idx => //((columnIdx, cellValue), count) ((idx, r.get(idx)), 1l) } ).reduceByKey(_ + _) //This is like word count //Part C val firstPassStats = columnValueCounts.mapPartitions[FirstPassStatsModel]{it => val firstPassStatsModel = new FirstPassStatsModel() it.foreach{ case ((columnIdx, columnVal), count) => firstPassStatsModel += (columnIdx, columnVal, count) } Iterator(firstPassStatsModel) }.reduce { (a, b) => //Part D a += (b) a } firstPassStats } }