package examples

import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast
import scala.io.Source
import scala.util.{ Try, Success, Failure }
import scala.collection.mutable.Map

/**
 * Test code for broadcast variables.
 * For more info, read
 * http://spark.apache.org/docs/latest/programming-guide.html#broadcast-variables.
 *
 * Here a map of countries along with their capitals is kept as a
 * broadcast variable.
 * This variable is used for arbitrary lookups of the capital of a country.
 */
object TestBroadcastVariables {

  /** Input file used when no path is supplied on the command line. */
  private val DefaultCountriesFile = "/media/linux-1/spark-dev/data/countries.csv"

  /**
   * Loads the country/capital CSV file, broadcasts the resulting map and runs
   * two prefix searches against it: one that matches ("A") and one that
   * matches nothing ("Zz").
   *
   * @param args optional; if present, `args(0)` is used as the path of the
   *             CSV file instead of the built-in default path.
   */
  def main(args: Array[String]): Unit = {
    val csvPath = args.headOption.getOrElse(DefaultCountriesFile)

    loadCSVFile(csvPath) match {
      case Some(countries) =>
        val sc = new SparkContext(new SparkConf()
          .setAppName("TestBroadcastVariablesJob"))
        // Always release the context, even when one of the jobs fails.
        try {
          val countriesCache = sc.broadcast(countries)
          val countriesRDD = sc.parallelize(countries.keys.toList)

          // happy case...
          printSearchResults("A", searchCountryDetails(countriesRDD, countriesCache, "A"))

          // sad case...
          printSearchResults("Zz", searchCountryDetails(countriesRDD, countriesCache, "Zz"))
        } finally {
          sc.stop()
        }

      case None => println("Error loading file...")
    }
  }

  /**
   * Prints the number of matches followed by one line per (country, capital)
   * pair.
   *
   * The RDD is collected to the driver once, so the search job runs a single
   * time and the output is written by the driver process rather than from
   * inside an RDD closure. This is only appropriate because the result set
   * here is small.
   */
  private def printSearchResults(searchToken: String,
                                 results: RDD[(String, String)]): Unit = {
    val matches = results.collect()
    println(s">>>> Search results of countries starting with '$searchToken': ${matches.length}")
    matches.foreach {
      case (country, capital) => println(s"Country:$country, Capital:$capital")
    }
  }

  /**
   * Filters the input countries' RDD based on the search token and then
   * extracts their corresponding capitals from the broadcast variable.
   * The matching countries and their capitals are returned as a paired RDD.
   *
   * @param countriesRDD country names to search through
   * @param countryCache broadcast map of (country -> capital)
   * @param searchToken  prefix a country name must start with to be selected
   * @return pairs of (country, capital) for every matching country; the lookup
   *         uses `Map.apply`, so evaluating the RDD throws
   *         `NoSuchElementException` if a matching country has no entry in
   *         the broadcast map
   */
  def searchCountryDetails(countriesRDD: RDD[String],
                           countryCache: Broadcast[Map[String, String]],
                           searchToken: String): RDD[(String, String)] = {
    countriesRDD.filter(_.startsWith(searchToken))
      .map(country => (country, countryCache.value(country)))
  }

  /**
   * Loads a CSV file from disk.
   * Each non-blank line must consist of exactly two comma-separated fields,
   * `country,capital`; surrounding whitespace of each field is trimmed.
   * Blank lines are skipped.
   *
   * @param filename path of the CSV file to read
   * @return Some(map) with (key=country, value=capital) on success, or None if
   *         the file cannot be read or any non-blank line does not have
   *         exactly two fields
   */
  def loadCSVFile(filename: String): Option[Map[String, String]] =
    Try {
      val bufferedSource = Source.fromFile(filename)
      try {
        val entries = bufferedSource.getLines()
          .filter(_.trim.nonEmpty)
          .map { line =>
            // A line with a different number of fields raises a MatchError,
            // which the enclosing Try turns into None.
            val Array(country, capital) = line.split(",").map(_.trim)
            country -> capital
          }
          .toList // force the lazy iterator before the source is closed
        Map(entries: _*)
      } finally {
        // Close the file on both the success and the failure path.
        bufferedSource.close()
      }
    }.toOption
}