org.apache.spark.SparkFiles Scala Examples

The following examples show how to use org.apache.spark.SparkFiles. Each example notes the project it comes from, the source file, and its license.
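Before the project examples, here is a minimal sketch of the core pattern (the file path and name are illustrative): a file registered on the driver with SparkContext.addFile is shipped to every node, and SparkFiles.get(fileName) returns the local path of that node's copy.

import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkFiles

val spark = SparkSession.builder().master("local[*]").appName("spark-files-sketch").getOrCreate()

// Register the file on the driver; Spark downloads it to every node that needs it.
spark.sparkContext.addFile("/tmp/lookup.txt")

// Resolve the node-local path by file name, not by the original path.
val localPath = SparkFiles.get("lookup.txt")
println(localPath)

spark.stop()
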
Example 1
Source File: GrokHelper.scala    From incubator-s2graph    with Apache License 2.0
package org.apache.s2graph.s2jobs.utils

import io.thekraken.grok.api.Grok
import org.apache.s2graph.s2jobs.Logger
import org.apache.spark.SparkFiles
import org.apache.spark.sql.Row

import scala.collection.mutable

object GrokHelper extends Logger {
  private val grokPool:mutable.Map[String, Grok] = mutable.Map.empty

  def getGrok(name:String, patternFiles:Seq[String], patterns:Map[String, String], compilePattern:String):Grok = {
    if (grokPool.get(name).isEmpty) {
      println(s"Grok '$name' initialized..")
      val grok = new Grok()
      patternFiles.foreach { patternFile =>
        val filePath = SparkFiles.get(patternFile)
        println(s"[Grok][$name] add pattern file : $patternFile  ($filePath)")
        grok.addPatternFromFile(filePath)
      }
      patterns.foreach { case (patternName, pattern) =>
        println(s"[Grok][$name] add pattern : $patternName ($pattern)")
        grok.addPattern(patternName, pattern)
      }

      grok.compile(compilePattern)
      println(s"[Grok][$name] patterns: ${grok.getPatterns}")
      grokPool.put(name, grok)
    }

    grokPool(name)
  }

  def grokMatch(text:String)(implicit grok:Grok):Option[Map[String, String]] = {
    import scala.collection.JavaConverters._

    val m = grok.`match`(text)
    m.captures()
    val rstMap = m.toMap.asScala.toMap
      .filter(_._2 != null)
      .map{ case (k, v) =>  k -> v.toString}
    if (rstMap.isEmpty) None else Some(rstMap)
  }

  def grokMatchWithSchema(text:String)(implicit grok:Grok, keys:Array[String]):Option[Row] = {
    import scala.collection.JavaConverters._

    val m = grok.`match`(text)
    m.captures()

    val rstMap = m.toMap.asScala.toMap
    if (rstMap.isEmpty) None
    else {
      val l = keys.map { key => rstMap.getOrElse(key, null)}
      Some(Row.fromSeq(l))
    }
  }
} 
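A short usage sketch for the helper above, assuming a hypothetical pattern file patterns.grok (which would need to define COMMONAPACHELOG) has already been registered with addFile; only the file name is passed in, because getGrok resolves it on each node through SparkFiles.get.

// Driver side: distribute the pattern file to the cluster (the path is a placeholder).
spark.sparkContext.addFile("/path/to/patterns.grok")

// Worker side (e.g. inside mapPartitions): build or reuse the cached Grok instance.
implicit val grok: Grok = GrokHelper.getGrok(
  name = "access-log",
  patternFiles = Seq("patterns.grok"),   // file name only; resolved via SparkFiles.get
  patterns = Map("MYNUM" -> "[0-9]+"),   // extra inline patterns
  compilePattern = "%{COMMONAPACHELOG}"
)

val parsed: Option[Map[String, String]] =
  GrokHelper.grokMatch("""127.0.0.1 - - [10/Oct/2000:13:55:36 -0700] "GET / HTTP/1.0" 200 2326""")
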
Example 2
Source File: L8-38SparkR.scala    From prosparkstreaming    with Apache License 2.0
package org.apress.prospark

import scala.reflect.runtime.universe
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import java.nio.file.Paths
import org.apache.spark.SparkFiles

object CdrStreamingSparkRApp {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int,
    smsInActivity: Float, smsOutActivity: Float, callInActivity: Float,
    callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: CdrStreamingSparkRApp <appname> <batchInterval> <hostname> <port> <tableName> <RScriptPath> <RScriptLogsPath>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port, tableName, rScriptPath, logsPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val cl = Thread.currentThread().getContextClassLoader()
    val hiveC = new HiveContext(ssc.sparkContext)
    Thread.currentThread().setContextClassLoader(cl)

    import hiveC.implicits._

    ssc.sparkContext.addFile(rScriptPath)
    val rScriptName = SparkFiles.get(Paths.get(rScriptPath).getFileName.toString)
    val master = hiveC.sparkContext.getConf.get("spark.master")

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD((rdd, time) => {
        val iTableName = tableName + time.milliseconds
        seqToCdr(rdd).toDF().write.saveAsTable(iTableName)
        hiveC.sparkContext.parallelize(Array(iTableName))
          .pipe("%s %s".format(rScriptName, master))
          .saveAsTextFile(Paths.get(logsPath, iTableName).toString)
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
} 
Example 3
Source File: SparkExample.scala    From Hands-On-Data-Analysis-with-Scala    with MIT License
package handson.example.spark

import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkFiles
import vegas._
import vegas.sparkExt._

object SparkExample {
  def getSparkSession(): SparkSession = {
    val spark = SparkSession.builder().master("local").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    spark
  }
  def main(args: Array[String]): Unit = {
    val spark = getSparkSession()

    spark.sparkContext.addFile("https://data.lacity.org/api/views/nxs9-385f/rows.csv")
    val df = spark.read.option("header", true).option("inferSchema", true).csv(SparkFiles.get("rows.csv"))
    df.printSchema()
    df.show()
    println("Covariance: " + df.stat.cov("Total Population", "Total Households"))
    println("Correlation: " + df.stat.corr("Total Population", "Total Households"))
    df.createOrReplaceTempView("tmp_data")
    val dfWithTier = spark.sql("select *, ntile(100) over(order by `Total Population`) tier from tmp_data")
    val dfTier90Plus = dfWithTier.where("tier >= 90")
    val plot = Vegas().withDataFrame(dfTier90Plus).encodeX("Zip Code", Nom).
      encodeY("Total Population", Quant).
      mark(Bar)
    plot.show

    spark.stop()
  }
} 
Example 4
Source File: LoadsContrib.scala    From spark-nlp    with Apache License 2.0
package com.johnsnowlabs.nlp.annotators.ner.dl

import java.io.{BufferedOutputStream, File, FileOutputStream}
import java.nio.file.Paths

import com.johnsnowlabs.nlp.util.io.ResourceHelper
import org.apache.commons.lang.SystemUtils
import org.apache.spark.SparkFiles
import org.apache.spark.sql.SparkSession
import org.tensorflow.TensorFlow

object LoadsContrib {
  @transient var loadedToCluster = false
  @transient var loadedToTensorflow = false

  private lazy val lib1 = "_sparse_feature_cross_op.so"
  private lazy val lib2 = "_lstm_ops.so"

  private def resourcePath(os: String, lib: String) = "ner-dl/"+os+"/"+lib

  // contribPaths, getFileName and copyResourceToTmp are helpers defined elsewhere in the
  // original file and omitted from this excerpt.
  def loadContribToCluster(spark: SparkSession): Unit = {
    if (!LoadsContrib.loadedToCluster && contribPaths.isDefined) {
      LoadsContrib.loadedToCluster = true
      spark.sparkContext.addFile(copyResourceToTmp(contribPaths.get._1).getPath)
      spark.sparkContext.addFile(copyResourceToTmp(contribPaths.get._2).getPath)
    }
  }

  def loadContribToTensorflow(): Unit = {
    if (!LoadsContrib.loadedToTensorflow && contribPaths.isDefined) {
      LoadsContrib.loadedToTensorflow = true
      val fp1 = SparkFiles.get(getFileName(contribPaths.get._1))
      val fp2 = SparkFiles.get(getFileName(contribPaths.get._2))
      if (new File(fp1).exists() && new File(fp2).exists()) {
        TensorFlow.loadLibrary(fp1)
        TensorFlow.loadLibrary(fp2)
      }
    }
  }

} 
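A hedged usage sketch of the pattern this object implements, assuming the loadContribToCluster(spark: SparkSession) entry point shown above: the native libraries are shipped from the driver with addFile, and each node later resolves and loads its local copy through SparkFiles.get.

// Driver: ship the contrib .so files to the cluster once.
LoadsContrib.loadContribToCluster(spark)

// Executor (typically right before creating a TensorFlow session):
LoadsContrib.loadContribToTensorflow()
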
Example 5
Source File: StorageHelper.scala    From spark-nlp    with Apache License 2.0
package com.johnsnowlabs.storage

import java.io.File

import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
import org.apache.spark.{SparkContext, SparkFiles}
import org.apache.spark.sql.SparkSession


object StorageHelper {

  def resolveStorageName(database: String, storageRef: String): String = new Path(database + "_" + storageRef).toString

  def load(
            storageSourcePath: String,
            spark: SparkSession,
            database: String,
            storageRef: String,
            withinStorage: Boolean
          ): RocksDBConnection = {

    val dbFolder = StorageHelper.resolveStorageName(database.toString, storageRef)
    val src = StorageLocator.getStorageSerializedPath(storageSourcePath.replaceAllLiterally("\\", "/"), dbFolder, withinStorage)

    val locator = StorageLocator(database, storageRef, spark)

    sendToCluster(src, locator.clusterFilePath, locator.clusterFileName, locator.destinationScheme, spark.sparkContext)

    RocksDBConnection.getOrCreate(locator.clusterFileName)
  }

  def save(path: String, connection: RocksDBConnection, spark: SparkSession, withinStorage: Boolean): Unit = {
    val indexUri = "file://"+(new java.net.URI(connection.findLocalIndex.replaceAllLiterally("\\", "/")).getPath)
    val index = new Path(indexUri)

    val uri = new java.net.URI(path.replaceAllLiterally("\\", "/"))
    val fs = FileSystem.get(uri, spark.sparkContext.hadoopConfiguration)
    val dst = new Path(path+{if (withinStorage) "/storage/" else ""})

    save(fs, index, dst)
  }

  private def save(fs: FileSystem, index: Path, dst: Path): Unit = {
    if (!fs.exists(dst))
      fs.mkdirs(dst)
    fs.copyFromLocalFile(false, true, index, dst)
  }

  def sendToCluster(source: Path, clusterFilePath: Path, clusterFileName: String, destinationScheme: String, sparkContext: SparkContext): Unit = {
    if (destinationScheme == "file") {
      copyIndexToLocal(source, new Path(RocksDBConnection.getLocalPath(clusterFileName)), sparkContext)
    } else {
      copyIndexToCluster(source, clusterFilePath, sparkContext)
    }
  }

  private def copyIndexToCluster(sourcePath: Path, dst: Path, spark: SparkContext): String = {
    if (!new File(SparkFiles.get(dst.getName)).exists()) {
      val srcFS = sourcePath.getFileSystem(spark.hadoopConfiguration)
      val dstFS = dst.getFileSystem(spark.hadoopConfiguration)

      if (srcFS.getScheme == "file") {
        val src = sourcePath
        dstFS.copyFromLocalFile(false, true, src, dst)
      } else {
        FileUtil.copy(srcFS, sourcePath, dstFS, dst, false, true, spark.hadoopConfiguration)
      }

      spark.addFile(dst.toString, recursive = true)
    }
    dst.toString
  }

  private def copyIndexToLocal(source: Path, destination: Path, context: SparkContext): Unit = {
    // Copy the index to the local RocksDB destination only if it is not already present there.
    val fs = source.getFileSystem(context.hadoopConfiguration)
    if (!fs.exists(destination))
      fs.copyFromLocalFile(false, true, source, destination)
  }

}