org.apache.spark.util.LongAccumulator Scala Examples

The following examples show how to use org.apache.spark.util.LongAccumulator. Each snippet is taken from an open-source project; the project and source file are noted in the heading above each example.
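Before the project excerpts, here is a minimal, self-contained sketch of the basic pattern most of them rely on: create a LongAccumulator from the SparkContext (optionally giving it a name so it shows up in the web UI), add to it inside tasks, and read its value on the driver once an action has finished. The application name and local master are placeholders.

import org.apache.spark.{SparkConf, SparkContext}

object LongAccumulatorBasics {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("long-accumulator-basics").setMaster("local[*]"))

    // Named accumulators are shown per stage in the Spark web UI.
    val evenCount = sc.longAccumulator("evenCount")

    sc.parallelize(1 to 100, 4).foreach { n =>
      if (n % 2 == 0) evenCount.add(1)   // runs on executors
    }

    // Accumulator values are only reliable on the driver, after the action completes.
    println(s"even numbers seen: ${evenCount.value}")
    sc.stop()
  }
}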
Example 1
Source File: package.scala    From drizzle-spark    with Apache License 2.0
package org.apache.spark.sql.execution

import java.util.Collections

import scala.collection.JavaConverters._

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.catalyst.trees.TreeNodeRef
import org.apache.spark.util.{AccumulatorV2, LongAccumulator}


package object debug {

  // Prints query-debugging output on the driver.
  private def debugPrint(msg: String): Unit = println(msg)

  // DebugExec wraps a child SparkPlan and records, for every output column, which runtime
  // classes actually appear in the produced rows: row counts go into a LongAccumulator,
  // per-column class names into a set-valued AccumulatorV2. The enclosing declarations
  // below follow the upstream Apache Spark debug package object; unrelated helpers from
  // the original file are omitted.
  case class DebugExec(child: SparkPlan) extends UnaryExecNode with CodegenSupport {
    def output: Seq[Attribute] = child.output

    // Accumulates a distinct set of values per column; per-task copies are merged on the driver.
    class SetAccumulator[T] extends AccumulatorV2[T, java.util.Set[T]] {
      private val _set = Collections.synchronizedSet(new java.util.HashSet[T]())

      override def isZero: Boolean = _set.isEmpty
      override def copy(): AccumulatorV2[T, java.util.Set[T]] = {
        val newAcc = new SetAccumulator[T]()
        newAcc._set.addAll(_set)
        newAcc
      }
      override def reset(): Unit = _set.clear()
      override def add(v: T): Unit = _set.add(v)
      override def merge(other: AccumulatorV2[T, java.util.Set[T]]): Unit = _set.addAll(other.value)
      override def value: java.util.Set[T] = _set
    }

    case class ColumnMetrics() {
      val elementTypes = new SetAccumulator[String]
      sparkContext.register(elementTypes)
    }

    val tupleCount: LongAccumulator = sparkContext.longAccumulator

    val numColumns: Int = child.output.size
    val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics())

    def dumpStats(): Unit = {
      debugPrint(s"== ${child.simpleString} ==")
      debugPrint(s"Tuples output: ${tupleCount.value}")
      child.output.zip(columnStats).foreach { case (attr, metric) =>
        // This is called on driver. All accumulator updates have a fixed value. So it's safe to use
        // `asScala` which accesses the internal values using `java.util.Iterator`.
        val actualDataTypes = metric.elementTypes.value.asScala.mkString("{", ",", "}")
        debugPrint(s" ${attr.name} ${attr.dataType}: $actualDataTypes")
      }
    }

    protected override def doExecute(): RDD[InternalRow] = {
      child.execute().mapPartitions { iter =>
        new Iterator[InternalRow] {
          def hasNext: Boolean = iter.hasNext

          def next(): InternalRow = {
            val currentRow = iter.next()
            tupleCount.add(1)
            var i = 0
            while (i < numColumns) {
              val value = currentRow.get(i, output(i).dataType)
              if (value != null) {
                columnStats(i).elementTypes.add(value.getClass.getName)
              }
              i += 1
            }
            currentRow
          }
        }
      }
    }

    override def outputPartitioning: Partitioning = child.outputPartitioning

    override def inputRDDs(): Seq[RDD[InternalRow]] = {
      child.asInstanceOf[CodegenSupport].inputRDDs()
    }

    override def doProduce(ctx: CodegenContext): String = {
      child.asInstanceOf[CodegenSupport].produce(ctx, this)
    }

    override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = {
      consume(ctx, input)
    }
  }
} 
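For context, this excerpt is part of Spark's query-debugging facility, and the accumulators above are filled when a query is run with debugging enabled. A hedged sketch of how that is typically exercised, assuming a Spark 2.x build where the debug package object provides the debug() implicit; the session and DataFrame here are placeholders.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.debug._   // brings the debug() implicit into scope

object DebugExecUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("debug-exec-usage").master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b"), (3, "c")).toDF("id", "label")

    // Executes the query with DebugExec nodes inserted, then dumps the row count and
    // per-column type statistics collected through the accumulators shown above.
    df.debug()

    spark.stop()
  }
}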
Example 2
Source File: ShuffleWriteMetrics.scala    From drizzle-spark    with Apache License 2.0
package org.apache.spark.executor

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.LongAccumulator


@DeveloperApi
class ShuffleWriteMetrics private[spark] () extends Serializable {

  // Each metric is backed by a LongAccumulator so that per-task updates made on executors
  // are merged into the driver-side TaskMetrics. The class header and backing fields
  // follow the upstream Apache Spark source.
  private[executor] val _bytesWritten = new LongAccumulator
  private[executor] val _recordsWritten = new LongAccumulator
  private[executor] val _writeTime = new LongAccumulator

  def bytesWritten: Long = _bytesWritten.sum
  def recordsWritten: Long = _recordsWritten.sum
  def writeTime: Long = _writeTime.sum

  private[spark] def incBytesWritten(v: Long): Unit = _bytesWritten.add(v)
  private[spark] def incRecordsWritten(v: Long): Unit = _recordsWritten.add(v)
  private[spark] def incWriteTime(v: Long): Unit = _writeTime.add(v)
  private[spark] def decBytesWritten(v: Long): Unit = {
    _bytesWritten.setValue(bytesWritten - v)
  }
  private[spark] def decRecordsWritten(v: Long): Unit = {
    _recordsWritten.setValue(recordsWritten - v)
  }

  // Legacy methods for backward compatibility.
  // TODO: remove these once we make this class private.
  @deprecated("use bytesWritten instead", "2.0.0")
  def shuffleBytesWritten: Long = bytesWritten
  @deprecated("use writeTime instead", "2.0.0")
  def shuffleWriteTime: Long = writeTime
  @deprecated("use recordsWritten instead", "2.0.0")
  def shuffleRecordsWritten: Long = recordsWritten

} 
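ShuffleWriteMetrics itself is internal to Spark, but the values its LongAccumulators collect are observable from user code. A hedged sketch of one common way to read them, via a SparkListener; the class name and log format are illustrative.

import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}

// Logs shuffle write metrics as each task finishes; attach with sc.addSparkListener(new ShuffleWriteLogger).
class ShuffleWriteLogger extends SparkListener {
  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
    val metrics = taskEnd.taskMetrics
    if (metrics != null) {
      val sw = metrics.shuffleWriteMetrics
      println(s"task ${taskEnd.taskInfo.taskId}: bytesWritten=${sw.bytesWritten}, " +
        s"recordsWritten=${sw.recordsWritten}, writeTimeNs=${sw.writeTime}")
    }
  }
}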
Example 3
Source File: SparkCore_5_Accumulator.scala    From HadoopLearning    with MIT License
package com.liumm.transform

import org.apache.spark.util.{CollectionAccumulator, DoubleAccumulator, LongAccumulator}
import org.apache.spark.{SparkConf, SparkContext}


object SparkCore_5_Accumulator {

  case class Info(var success: Boolean, var msg: String, count: Int)

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SparkCore_5_Accumulator").setMaster("local")
    val sc = new SparkContext(conf)

    val r = sc.parallelize(1 to 100)

    println("\n-----------------------传统方式累加------------------------")

    var total = 0
    r.foreach(num => {
      total += num
    })

    println(total)

    println("\n-----------------------注册LongAccumulator-------------------")

    //内置的累加器只支持 Long、Double、Collection
    val rdd = sc.parallelize(1 to 100, 4)

    //1、显示注册
    val longAccumulator = new LongAccumulator
    sc.register(longAccumulator)

    rdd.foreach(num => {
      longAccumulator.add(num)
    })
    println(longAccumulator.value)

    println("\n-----------------------直接使用LongAccumulator-------------------")
    //2、直接使用LongAccumulator,无需注册
    val longAccumulator1 = sc.longAccumulator
    rdd.foreach(num => {
      longAccumulator1.add(num)
    })
    println(longAccumulator1.value)

    println("\n-----------------------直接使用DoubleAccumulator-------------------")
    val rdd1 = sc.parallelize(List(1.1, 2.2, 3.3, 4.3))
    //1、显示注册
    val doubleAccumulator = new DoubleAccumulator
    sc.register(doubleAccumulator)
    rdd1.foreach(num => {
      doubleAccumulator.add(num)
    })
    println(doubleAccumulator.value)
    // 2. Created and registered directly via sc.doubleAccumulator
    val doubleAccumulator1 = sc.doubleAccumulator
    rdd1.foreach(num => {
      doubleAccumulator1.add(num)
    })
    println(doubleAccumulator1.value)

    println("\n-----------------------直接使用CollectionAccumulator-------------------")

    val rdd2 = sc.parallelize(1 to 100)
    //1、显示注册
    val collectionAccumulator = new CollectionAccumulator[Int]
    sc.register(collectionAccumulator)
    rdd2.foreach(num => {
      collectionAccumulator.add(num)
    })
    println(collectionAccumulator.value)

    // 2. Created and registered directly via sc.collectionAccumulator
    val collectionAccumulator1 = sc.collectionAccumulator[Int]
    rdd2.foreach(num => {
      collectionAccumulator1.add(num)
    })
    println(collectionAccumulator1.value)

    println("\n----------------------- Using a custom accumulator -------------------")
    val customAccumulator = new CustomAccumulator
    sc.register(customAccumulator)
    rdd.foreach(num => {
      customAccumulator.add(num.toString)
    })
    println(customAccumulator.value)

  }

} 
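CustomAccumulator is defined elsewhere in the HadoopLearning project and is not part of this excerpt; all that is visible above is that it accepts strings via add and exposes a value. A minimal sketch of an AccumulatorV2[String, String] with that shape, purely illustrative and not the project's actual implementation:

import org.apache.spark.util.AccumulatorV2

// Illustrative only: joins every string added on the executors with "-".
class CustomAccumulator extends AccumulatorV2[String, String] {
  private var result = ""

  override def isZero: Boolean = result.isEmpty

  override def copy(): CustomAccumulator = {
    val acc = new CustomAccumulator
    acc.result = result
    acc
  }

  override def reset(): Unit = result = ""

  override def add(v: String): Unit =
    result = if (result.isEmpty) v else s"$result-$v"

  override def merge(other: AccumulatorV2[String, String]): Unit = {
    val o = other.value
    if (o.nonEmpty) result = if (result.isEmpty) o else s"$result-$o"
  }

  override def value: String = result
}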
Example 4
Source File: Accumulators.scala    From spark-distcp    with Apache License 2.0
package com.coxautodata.objects

import com.coxautodata.utils.FileUtils
import org.apache.spark.sql.SparkSession
import org.apache.spark.util.LongAccumulator

import scala.collection.JavaConverters._

class Accumulators(sparkSession: SparkSession) extends Serializable {

  def handleResult(result: DistCPResult): Unit = result match {
    case DeleteResult(_, DeleteActionResult.SkippedDoesNotExists | DeleteActionResult.SkippedDryRun) =>
      deleteOperationsSkipped.add(1)
    case DeleteResult(_, DeleteActionResult.Deleted) =>
      deleteOperationsSuccessful.add(1)
    case DeleteResult(_, DeleteActionResult.Failed(e)) =>
      deleteOperationsSkipped.add(1)
      deleteOperationsFailed.add(1)
      exceptionCount.add(e)
    case DirectoryCopyResult(_, _, CopyActionResult.SkippedAlreadyExists | CopyActionResult.SkippedDryRun) =>
      foldersSkipped.add(1)
    case DirectoryCopyResult(_, _, CopyActionResult.Created) =>
      foldersCreated.add(1)
    case DirectoryCopyResult(_, _, CopyActionResult.Failed(e)) =>
      foldersFailed.add(1)
      foldersSkipped.add(1)
      exceptionCount.add(e)
    case FileCopyResult(_, _, l, CopyActionResult.SkippedAlreadyExists | CopyActionResult.SkippedIdenticalFileAlreadyExists | CopyActionResult.SkippedDryRun) =>
      filesSkipped.add(1)
      bytesSkipped.add(l)
    case FileCopyResult(_, _, l, CopyActionResult.Copied) =>
      filesCopied.add(1)
      bytesCopied.add(l)
    case FileCopyResult(_, _, l, CopyActionResult.OverwrittenOrUpdated) =>
      filesCopied.add(1)
      bytesCopied.add(l)
      filesUpdatedOrOverwritten.add(1)
    case FileCopyResult(_, _, l, CopyActionResult.Failed(e)) =>
      filesFailed.add(1)
      exceptionCount.add(e)
      filesSkipped.add(1)
      bytesSkipped.add(l)
  }

  def getOutputText: String = {
    val intFormatter = java.text.NumberFormat.getIntegerInstance
    s"""--Raw data--
       |Data copied: ${FileUtils.byteCountToDisplaySize(bytesCopied.value)}
       |Data skipped (already existing files, dry-run and failures): ${FileUtils.byteCountToDisplaySize(bytesSkipped.value)}
       |--Files--
       |Files copied (new files and overwritten/updated files): ${intFormatter.format(filesCopied.value)}
       |Files overwritten/updated: ${intFormatter.format(filesUpdatedOrOverwritten.value)}
       |Skipped files for copying (already existing files, dry-run and failures): ${intFormatter.format(filesSkipped.value)}
       |Failed files during copy: ${intFormatter.format(filesFailed.value)}
       |--Folders--
       |Folders created: ${intFormatter.format(foldersCreated.value)}
       |Skipped folder creates (already existing folders, dry-run and failures): ${intFormatter.format(foldersSkipped.value)}
       |Failed folder creates: ${intFormatter.format(foldersFailed.value)}
       |--Deletes--
       |Successful delete operations: ${intFormatter.format(deleteOperationsSuccessful.value)}
       |Skipped delete operations (files/folders already missing, dry-run and failures): ${intFormatter.format(deleteOperationsSkipped.value)}
       |Failed delete operations: ${intFormatter.format(deleteOperationsFailed.value)}
       |--Exception counts--
       |""".stripMargin ++
      exceptionCount.value.asScala.toSeq.sortWith { case ((_, v1), (_, v2)) => v1 > v2 }.map { case (k, v) => s"$k: ${intFormatter.format(v)}" }.mkString("\n")
  }

  val bytesCopied: LongAccumulator = sparkSession.sparkContext.longAccumulator("BytesCopied")
  val bytesSkipped: LongAccumulator = sparkSession.sparkContext.longAccumulator("BytesSkipped") // Already exists, dryrun and failure

  val foldersCreated: LongAccumulator = sparkSession.sparkContext.longAccumulator("FoldersCreated")
  val foldersSkipped: LongAccumulator = sparkSession.sparkContext.longAccumulator("FoldersSkipped")
  val foldersFailed: LongAccumulator = sparkSession.sparkContext.longAccumulator("FoldersFailed")

  val filesCopied: LongAccumulator = sparkSession.sparkContext.longAccumulator("FilesCopied")
  val filesSkipped: LongAccumulator = sparkSession.sparkContext.longAccumulator("FilesSkipped") // Already exists, dryrun and failure
  val filesFailed: LongAccumulator = sparkSession.sparkContext.longAccumulator("FilesFailed")
  val filesUpdatedOrOverwritten: LongAccumulator = sparkSession.sparkContext.longAccumulator("FilesUpdatedOrOverwritten")

  val deleteOperationsSuccessful: LongAccumulator = sparkSession.sparkContext.longAccumulator("DeleteOperationsSuccessful")
  val deleteOperationsSkipped: LongAccumulator = sparkSession.sparkContext.longAccumulator("DeleteOperationsSkipped") // Already exists, dryrun and failure
  val deleteOperationsFailed: LongAccumulator = sparkSession.sparkContext.longAccumulator("DeleteOperationsFailed")

  val exceptionCount: ExceptionCountAccumulator = new ExceptionCountAccumulator
  sparkSession.sparkContext.register(exceptionCount, "ExceptionCount")
} 
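ExceptionCountAccumulator is defined elsewhere in spark-distcp; from the calls above it takes a Throwable in add and its value behaves like a Java map of name to count (it is read with asScala and sorted by count). A hedged sketch of an AccumulatorV2 with that shape, illustrative rather than the project's actual implementation:

import scala.collection.JavaConverters._

import org.apache.spark.util.AccumulatorV2

// Illustrative only: counts exceptions by class name. Each task works on its own copy,
// so a plain HashMap with a simple read-modify-write is sufficient here.
class ExceptionCountAccumulator extends AccumulatorV2[Throwable, java.util.Map[String, Long]] {
  private val counts = new java.util.HashMap[String, Long]()

  override def isZero: Boolean = counts.isEmpty

  override def copy(): ExceptionCountAccumulator = {
    val acc = new ExceptionCountAccumulator
    acc.counts.putAll(counts)
    acc
  }

  override def reset(): Unit = counts.clear()

  override def add(e: Throwable): Unit = increment(e.getClass.getName, 1L)

  override def merge(other: AccumulatorV2[Throwable, java.util.Map[String, Long]]): Unit =
    other.value.asScala.foreach { case (name, n) => increment(name, n) }

  override def value: java.util.Map[String, Long] = counts

  private def increment(name: String, delta: Long): Unit =
    counts.put(name, counts.getOrDefault(name, 0L) + delta)
}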
Example 5
Source File: ShuffleWriteMetrics.scala    From sparkoscope    with Apache License 2.0
package org.apache.spark.executor

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.LongAccumulator


@DeveloperApi
class ShuffleWriteMetrics private[spark] () extends Serializable {

  // Each metric is backed by a LongAccumulator so that per-task updates made on executors
  // are merged into the driver-side TaskMetrics. The class header and backing fields
  // follow the upstream Apache Spark source.
  private[executor] val _bytesWritten = new LongAccumulator
  private[executor] val _recordsWritten = new LongAccumulator
  private[executor] val _writeTime = new LongAccumulator

  def bytesWritten: Long = _bytesWritten.sum
  def recordsWritten: Long = _recordsWritten.sum
  def writeTime: Long = _writeTime.sum

  private[spark] def incBytesWritten(v: Long): Unit = _bytesWritten.add(v)
  private[spark] def incRecordsWritten(v: Long): Unit = _recordsWritten.add(v)
  private[spark] def incWriteTime(v: Long): Unit = _writeTime.add(v)
  private[spark] def decBytesWritten(v: Long): Unit = {
    _bytesWritten.setValue(bytesWritten - v)
  }
  private[spark] def decRecordsWritten(v: Long): Unit = {
    _recordsWritten.setValue(recordsWritten - v)
  }

  // Legacy methods for backward compatibility.
  // TODO: remove these once we make this class private.
  @deprecated("use bytesWritten instead", "2.0.0")
  def shuffleBytesWritten: Long = bytesWritten
  @deprecated("use writeTime instead", "2.0.0")
  def shuffleWriteTime: Long = writeTime
  @deprecated("use recordsWritten instead", "2.0.0")
  def shuffleRecordsWritten: Long = recordsWritten

} 
Example 6
Source File: SparkUtil.scala    From Adenium    with Apache License 2.0
package com.adenium.externals.spark

import com.adenium.utils.Logger
import com.adenium.utils.May.maybe
import com.adenium.utils.May.maybeInfo
import org.apache.spark.util.LongAccumulator


object SparkUtil {

  case class Accumulate ( acs : Seq[ LongAccumulator ] ) {

    def logStr: Seq[String ] = {
      acs map { ac => s"${ac.name.getOrElse( ac.id.toString )} = ${ac.value}" }
    }

    def log (): Unit = {
      acs foreach { ac => Logger.logInfo( ac.name.getOrElse( ac.id.toString) + " = " + ac.value )}
    }

    def add ( is: Seq[ Int] ): Unit = {
      acs.zip( is).foreach{ case (a, i) => maybe { a add i } } // swallow exception
    }

    def add( nls: Iterator[ Seq[ Int]]): Unit = {
      //ex: Iterator [Seq( ls, ncnt, bcnt, fcnt )]
      nls foreach { is => this.add( is) }
    }

    def add2sum(is: Seq[ Int], f: Seq[Int] => Int = _.sum ): Unit = {
      acs.zip( f(is) +: is).foreach { case (a, i) =>
        maybeInfo( a add i )("add2sum failed")
      }
    }
  }

} 
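A hedged usage sketch for the Accumulate wrapper above, assuming a local SparkContext; the accumulator names, sample data, and per-record counts are made up for illustration.

import org.apache.spark.{SparkConf, SparkContext}
import com.adenium.externals.spark.SparkUtil.Accumulate

object AccumulateUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("accumulate-usage").setMaster("local[*]"))

    // One named LongAccumulator per counter, wrapped together.
    val counters = Accumulate(Seq(
      sc.longAccumulator("total"),
      sc.longAccumulator("parsed"),
      sc.longAccumulator("failed")))

    sc.parallelize(Seq("a,1", "b,x", "c,3")).foreach { line =>
      val parsed = line.split(",")(1).forall(_.isDigit)
      // Seq(total, parsed, failed) increments for this record, in accumulator order.
      counters.add(Seq(1, if (parsed) 1 else 0, if (parsed) 0 else 1))
    }

    counters.log()   // writes "name = value" lines through the project's Logger
    sc.stop()
  }
}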
Example 7
Source File: ShuffleWriteMetrics.scala    From multi-tenancy-spark    with Apache License 2.0
package org.apache.spark.executor

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.LongAccumulator


@DeveloperApi
class ShuffleWriteMetrics private[spark] () extends Serializable {

  // Each metric is backed by a LongAccumulator so that per-task updates made on executors
  // are merged into the driver-side TaskMetrics. The class header and backing fields
  // follow the upstream Apache Spark source.
  private[executor] val _bytesWritten = new LongAccumulator
  private[executor] val _recordsWritten = new LongAccumulator
  private[executor] val _writeTime = new LongAccumulator

  def bytesWritten: Long = _bytesWritten.sum
  def recordsWritten: Long = _recordsWritten.sum
  def writeTime: Long = _writeTime.sum

  private[spark] def incBytesWritten(v: Long): Unit = _bytesWritten.add(v)
  private[spark] def incRecordsWritten(v: Long): Unit = _recordsWritten.add(v)
  private[spark] def incWriteTime(v: Long): Unit = _writeTime.add(v)
  private[spark] def decBytesWritten(v: Long): Unit = {
    _bytesWritten.setValue(bytesWritten - v)
  }
  private[spark] def decRecordsWritten(v: Long): Unit = {
    _recordsWritten.setValue(recordsWritten - v)
  }

  // Legacy methods for backward compatibility.
  // TODO: remove these once we make this class private.
  @deprecated("use bytesWritten instead", "2.0.0")
  def shuffleBytesWritten: Long = bytesWritten
  @deprecated("use writeTime instead", "2.0.0")
  def shuffleWriteTime: Long = writeTime
  @deprecated("use recordsWritten instead", "2.0.0")
  def shuffleRecordsWritten: Long = recordsWritten

} 
Example 8
Source File: ShuffleWriteMetrics.scala    From Spark-2.3.1    with Apache License 2.0
package org.apache.spark.executor

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.LongAccumulator


@DeveloperApi
class ShuffleWriteMetrics private[spark] () extends Serializable {

  // Each metric is backed by a LongAccumulator so that per-task updates made on executors
  // are merged into the driver-side TaskMetrics. The class header and backing fields
  // follow the upstream Apache Spark source.
  private[executor] val _bytesWritten = new LongAccumulator
  private[executor] val _recordsWritten = new LongAccumulator
  private[executor] val _writeTime = new LongAccumulator

  def bytesWritten: Long = _bytesWritten.sum
  def recordsWritten: Long = _recordsWritten.sum
  def writeTime: Long = _writeTime.sum

  private[spark] def incBytesWritten(v: Long): Unit = _bytesWritten.add(v)
  private[spark] def incRecordsWritten(v: Long): Unit = _recordsWritten.add(v)
  private[spark] def incWriteTime(v: Long): Unit = _writeTime.add(v)
  private[spark] def decBytesWritten(v: Long): Unit = {
    _bytesWritten.setValue(bytesWritten - v)
  }
  private[spark] def decRecordsWritten(v: Long): Unit = {
    _recordsWritten.setValue(recordsWritten - v)
  }

  // Legacy methods for backward compatibility.
  // TODO: remove these once we make this class private.
  @deprecated("use bytesWritten instead", "2.0.0")
  def shuffleBytesWritten: Long = bytesWritten
  @deprecated("use writeTime instead", "2.0.0")
  def shuffleWriteTime: Long = writeTime
  @deprecated("use recordsWritten instead", "2.0.0")
  def shuffleRecordsWritten: Long = recordsWritten

} 
Example 9
Source File: 7_RecoverableNetworkWordCount.scala    From wow-spark    with MIT License
package com.sev7e0.wow.spark_streaming

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}
import org.apache.spark.util.LongAccumulator
import org.apache.spark.{SparkConf, SparkContext}


object RecoverableNetworkWordCount {

  def main(args: Array[String]): Unit = {

    StreamingLogger.setLoggerLevel()

    val conf = new SparkConf().setMaster("local").setAppName(RecoverableNetworkWordCount.getClass.getName)
    val context = new StreamingContext(conf, Seconds(1))

    val linesDS = context.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_2)

    val wordsCounts = linesDS.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)

    wordsCounts.foreachRDD((rdd: RDD[(String, Int)], time: Time) => {
      val blackList = WordBlackList.getInstance(context.sparkContext)

      val accumulator = DropWordCounter.getInstance(context.sparkContext)

      val str = rdd.filter { case (word, count) =>
        if (blackList.value.contains(word)) {
          accumulator.add(count)
          false
        } else {
          true
        }
      }.collect().mkString("[", ", ", "]")
      println(s"str = $str")
    })
  }


}

object WordBlackList {

  @volatile private var instance: Broadcast[Seq[String]] = _

  def getInstance(context: SparkContext): Broadcast[Seq[String]] = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          val blackList = Seq("a", "b", "c")
          instance = context.broadcast(blackList)
        }
      }
    }
    instance
  }

}

object DropWordCounter {
  @volatile private var instance: LongAccumulator = _

  def getInstance(context: SparkContext): LongAccumulator = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = context.longAccumulator("WordCount")
        }
      }
    }
    instance
  }
} 
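The lazy, synchronized getInstance singletons above exist so the broadcast variable and the accumulator can be re-created on demand, which matters when a streaming application restarts from a checkpoint. This excerpt never sets up checkpointing; a hedged sketch of how the standard recoverable pattern wires it in (checkpoint directory and factory name are placeholders):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object RecoverableSetup {
  // Builds a fresh StreamingContext; only invoked when no checkpoint exists yet.
  def createContext(checkpointDir: String): StreamingContext = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("recoverable-word-count")
    val ssc = new StreamingContext(conf, Seconds(1))
    ssc.checkpoint(checkpointDir)
    // ... define the socket stream, word counts, and foreachRDD logic here ...
    ssc
  }

  def main(args: Array[String]): Unit = {
    val checkpointDir = "/tmp/recoverable-word-count"   // placeholder path
    // Restores the context (and its DStream lineage) from the checkpoint if present,
    // otherwise calls createContext; broadcasts and accumulators are then re-obtained
    // lazily through getInstance helpers like the ones shown above.
    val context = StreamingContext.getOrCreate(checkpointDir, () => createContext(checkpointDir))
    context.start()
    context.awaitTermination()
  }
}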
Example 10
Source File: GdeltTagger.scala    From Mastering-Spark-for-Data-Science    with MIT License
package io.gzet.tagging.gdelt

import java.text.SimpleDateFormat
import java.util.Date

import com.typesafe.config.ConfigFactory
import io.gzet.tagging.classifier.Classifier
import io.gzet.tagging.html.HtmlHandler
import io.gzet.tagging.html.HtmlHandler.Content
import org.apache.spark.Accumulator
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.util.LongAccumulator
import org.elasticsearch.spark._

class GdeltTagger() extends Serializable {

  val config = ConfigFactory.load().getConfig("io.gzet.kappa")
  val isoSdf = "yyyy-MM-dd HH:mm:ss"
  val esIndex = config.getString("gdeltIndex")
  val vectorSize = config.getInt("vectorSize")
  val minProba = config.getDouble("minProba")

  def predict(gdeltStream: DStream[String], batchId: LongAccumulator) = {

    // Extract HTML content
    val gdeltContent = fetchHtmlContent(gdeltStream)

    // Predict each RDD
    gdeltContent foreachRDD { batch =>

      batch.cache()
      val count = batch.count()

      if (count > 0) {

        if (Classifier.model.isDefined) {
          val labels = Classifier.model.get.labels

          // Predict HashTags using latest Twitter model
          val textRdd = batch.map(_.body.get)
          val predictions = Classifier.predictProbabilities(textRdd)
          val taggedGdelt = batch.zip(predictions) map { case (content, probabilities) =>
            val validLabels = probabilities filter { case (label, probability) =>
              probability > minProba
            }

            val labels = validLabels.toSeq
              .sortBy(_._2)
              .reverse
              .map(_._1)

            (content, labels)
          }

          // Saving articles to Elasticsearch
          taggedGdelt map { case (content, hashTags) =>
            gdeltToJson(content, hashTags.toArray)
          } saveToEs esIndex

        } else {

          // Saving articles to Elasticsearch
          batch map { content =>
            gdeltToJson(content, Array())
          } saveToEs esIndex
        }

      }

      batch.unpersist(blocking = false)
    }
  }

  private def gdeltToJson(content: Content, hashTags: Array[String]) = {
    val sdf = new SimpleDateFormat(isoSdf)
    Map(
      "time" -> sdf.format(new Date()),
      "body" -> content.body.get,
      "url" -> content.url,
      "tags" -> hashTags,
      "title" -> content.title
    )
  }

  private def fetchHtmlContent(urlStream: DStream[String]) = {
    urlStream.map(_ -> 1).groupByKey().map(_._1) mapPartitions { urls =>
      val sdf = new SimpleDateFormat(isoSdf)
      val htmlHandler = new HtmlHandler()
      val goose = htmlHandler.getGooseScraper
      urls map { url =>
        htmlHandler.fetchUrl(goose, url, sdf)
      }
    } filter { content =>
      content.isDefined &&
        content.get.body.isDefined &&
        content.get.body.get.length > 255
    } map { content =>
      content.get
    }
  }
}
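A hedged sketch of how GdeltTagger.predict might be driven, assuming the project's Typesafe config and Elasticsearch connection settings are already in place; the socket source, batch interval, and accumulator name are placeholders.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import io.gzet.tagging.gdelt.GdeltTagger

object GdeltTaggingJob {
  def main(args: Array[String]): Unit = {
    // Elasticsearch settings for saveToEs are assumed to be configured on the SparkConf elsewhere.
    val conf = new SparkConf().setAppName("gdelt-tagging").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(60))

    // Placeholder source: one GDELT article URL per line.
    val gdeltUrls = ssc.socketTextStream("localhost", 9999)

    // LongAccumulator passed in as a shared batch counter.
    val batchId = ssc.sparkContext.longAccumulator("batchId")

    new GdeltTagger().predict(gdeltUrls, batchId)

    ssc.start()
    ssc.awaitTermination()
  }
}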