scala.collection.parallel.ForkJoinTaskSupport Scala Examples

The following examples show how to use scala.collection.parallel.ForkJoinTaskSupport. Each example is taken from an open-source project; the source file, project, and license are listed above each listing.
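Before the project-specific examples, here is a minimal, self-contained sketch of the pattern they all share: create a ForkJoinPool with an explicit parallelism level, wrap it in a ForkJoinTaskSupport, and assign it to a parallel collection's tasksupport so that collection's operations run on the dedicated pool instead of the global default. The pool size, object name, and per-element work below are arbitrary choices for illustration, not taken from any of the projects.

import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool

object ForkJoinTaskSupportSketch {
  def main(args: Array[String]): Unit = {
    // A dedicated pool with 4 worker threads (arbitrary size for this sketch).
    val pool = new ForkJoinPool(4)

    // Any parallel collection works; a parallel range keeps the sketch small.
    val numbers = (1 to 16).par
    // Route this collection's parallel operations to the dedicated pool.
    numbers.tasksupport = new ForkJoinTaskSupport(pool)

    val squares = numbers.map(n => n * n).seq.sorted
    println(squares.mkString(", "))

    // Shut the pool down once no parallel collection uses it any more.
    pool.shutdown()
  }
}

On Scala 2.12 and later, scala.concurrent.forkjoin.ForkJoinPool is only a deprecated alias for java.util.concurrent.ForkJoinPool, so newer code can use the java.util.concurrent class directly.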
Example 1
Source File: UnionRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 2
Source File: COCOSeqFileGenerator.scala    From BigDL   with Apache License 2.0
package com.intel.analytics.bigdl.models.utils

import com.intel.analytics.bigdl.dataset.segmentation.{COCODataset, COCOSerializeContext}
import java.io.File
import java.nio.file.{Files, Paths}
import java.util.concurrent.atomic.AtomicInteger
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.SequenceFile.Writer
import org.apache.hadoop.io.compress.BZip2Codec
import org.apache.hadoop.io.{BytesWritable, SequenceFile}
import scala.collection.parallel.ForkJoinTaskSupport
import scopt.OptionParser

object COCOSeqFileGenerator {

  
  case class COCOSeqFileGeneratorParams(
    folder: String = ".",
    metaPath: String = "instances_val2014.json",
    output: String = ".",
    parallel: Int = 1,
    blockSize: Int = 12800
  )

  private val parser = new OptionParser[COCOSeqFileGeneratorParams]("BigDL COCO " +
    "Sequence File Generator") {
    head("BigDL COCO Sequence File Generator")
    opt[String]('f', "folder")
      .text("where you put the COCO image files")
      .action((x, c) => c.copy(folder = x))
    opt[String]('o', "output folder")
      .text("where you put the generated seq files")
      .action((x, c) => c.copy(output = x))
    opt[Int]('p', "parallel")
      .text("parallel num")
      .action((x, c) => c.copy(parallel = x))
    opt[Int]('b', "blockSize")
      .text("block size")
      .action((x, c) => c.copy(blockSize = x))
    opt[String]('m', "metaPath")
      .text("metadata json file path")
      .action((x, c) => c.copy(metaPath = x))
  }

  def main(args: Array[String]): Unit = {
    parser.parse(args, COCOSeqFileGeneratorParams()).foreach { param =>
      println("Loading COCO metadata")
      val meta = COCODataset.load(param.metaPath, param.folder)
      println("Metadata loaded")
      val conf: Configuration = new Configuration
      val doneCount = new AtomicInteger(0)
      val tasks = meta.images.filter(img => {
        val path = img.path
        val valid = Files.exists(path) && !Files.isDirectory(path)
        if (!valid) {
          System.err.print(s"[Warning] The image file ${path.getFileName} does not exist.\n")
        }
        valid
      }).grouped(param.blockSize).zipWithIndex.toArray.par
      tasks.tasksupport = new ForkJoinTaskSupport(
        new scala.concurrent.forkjoin.ForkJoinPool(param.parallel))
      tasks.foreach { case (imgs, blkId) =>
        val outFile = new Path(param.output, s"coco-seq-$blkId.seq")
        val key = new BytesWritable
        val value = new BytesWritable
        val writer = SequenceFile.createWriter(conf, Writer.file(outFile), Writer.keyClass(key
          .getClass), Writer.valueClass(value.getClass), Writer.compression(SequenceFile
          .CompressionType.BLOCK, new BZip2Codec))
        val context = new COCOSerializeContext
        imgs.foreach { img =>
          context.clear()
          context.dump(img.fileName)
          img.dumpTo(context)
          context.dump(COCODataset.MAGIC_NUM)
          val keyBytes = context.toByteArray
          key.set(keyBytes, 0, keyBytes.length)
          val bytes = img.data
          value.set(bytes, 0, bytes.length)
          writer.append(key, value)
          val cnt = doneCount.incrementAndGet()
          if (cnt % 500 == 0) {
            System.err.print(s"\r$cnt / ${meta.images.length} = ${cnt.toFloat/meta.images.length}")
          }
        }
        writer.close()
      }
      System.err.print("\n")
    }
  }
} 
Example 3
Source File: SparkLaunch.scala    From spark-bench   with Apache License 2.0
package com.ibm.sparktc.sparkbench.sparklaunch

import com.typesafe.config._
import java.io.File

import com.ibm.sparktc.sparkbench.sparklaunch.confparse.{ConfigWrangler, SparkJobConf}
import com.ibm.sparktc.sparkbench.sparklaunch.submission.livy.LivySubmit
import com.ibm.sparktc.sparkbench.sparklaunch.submission.sparksubmit.SparkSubmit

import scala.collection.parallel.ForkJoinTaskSupport
import scala.collection.JavaConverters._
import scala.util.Try

object SparkLaunch extends App {

  override def main(args: Array[String]): Unit = {
    assert(args.nonEmpty)
    val path = args.head
    val (confSeq: Seq[SparkJobConf], parallel: Boolean) = mkConfs(new File(path))

    launchJobs(confSeq, parallel)
  }

  def mkConfs(file: File): (Seq[SparkJobConf], Boolean) = {
    val config: Config = ConfigFactory.parseFile(file)
    val sparkBenchConfig = config.getObject("spark-bench").toConfig
    val confs: Seq[SparkJobConf] = ConfigWrangler(file)
    val parallel = Try(sparkBenchConfig.getBoolean("spark-submit-parallel")).getOrElse(false)
    (confs, parallel)
  }

  private def getConfigListByName(name: String, config: Config): List[Config] = {
    val workloadObjs: Iterable[ConfigObject] = config.getObjectList(name).asScala
    workloadObjs.map(_.toConfig).toList
  }

  def launchJobs(confSeq: Seq[SparkJobConf], parallel: Boolean): Unit = {

    def launch(conf: SparkJobConf): Unit = conf.submissionParams match {
      case s if ConfigWrangler.isLivySubmit(s) => LivySubmit().launch(conf)
      case s if ConfigWrangler.isSparkSubmit(s) => SparkSubmit.launch(conf)
    }

    if (parallel) {
      val confSeqPar = confSeq.par
      confSeqPar.tasksupport = new ForkJoinTaskSupport(new scala.concurrent.forkjoin.ForkJoinPool(confSeqPar.size))
      confSeqPar.foreach(launch)
    } else confSeq.foreach(launch)
  }
} 
Example 4
Source File: SuiteKickoff.scala    From spark-bench   with Apache License 2.0
package com.ibm.sparktc.sparkbench.workload

import com.ibm.sparktc.sparkbench.utils.SparkFuncs._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.functions.{col, lit}

import scala.collection.parallel.ForkJoinTaskSupport



object SuiteKickoff {
  private val log = org.slf4j.LoggerFactory.getLogger(getClass)

  def run(s: Suite, spark: SparkSession): Unit = {
    verifyOutput(s.benchmarkOutput, s.saveMode, spark)

    // Translate the maps into runnable workloads
    val workloads: Seq[Workload] = s.workloadConfigs.map(ConfigCreator.mapToConf)

    val dataframes: Seq[DataFrame] = (0 until s.repeat).flatMap { i =>
      // This will produce one DataFrame of one row for each workload in the sequence.
      // We're going to produce one coherent DF later from these
      val dfSeqFromOneRun: Seq[DataFrame] = {
        if (s.parallel) runParallel(workloads, spark)
        else runSerially(workloads, spark)
      }
      // Indicate which run of this suite this was.
      dfSeqFromOneRun.map(_.withColumn("run", lit(i)))
    }

    // getting the Spark confs so we can output them in the results.
    val strSparkConfs = spark.conf.getAll

    // Ah, see, here's where we're joining that series of one-row DFs
    val singleDF = joinDataFrames(dataframes, spark)
    s.description.foreach(log.info)
    // And now we're going to curry in the results
    val plusSparkConf = addConfToResults(singleDF, strSparkConfs)
    val plusDescription = addConfToResults(plusSparkConf, Map("description" -> s.description)).coalesce(1)
    // And write to disk. We're done with this suite!
    if(s.benchmarkOutput.nonEmpty) writeToDisk(s.benchmarkOutput.get, s.saveMode, plusDescription, spark)
  }

  private def runParallel(workloadConfigs: Seq[Workload], spark: SparkSession): Seq[DataFrame] = {
    val confSeqPar = workloadConfigs.par
    confSeqPar.tasksupport = new ForkJoinTaskSupport(new scala.concurrent.forkjoin.ForkJoinPool(confSeqPar.size))
    confSeqPar.map(_.run(spark)).seq
  }

  private def runSerially(workloadConfigs: Seq[Workload], spark: SparkSession): Seq[DataFrame] = {
    workloadConfigs.map(_.run(spark))
  }

  private def joinDataFrames(seq: Seq[DataFrame], spark: SparkSession): DataFrame = {
    if (seq.length == 1) seq.head
    else {
      val seqOfColNames = seq.map(_.columns.toSet)
      val allTheColumns = seqOfColNames.foldLeft(Set[String]())(_ ++ _)

      def expr(myCols: Set[String], allCols: Set[String]) = {
        allCols.toList.map {
          case x if myCols.contains(x) => col(x)
          case x => lit(null).as(x)
        }
      }

      val seqFixedDfs = seq.map(df => df.select(expr(df.columns.toSet, allTheColumns): _*))

      // Folding left across this sequence should be fine because each DF should only have 1 row
      // Nevarr Evarr do this to legit dataframes that are all like big and stuff
      seqFixedDfs.foldLeft(spark.createDataFrame(spark.sparkContext.emptyRDD[Row], seqFixedDfs.head.schema))(_ union _)
    }
  }
} 
Example 5
Source File: MultipleSuiteKickoff.scala    From spark-bench   with Apache License 2.0
package com.ibm.sparktc.sparkbench.workload

import org.apache.spark.sql.SparkSession

import scala.collection.parallel.ForkJoinTaskSupport

object MultipleSuiteKickoff {
  def run(seq: Seq[MultiSuiteRunConfig]): Unit = seq.foreach { contextConf =>
    val spark = createSparkContext(seq)
    if (contextConf.suitesParallel) runSuitesInParallel(contextConf.suites, spark)
    else runSuitesSerially(contextConf.suites, spark)
  }

  private def runSuitesInParallel(suiteSeq: Seq[Suite], spark: SparkSession): Unit = {
    val parallelSeq = suiteSeq.par
    parallelSeq.tasksupport = new ForkJoinTaskSupport(new scala.concurrent.forkjoin.ForkJoinPool(parallelSeq.size))
    parallelSeq.foreach(suite => SuiteKickoff.run(suite, spark))
  }

  private def runSuitesSerially(suiteSeq: Seq[Suite], spark: SparkSession): Unit =
    suiteSeq.foreach(SuiteKickoff.run(_, spark))

  private def createSparkContext(configs: Seq[MultiSuiteRunConfig]): SparkSession = {
    val builder = SparkSession.builder
    // if any configs have hive enabled, enable it for all
    val builderWithHive = if (configs.exists(_.enableHive)) builder.enableHiveSupport else builder
    builderWithHive.getOrCreate
  }
} 
Example 6
Source File: UnionRDD.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 7
Source File: TestThreading.scala    From eidos   with Apache License 2.0
package org.clulab.wm.eidos.utils

import org.clulab.wm.eidos.test.TestUtils._

import scala.collection.parallel.ForkJoinTaskSupport

class TestThreading extends Test {
  val threads = 26
  val numbers = 0.until(threads)
  val parNumbers = numbers.par
  val forkJoinPool = ThreadUtils.newForkJoinPool(threads)
  val forkJoinTaskSupport = new ForkJoinTaskSupport(forkJoinPool)

  // This should compile and output is not in order.
  parNumbers.tasksupport = forkJoinTaskSupport
  parNumbers.foreach { number =>
    println(number)
  }
} 
Example 8
Source File: UnionRDD.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 9
Source File: UnionRDD.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 10
Source File: ProxyChecker.scala    From ProxyCrawler   with Apache License 2.0
package org.crowdcrawler.proxycrawler.checker

import java.io.IOException
import java.net.SocketTimeoutException

import com.typesafe.scalalogging.Logger
import org.apache.http.annotation.ThreadSafe
import org.apache.http.conn.ConnectTimeoutException
import org.crowdcrawler.proxycrawler.ProxyInfo
import org.slf4j.LoggerFactory

import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool


@ThreadSafe
object ProxyChecker {
  private val LOGGER = Logger(LoggerFactory.getLogger(ProxyChecker.getClass.getName))
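
  // NOTE: the public entry point is elided in this listing. The method below is an assumed
  // sketch of how the private check() is fanned out over a parallel collection whose
  // tasksupport is backed by a dedicated ForkJoinPool (method name and pool size are guesses).
  def check(proxyList: List[ProxyInfo]): List[ProxyInfo] = {
    val parProxies = proxyList.par
    parProxies.tasksupport = new ForkJoinTaskSupport(
      new ForkJoinPool(Runtime.getRuntime.availableProcessors()))
    parProxies.map(p => check(p)).toList
  }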


  
  private def check(proxyInfo: ProxyInfo): ProxyInfo = {
    val start = System.currentTimeMillis
    try {
      LOGGER.info("Executing request via proxy " + proxyInfo)
      val (statusCode, bytes) = proxyInfo.schema match {
        case "HTTP" =>
          HttpProxyChecker.check(proxyInfo.host, proxyInfo.port)
        case "HTTPS" =>
          HttpsProxyChecker.check(proxyInfo.host, proxyInfo.port)
        case "SOCKS" | "SOCKS4" | "SOCKS5" =>
          SocksProxyChecker.check(proxyInfo.host, proxyInfo.port)
        case other => throw new IllegalArgumentException("Unsupported schema " + other)
      }
      val end = System.currentTimeMillis
      LOGGER.info("Time elapsed " + (end - start) + " milliseconds")

      if (statusCode != 200) {
        LOGGER.error("HTTP status code is " + statusCode)
        ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, -1, proxyInfo.location, proxyInfo.from)
      } else {
        if (bytes > 0) {
          val speed = (bytes / ((end - start) / 1000.0)).toInt
          LOGGER.info("Speed is " + speed + " bytes/s")
          ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, speed, proxyInfo.location, proxyInfo.from)
        } else {
          LOGGER.error("HTTP status code is 200 but the proxy failed to retrieve HTML source code")
          if (proxyInfo.speed >= 0) {
            ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, -1, proxyInfo.location, proxyInfo.from)
          } else {
            ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, proxyInfo.speed - 1,
              proxyInfo.location, proxyInfo.from)
          }
        }
      }
    } catch {
      case e: IOException =>
        val end = System.currentTimeMillis
        if (e.isInstanceOf[ConnectTimeoutException] || e.isInstanceOf[SocketTimeoutException]) {
          LOGGER.info(e.getClass.getName + " : " + e.getMessage)
          LOGGER.info("Time elapsed " + (end - start) + " milliseconds")
        } else {
          LOGGER.error(e.getClass.getName + " : " + e.getMessage)
          LOGGER.error("Time elapsed " + (end - start) + " milliseconds")
        }

        if (proxyInfo.speed >= 0) {
          ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, -1, proxyInfo.location, proxyInfo.from)
        } else {
          ProxyInfo(proxyInfo.host, proxyInfo.port, proxyInfo.schema, proxyInfo.speed - 1,
            proxyInfo.location, proxyInfo.from)
        }
    }
  }

} 
Example 11
Source File: GetSourceMetadataAction.scala    From berilia   with Apache License 2.0
package com.criteo.dev.cluster.source

import com.criteo.dev.cluster.Node
import com.criteo.dev.cluster.config.{GlobalConfig, TableConfig}
import com.criteo.dev.cluster.copy.GetMetadataAction

import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool
import scala.util.{Failure, Success, Try}
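
// NOTE: the enclosing class declaration (and its doc comment) is omitted in this listing;
// the signature below is an assumed reconstruction from the config and node fields used
// in the method body.
class GetSourceMetadataAction(config: GlobalConfig, node: Node) {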


  def apply(tables: List[TableConfig], useLocalScheme: Boolean = config.source.isLocalScheme): List[Either[InvalidTable, FullTableInfo]] = {
    val conf = config.backCompat
    val getMetadata = new GetMetadataAction(config, conf, node)

    // configure parallel execution
    val parTables = tables.par
    parTables.tasksupport = new ForkJoinTaskSupport(new ForkJoinPool(config.source.parallelism.table))
    val (validTables, invalidTables) = parTables
      .map { table =>
        val (tableName, spec) = (table.name, (table.name :: table.partitions.map(_.mkString("(", ",", ")")).mkString(" ") :: Nil).mkString(" "))
        (tableName, spec, Try(getMetadata(spec, useLocalScheme)))
      }
      .toList
      .partition(_._3.isSuccess)
    val tableAndLocations = validTables
      .flatMap { case (_, _, Success(m)) =>
        if (m.partitions.size > 0)
          m.partitions.map(p => (m, p.location))
        else
          List((m, m.ddl.location.get))
      }
    tableAndLocations
      .zip(
        if (useLocalScheme)
          HDFSUtils.getFileSize(tableAndLocations.map(_._2))
        else
          HDFSUtils.getFileSize(tableAndLocations.map(_._2), node)
      )
      .groupBy { case ((m, _), _) => m }
      .foldLeft(List.empty[FullTableInfo]) { case (acc, (table, results)) =>
        FullTableInfo(
          table,
          TableHDFSInfo(
            table.database,
            table.ddl.table,
            results.map(_._2).sum,
            results.map(r => HDFSFileInfo(
              r._1._2,
              r._2
            )),
            table.partitions.size
          )
        ) :: acc
      }
      .map(Right(_)) ++
      invalidTables.map { case (tableName, spec, Failure(e)) =>
        Left(InvalidTable(tableName, spec, e.getMessage))
      }
  }
} 
Example 12
Source File: GetMetadataAction.scala    From berilia   with Apache License 2.0
package com.criteo.dev.cluster.copy

import com.criteo.dev.cluster.Node
import com.criteo.dev.cluster.config.GlobalConfig
import org.slf4j.LoggerFactory

import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool


class GetMetadataAction(config: GlobalConfig, conf: Map[String, String], node : Node, throttle: Boolean = true) {

  private val logger = LoggerFactory.getLogger(classOf[GetMetadataAction])

  def apply(dbTablePartSpec: String, useLocalScheme: Boolean = config.source.isLocalScheme) : TableInfo = {
    //parse the configured source tables of form "$db.$table (part1=$part1, part2=$part2) (part1=$part1, part2=$part3)"
    val regex = """(\S*)\.(\S*)\s*(.*)""".r

    dbTablePartSpec match {
      case regex(db, table, partSpec) => {
        //1. Get the table metadata, like location, isPartitioned, and createStmt.
        val getTableMetadataAction = new GetTableMetadataAction(conf, node, useLocalScheme)
        val createTable = getTableMetadataAction(db, table)

        //2.  If partitioned, get the list of partitions.
        val partitionList: Array[String] =
          if (createTable.partitionedBy.length != 0) {
            if (partSpec.isEmpty) {
              ListPartitionAction(conf, node, useLocalScheme, db, table, None, throttle)
            } else {
              val parenRegex = """\((.*?)\)""".r
              val parPartSpecs = parenRegex.findAllIn(partSpec).toList.par
              parPartSpecs.tasksupport = new ForkJoinTaskSupport(new ForkJoinPool(config.source.parallelism.partition))
              parPartSpecs.flatMap(p =>
                ListPartitionAction(conf, node, useLocalScheme, db, table, Some(p), throttle)
              ).distinct.toArray
            }
          } else {
            Array.empty[String]
          }

        //3.  Get partitionSpec in model form.
        val partitionSpecList: Array[PartSpec] = partitionList.map(s => {
          CopyUtilities.getPartInfos(s)
        })

        //4.  Get partition locations as well
        val getPartitionAction = new GetPartitionMetadataAction(conf, node, useLocalScheme)
        val partitions = getPartitionAction(db, table, createTable, partitionSpecList)
        TableInfo(db, createTable.table, createTable, partitions)
      }
      case _ => throw new IllegalArgumentException(s"Cannot parse ${CopyConstants.sourceTables}: $dbTablePartSpec.  " +
        "Make sure it is of form $db.$table $partition, where $partition is optional and of form (part1='val1', part2='val2').")
    }
  }
}