java.io.ObjectOutputStream Scala Examples

The following examples show how to use java.io.ObjectOutputStream. Each example is taken from an open-source project; the source file, project, and license are listed above it.
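Before the project-specific examples, here is a minimal, self-contained round trip illustrating the basic pattern they all build on: wrap a stream in an ObjectOutputStream to write a Serializable object graph, then read it back through an ObjectInputStream. The Greeting class and RoundTrip object below are purely illustrative.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

// Any Serializable value (case classes are Serializable by default) can be
// written to an ObjectOutputStream and restored later.
case class Greeting(message: String)

object RoundTrip extends App {
  val baos = new ByteArrayOutputStream()
  val oos  = new ObjectOutputStream(baos)
  oos.writeObject(Greeting("hello"))                        // serialize the object graph
  oos.close()

  val ois = new ObjectInputStream(new ByteArrayInputStream(baos.toByteArray))
  val restored = ois.readObject().asInstanceOf[Greeting]    // deserialize and cast back
  ois.close()
  println(restored.message)                                 // prints "hello"
}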
Example 1
Source File: QueueInputDStream.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.streaming.dstream

import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.{StreamingContext, Time}

private[streaming]
class QueueInputDStream[T: ClassTag](
    ssc: StreamingContext,
    val queue: Queue[RDD[T]],
    oneAtATime: Boolean,
    defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  override def start() { }

  override def stop() { }

  private def readObject(in: ObjectInputStream): Unit = {
    throw new NotSerializableException("queueStream doesn't support checkpointing. " +
      "Please don't use queueStream when checkpointing is enabled.")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        buffer ++= queue
        queue.clear()
      }
    }
    if (buffer.nonEmpty) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        Some(new UnionRDD(context.sc, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }

} 
Example 2
Source File: DStreamCheckpointData.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.streaming.dstream

import java.io.{IOException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.internal.Logging
import org.apache.spark.streaming.Time
import org.apache.spark.util.Utils

private[streaming]
class DStreamCheckpointData[T: ClassTag](dstream: DStream[T])
  extends Serializable with Logging {
  protected val data = new HashMap[Time, AnyRef]()

  // Mapping of the batch time to the checkpointed RDD file of that time
  @transient private var timeToCheckpointFile = new HashMap[Time, String]
  // Mapping of the batch time to the time of the oldest checkpointed RDD
  // in that batch's checkpoint data
  @transient private var timeToOldestCheckpointFileTime = new HashMap[Time, Time]

  @transient private var fileSystem: FileSystem = null
  protected[streaming] def currentCheckpointFiles = data.asInstanceOf[HashMap[Time, String]]

  
  def restore() {
    // Create RDDs from the checkpoint data
    currentCheckpointFiles.foreach {
      case(time, file) =>
        logInfo("Restoring checkpointed RDD for time " + time + " from file '" + file + "'")
        dstream.generatedRDDs += ((time, dstream.context.sparkContext.checkpointFile[T](file)))
    }
  }

  override def toString: String = {
    "[\n" + currentCheckpointFiles.size + " checkpoint files \n" +
      currentCheckpointFiles.mkString("\n") + "\n]"
  }

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    logDebug(this.getClass().getSimpleName + ".writeObject used")
    if (dstream.context.graph != null) {
      dstream.context.graph.synchronized {
        if (dstream.context.graph.checkpointInProgress) {
          oos.defaultWriteObject()
        } else {
          val msg = "Object of " + this.getClass.getName + " is being serialized " +
            " possibly as a part of closure of an RDD operation. This is because " +
            " the DStream object is being referred to from within the closure. " +
            " Please rewrite the RDD operation inside this DStream to avoid this. " +
            " This has been enforced to avoid bloating of Spark tasks " +
            " with unnecessary objects."
          throw new java.io.NotSerializableException(msg)
        }
      }
    } else {
      throw new java.io.NotSerializableException(
        "Graph is unexpectedly null when DStream is being serialized.")
    }
  }

  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    logDebug(this.getClass().getSimpleName + ".readObject used")
    ois.defaultReadObject()
    timeToOldestCheckpointFileTime = new HashMap[Time, Time]
    timeToCheckpointFile = new HashMap[Time, String]
  }
} 
Example 3
Source File: SerializableConfiguration.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.conf.Configuration

private[spark]
class SerializableConfiguration(@transient var value: Configuration) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new Configuration(false)
    value.readFields(in)
  }
} 
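Hadoop's Configuration is Writable but not java.io.Serializable, so this thin wrapper is what allows a configuration to ride along inside Spark closures or broadcasts. A small usage sketch follows, assuming a live SparkContext named sc and an existing Configuration hadoopConf; because SerializableConfiguration is private[spark], code like this would have to sit under the org.apache.spark namespace (or copy the wrapper), and the package and object names here are hypothetical.

package org.apache.spark.example   // hypothetical package, only so private[spark] members are visible

import org.apache.hadoop.conf.Configuration
import org.apache.spark.SparkContext
import org.apache.spark.util.SerializableConfiguration

object ConfInClosure {
  // Wrap once on the driver; each executor rebuilds the Configuration via readObject.
  def defaultFsPerRecord(sc: SparkContext, hadoopConf: Configuration): Array[(Int, String)] = {
    val serConf = new SerializableConfiguration(hadoopConf)
    sc.parallelize(1 to 3).map(i => (i, serConf.value.get("fs.defaultFS"))).collect()
  }
}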
Example 4
Source File: SerializableJobConf.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.mapred.JobConf

private[spark]
class SerializableJobConf(@transient var value: JobConf) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new JobConf(false)
    value.readFields(in)
  }
} 
Example 5
Source File: SerializableBuffer.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels


private[spark]
class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
} 
Example 6
Source File: CartesianRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.util.Utils

private[spark]
class CartesianPartition(
    idx: Int,
    @transient private val rdd1: RDD[_],
    @transient private val rdd2: RDD[_],
    s1Index: Int,
    s2Index: Int
  ) extends Partition {
  var s1 = rdd1.partitions(s1Index)
  var s2 = rdd2.partitions(s2Index)
  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    s1 = rdd1.partitions(s1Index)
    s2 = rdd2.partitions(s2Index)
    oos.defaultWriteObject()
  }
}

private[spark]
class CartesianRDD[T: ClassTag, U: ClassTag](
    sc: SparkContext,
    var rdd1 : RDD[T],
    var rdd2 : RDD[U])
  extends RDD[(T, U)](sc, Nil)
  with Serializable {

  val numPartitionsInRdd2 = rdd2.partitions.length

  override def getPartitions: Array[Partition] = {
    // create the cross product split
    val array = new Array[Partition](rdd1.partitions.length * rdd2.partitions.length)
    for (s1 <- rdd1.partitions; s2 <- rdd2.partitions) {
      val idx = s1.index * numPartitionsInRdd2 + s2.index
      array(idx) = new CartesianPartition(idx, rdd1, rdd2, s1.index, s2.index)
    }
    array
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    (rdd1.preferredLocations(currSplit.s1) ++ rdd2.preferredLocations(currSplit.s2)).distinct
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(T, U)] = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    for (x <- rdd1.iterator(currSplit.s1, context);
         y <- rdd2.iterator(currSplit.s2, context)) yield (x, y)
  }

  override def getDependencies: Seq[Dependency[_]] = List(
    new NarrowDependency(rdd1) {
      def getParents(id: Int): Seq[Int] = List(id / numPartitionsInRdd2)
    },
    new NarrowDependency(rdd2) {
      def getParents(id: Int): Seq[Int] = List(id % numPartitionsInRdd2)
    }
  )

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
  }
} 
Example 7
Source File: UnionRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 8
Source File: PartitionerAwareUnionRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils


private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.nonEmpty)
  require(rdds.forall(_.partitioner.isDefined))
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  override val partitioner = rdds.head.partitioner

  override def getPartitions: Array[Partition] = {
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map { index =>
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }.toArray
  }

  // Get the location where most of the partitions of parent RDDs are located
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    val locations = rdds.zip(parentPartitions).flatMap {
      case (rdd, part) =>
        val parentLocations = currPrefLocs(rdd, part)
        logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations)
        parentLocations
    }
    val location = if (locations.isEmpty) {
      None
    } else {
      // Find the location that maximum number of parent partitions prefer
      Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
    }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host)
  }
} 
Example 9
Source File: DnnStorage.scala    From BigDL   with Apache License 2.0
package com.intel.analytics.bigdl.tensor

import java.io.{IOException, ObjectInputStream, ObjectOutputStream}
import com.intel.analytics.bigdl.mkl.Memory
import com.intel.analytics.bigdl.nn.mkldnn.MemoryOwner
import scala.reflect._


private[bigdl] class Pointer(val address: Long)

object DnnStorage {
  private[tensor] val CACHE_LINE_SIZE = System.getProperty("bigdl.cache.line", "64").toInt
  private[bigdl] val FLOAT_BYTES: Int = 4
  private[bigdl] val INT8_BYTES: Int = 1
  private[bigdl] val INT_BYTES: Int = 4

  import java.util.concurrent.ConcurrentHashMap
  private val nativeStorages: ConcurrentHashMap[Long, Boolean] = new ConcurrentHashMap()

  def checkAndSet(pointer: Long): Boolean = {
    nativeStorages.replace(pointer, false, true)
  }

  def add(key: Long): Unit = {
    nativeStorages.put(key, false)
  }

  def get(): Map[Long, Boolean] = {
    import scala.collection.JavaConverters._
    nativeStorages.asScala.toMap
  }
} 
Example 10
Source File: JavaSerializationConverter.scala    From scala-serialization   with MIT License
package com.komanov.serialization.converters

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import com.komanov.serialization.converters.IoUtils.using
import com.komanov.serialization.domain.{Site, SiteEvent, SiteEventData}

object JavaSerializationConverter extends MyConverter {

  override def toByteArray(site: Site): Array[Byte] = {
    using(new ByteArrayOutputStream()) { baos =>
      using(new ObjectOutputStream(baos)) { os =>
        os.writeObject(site)
        os.flush()
        baos.toByteArray
      }
    }
  }

  override def fromByteArray(bytes: Array[Byte]): Site = {
    using(new ByteArrayInputStream(bytes)) { bais =>
      using(new ObjectInputStream(bais)) { os =>
        os.readObject().asInstanceOf[Site]
      }
    }
  }

  override def toByteArray(event: SiteEvent): Array[Byte] = {
    using(new ByteArrayOutputStream()) { baos =>
      using(new ObjectOutputStream(baos)) { os =>
        os.writeObject(event)
        os.flush()
        baos.toByteArray
      }
    }
  }

  override def siteEventFromByteArray(clazz: Class[_], bytes: Array[Byte]): SiteEvent = {
    using(new ByteArrayInputStream(bytes)) { bais =>
      using(new ObjectInputStream(bais)) { os =>
        os.readObject().asInstanceOf[SiteEvent]
      }
    }
  }

} 
Example 11
Source File: TestableQueueInputDStream.scala    From SparkUnitTestingExamples   with Apache License 2.0
package org.apache.spark.streaming

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.dstream.InputDStream

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

class TestableQueueInputDStream[T: ClassTag](
                                              ssc: StreamingContext,
                                              val queue: Queue[RDD[T]],
                                              oneAtATime: Boolean,
                                              defaultRDD: RDD[T]
                                              ) extends InputDStream[T](ssc) {

  override def start() { }

  override def stop() { }

  private def readObject(in: ObjectInputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        buffer ++= queue
        queue.clear()
      }
    }
    if (buffer.nonEmpty) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        Some(new UnionRDD(context.sc, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }

} 
Example 12
Source File: File.scala    From nescala   with GNU General Public License v2.0
package com.owlandrews.nescala.helpers

import com.owlandrews.nescala.Console

object File {
  import java.io.File
  import java.net.URL
  import java.io.{FileFilter, FileInputStream, FileOutputStream, ObjectInputStream, ObjectOutputStream}
  import javax.imageio.ImageIO

  import scala.util.Try
  import scala.xml.XML
  import scala.language.postfixOps

  import sys.process._

  import com.typesafe.config.ConfigFactory

  def Download(url: String, filename: String) = (for{
    url <- Try(new URL(url))
    conn <- Try(url.openConnection().connect())
    file <- Try(new File(filename))
  } yield Try(url  #> file !!)) map {x => new File(filename)}

  def Writer(filename: String)(op: java.io.PrintWriter => Unit) = {
    val p = new java.io.PrintWriter(new File(filename))
    try op(p)
    finally p.close()
  }

  def Write(filename: String, content: String) = {
    val res = new java.io.PrintWriter(new File(filename))
    res.write(content)
    res.close()
  }

  def Filter = new FileFilter {
    override def accept(pathname: File): Boolean = pathname.getName.toLowerCase.endsWith(".nes")
  }

  def Image(file:Try[File]) = file.map(ImageIO.read)

  def Image(filename:String) = Try(ImageIO.read(resource(filename)))

  def Xml(filename:String) = XML.load(resource("/database.xml"))

  def Config(filename:String) = {
    val file = new File(filename)
    file.exists() match {
      case true => ConfigFactory.parseFile(file)
      case false => ConfigFactory.empty()
    }
  }

  def SaveState(console:Console) = {
    val fos = new FileOutputStream(s"$ApplicationFolder/${console.cartridge.CRC}.save")
    val oos = new ObjectOutputStream(fos)

    oos.writeObject(console)
    oos.close()
  }

  def LoadState(crc:String):Try[Console] = Try {
    val fis = new FileInputStream(s"$ApplicationFolder/$crc.save")
    val ois = new ObjectInputStreamWithCustomClassLoader(fis)

    val console = ois.readObject.asInstanceOf[Console]
    ois.close()
    console
  }

  // Taken from: https://gist.github.com/ramn/5566596
  private class ObjectInputStreamWithCustomClassLoader(fileInputStream: FileInputStream) extends ObjectInputStream(fileInputStream) {
    override def resolveClass(desc: java.io.ObjectStreamClass): Class[_] = {
      try { Class.forName(desc.getName, false, getClass.getClassLoader) }
      catch { case ex: ClassNotFoundException => super.resolveClass(desc) }
    }
  }

  lazy val ApplicationFolder: File = {
    val settingDirectory = System.getProperty("user.home") + "/.nescala"
    val settings = new java.io.File(settingDirectory)
    if (!settings.exists()) settings.mkdir()
    settings
  }

  private def resource(filename:String) = getClass.getResourceAsStream(filename)
} 
Example 13
Source File: HBasePartitioner.scala    From Backup-Repo   with Apache License 2.0
package org.apache.spark.sql.hbase

import java.io.{IOException, ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.util.{CollectionsUtils, Utils}
import org.apache.spark.{Partitioner, SparkEnv}

object HBasePartitioner {
  implicit object HBaseRawOrdering extends Ordering[HBaseRawType] {
    def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b)
  }
}

class HBasePartitioner (var splitKeys: Array[HBaseRawType]) extends Partitioner {
  import HBasePartitioner.HBaseRawOrdering

  type t = HBaseRawType

  lazy private val len = splitKeys.length

  // For a pre-split table, splitKeys(0) is the empty byte array; remove it,
  // otherwise partition 0 would always be empty and we would miss the last
  // region's data during bulk load.
  lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail

  def numPartitions = if (len == 0) 1 else len

  @transient private val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t]

  def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[t]
    var partition = 0
    if (len <= 128 && len > 0) {
      // Fewer than 128 partitions: use a naive linear search
      val ordering = implicitly[Ordering[t]]
      while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) {
        partition += 1
      }
    } else {
      // Determine which binary search method to use only once.
      partition = binarySearch(realSplitKeys, k)
      // binarySearch either returns the match location or -[insertion point]-1
      if (partition < 0) {
        partition = -partition - 1
      }
      if (partition > realSplitKeys.length) {
        partition = realSplitKeys.length
      }
    }
    partition
  }

  override def equals(other: Any): Boolean = other match {
    case r: HBasePartitioner =>
      r.splitKeys.sameElements(splitKeys)
    case _ =>
      false
  }

  override def hashCode(): Int = {
    val prime = 31
    var result = 1
    var i = 0
    while (i < splitKeys.length) {
      result = prime * result + splitKeys(i).hashCode
      i += 1
    }
    result = prime * result
    result
  }
} 
Example 14
Source File: WriSer.scala    From flint   with Apache License 2.0
package com.twosigma.flint.hadoop

import java.io.{ DataInputStream, DataOutputStream, ObjectInputStream, ObjectOutputStream }
import java.io.IOException

import scala.reflect.{ classTag, ClassTag }

import org.apache.hadoop.io.Writable

// Note: we could make this implement InputSplit, but we do not because many input splits do a
// cast to their specific InputSplit, so we do not want to risk it. Further, this currently works
// for any Writable.
case class WriSer[T <: Writable: ClassTag](@transient var get: T) extends Serializable {
  def this() = this(null.asInstanceOf[T])

  @throws(classOf[IOException])
  private def writeObject(out: ObjectOutputStream) {
    out.writeObject(classTag[T])
    get.write(new DataOutputStream(out))
  }

  @throws(classOf[IOException])
  @throws(classOf[ClassNotFoundException])
  private def readObject(in: ObjectInputStream) {
    get = in.readObject.asInstanceOf[ClassTag[T]].runtimeClass.newInstance.asInstanceOf[T]
    get.readFields(new DataInputStream(in))
  }
} 
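Because WriSer stores the ClassTag and then the Writable's own fields, any Hadoop Writable with a no-argument constructor survives plain Java serialization. Below is a hedged round-trip sketch; IntWritable and the WriSerRoundTrip object are chosen purely for illustration.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.io.IntWritable

import com.twosigma.flint.hadoop.WriSer

object WriSerRoundTrip extends App {
  val baos = new ByteArrayOutputStream()
  val oos  = new ObjectOutputStream(baos)
  oos.writeObject(WriSer(new IntWritable(42)))   // writes the ClassTag, then the Writable's fields
  oos.close()

  val ois = new ObjectInputStream(new ByteArrayInputStream(baos.toByteArray))
  val restored = ois.readObject().asInstanceOf[WriSer[IntWritable]]
  ois.close()
  println(restored.get.get())                    // prints 42
}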
Example 15
Source File: AggregatorTest.scala    From noether   with Apache License 2.0
package com.spotify.noether

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import com.twitter.algebird.Aggregator
import org.scalatest._
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

trait AggregatorTest extends AnyFlatSpec with Matchers {
  def run[A, B, C](aggregator: Aggregator[A, B, C])(as: Seq[A]): C = {
    val bs = as.map(aggregator.prepare _ compose ensureSerializable)
    val b = ensureSerializable(aggregator.reduce(bs))
    ensureSerializable(aggregator.present(b))
  }

  private def serializeToByteArray(value: Any): Array[Byte] = {
    val buffer = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(buffer)
    oos.writeObject(value)
    buffer.toByteArray
  }

  private def deserializeFromByteArray(encodedValue: Array[Byte]): AnyRef = {
    val ois = new ObjectInputStream(new ByteArrayInputStream(encodedValue))
    ois.readObject()
  }

  private def ensureSerializable[T](value: T): T =
    deserializeFromByteArray(serializeToByteArray(value)).asInstanceOf[T]
} 
Example 16
Source File: MapJoinPartitionsRDD.scala    From spark-vlbfgs   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.util.Utils

class MapJoinPartitionsPartition(
    idx: Int,
    @transient private val rdd1: RDD[_],
    @transient private val rdd2: RDD[_],
    s2IdxArr: Array[Int]) extends Partition {

  var s1 = rdd1.partitions(idx)
  var s2Arr = s2IdxArr.map(s2Idx => rdd2.partitions(s2Idx))
  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    s1 = rdd1.partitions(idx)
    s2Arr = s2IdxArr.map(s2Idx => rdd2.partitions(s2Idx))
    oos.defaultWriteObject()
  }
}

class MapJoinPartitionsRDD[A: ClassTag, B: ClassTag, V: ClassTag](
    sc: SparkContext,
    var idxF: (Int) => Array[Int],
    var f: (Int, Iterator[A], Array[(Int, Iterator[B])]) => Iterator[V],
    var rdd1: RDD[A],
    var rdd2: RDD[B])
  extends RDD[V](sc, Nil) {

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](rdd1.partitions.length)
    for (s1 <- rdd1.partitions) {
      val idx = s1.index
      array(idx) = new MapJoinPartitionsPartition(idx, rdd1, rdd2, idxF(idx))
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = List(
    new OneToOneDependency(rdd1),
    new NarrowDependency(rdd2) {
      override def getParents(partitionId: Int): Seq[Int] = {
        idxF(partitionId)
      }
    }
  )

  override def getPreferredLocations(s: Partition): Seq[String] = {
    val fp = firstParent[A]
    // println(s"pref loc: ${fp.preferredLocations(fp.partitions(s.index))}")
    fp.preferredLocations(fp.partitions(s.index))
  }

  override def compute(split: Partition, context: TaskContext): Iterator[V] = {
    val currSplit = split.asInstanceOf[MapJoinPartitionsPartition]
    f(currSplit.s1.index, rdd1.iterator(currSplit.s1, context),
      currSplit.s2Arr.map(s2 => (s2.index, rdd2.iterator(s2, context)))
    )
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
    idxF = null
    f = null
  }
} 
Example 17
Source File: MapJoinPartitionsRDDV2.scala    From spark-vlbfgs   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import org.apache.spark.serializer.Serializer
import org.apache.spark.{TaskContext, _}
import org.apache.spark.util.Utils

import scala.reflect.ClassTag

class MapJoinPartitionsPartitionV2(
    idx: Int,
    @transient private val rdd1: RDD[_],
    @transient private val rdd2: RDD[_],
    s2IdxArr: Array[Int]) extends Partition {

  var s1 = rdd1.partitions(idx)
  var s2Arr = s2IdxArr.map(s2Idx => rdd2.partitions(s2Idx))
  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    s1 = rdd1.partitions(idx)
    s2Arr = s2IdxArr.map(s2Idx => rdd2.partitions(s2Idx))
    oos.defaultWriteObject()
  }
}

class MapJoinPartitionsRDDV2[A: ClassTag, B: ClassTag, V: ClassTag](
    sc: SparkContext,
    var idxF: (Int) => Array[Int],
    var f: (Int, Iterator[A], Array[(Int, Iterator[B])]) => Iterator[V],
    var rdd1: RDD[A],
    var rdd2: RDD[B],
    preservesPartitioning: Boolean = false)
  extends RDD[V](sc, Nil) {

  var rdd2WithPid = rdd2.mapPartitionsWithIndex((pid, iter) => iter.map(x => (pid, x)))

  private val serializer: Serializer = SparkEnv.get.serializer

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](rdd1.partitions.length)
    for (s1 <- rdd1.partitions) {
      val idx = s1.index
      array(idx) = new MapJoinPartitionsPartitionV2(idx, rdd1, rdd2, idxF(idx))
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = List(
    new OneToOneDependency(rdd1),
    new ShuffleDependency[Int, B, B](
      rdd2WithPid.asInstanceOf[RDD[_ <: Product2[Int, B]]],
      new IdentityPartitioner(rdd2WithPid.getNumPartitions), serializer)
  )

  override def getPreferredLocations(s: Partition): Seq[String] = {
    val fp = firstParent[A]
    // println(s"pref loc: ${fp.preferredLocations(fp.partitions(s.index))}")
    fp.preferredLocations(fp.partitions(s.index))
  }

  override def compute(split: Partition, context: TaskContext): Iterator[V] = {
    val currSplit = split.asInstanceOf[MapJoinPartitionsPartitionV2]
    val rdd2Dep = dependencies(1).asInstanceOf[ShuffleDependency[Int, Any, Any]]
    val rdd2PartIter = currSplit.s2Arr.map(s2 => (s2.index,
      SparkEnv.get.shuffleManager
        .getReader[Int, B](rdd2Dep.shuffleHandle, s2.index, s2.index + 1, context)
        .read().map(x => x._2)
      ))
    val rdd1Iter = rdd1.iterator(currSplit.s1, context)
    f(currSplit.s1.index, rdd1Iter, rdd2PartIter)
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
    rdd2WithPid = null
    idxF = null
    f = null
  }
}

private[spark] class IdentityPartitioner(val numParts: Int) extends Partitioner {
  require(numPartitions > 0)
  override def getPartition(key: Any): Int = key.asInstanceOf[Int]
  override def numPartitions: Int = numParts
} 
Example 18
Source File: SapSQLContextSuite.scala    From HANAVora-Extensions   with Apache License 2.0
package org.apache.spark.sql

import java.io.{ByteArrayOutputStream, ObjectOutputStream}

import org.apache.spark.sql.parser.SapParserException
import org.apache.spark.util.DummyRelationUtils._
import org.mockito.Mockito
import org.scalatest.FunSuite

class SapSQLContextSuite extends FunSuite with GlobalSapSQLContext {

  test("SQL contexts do not support hive functions") {
    val rdd = sc.parallelize(Seq(Row("1"), Row("2")))
    sqlc.createDataFrame(rdd, 'a.string, needsConversion = false)
      .registerTempTable("foo")

    intercept[AnalysisException] {
      sqlc.sql("SELECT int(a) FROM foo")
    }
  }

  test ("Check Spark Version"){
     val sap_sqlc = sqlContext.asInstanceOf[CommonSapSQLContext]
     // current spark runtime version shall be supported
     sap_sqlc.checkSparkVersion(List(org.apache.spark.SPARK_VERSION))

     // runtime exception for an unsupported version
     intercept[RuntimeException]{
      sap_sqlc.checkSparkVersion(List("some.unsupported.version"))
     }
  }

  test("Slightly different versions") {
    val sap_sqlc = sqlContext.asInstanceOf[CommonSapSQLContext]
    val spy_sap_sqlc = Mockito.spy(sap_sqlc)
    Mockito.when(spy_sap_sqlc.getCurrentSparkVersion())
      .thenReturn(org.apache.spark.SPARK_VERSION + "-CDH")

    // should not throw!
    spy_sap_sqlc.checkSparkVersion(spy_sap_sqlc.supportedVersions)

    Mockito.when(spy_sap_sqlc.getCurrentSparkVersion())
      .thenReturn("something- " + org.apache.spark.SPARK_VERSION)

    // should not throw!
    spy_sap_sqlc.checkSparkVersion(spy_sap_sqlc.supportedVersions)
  }

  test("Ensure SapSQLContext stays serializable"){
    // relevant for Bug 92818
    // Remember that all class references in SapSQLContext must be serializable!
    val oos = new ObjectOutputStream(new ByteArrayOutputStream())
    oos.writeObject(sqlContext)
    oos.close()
  }

  test("Rand function") {
    sqlContext.sql(
      s"""
         |CREATE TABLE test (name varchar(20), age integer)
         |USING com.sap.spark.dstest
         |OPTIONS (
         |tableName "test"
         |)
       """.stripMargin)

    sqlContext.sql("SELECT * FROM test WHERE rand() < 0.1")
  }

  test("test version fields") {
    val sapSqlContext = sqlContext.asInstanceOf[CommonSapSQLContext]
    assert(sapSqlContext.EXTENSIONS_VERSION.isEmpty)
    assert(sapSqlContext.DATASOURCES_VERSION.isEmpty)
  }
} 
Example 19
Source File: SerializableSerializer.scala    From spark-util   with Apache License 2.0
package org.hammerlab.hadoop.kryo

import java.io.{ ObjectInputStream, ObjectOutputStream }

import com.esotericsoftware.kryo.io.{ Input, Output }
import com.esotericsoftware.kryo.{ Kryo, Serializer }


case class SerializableSerializer[T <: Serializable]()
  extends Serializer[T] {
  override def read(kryo: Kryo, input: Input, `type`: Class[T]): T =
    new ObjectInputStream(input)
      .readObject()
      .asInstanceOf[T]

  override def write(kryo: Kryo, output: Output, t: T): Unit =
    new ObjectOutputStream(output)
      .writeObject(t)
} 
Example 20
Source File: Configuration.scala    From spark-util   with Apache License 2.0
package org.hammerlab.hadoop

import java.io.{ ObjectInputStream, ObjectOutputStream }

import org.apache.hadoop.conf
import org.apache.hadoop.conf.{ Configuration ⇒ HadoopConfiguration }
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.hammerlab.hadoop.kryo.WritableSerializer
import org.hammerlab.kryo._


class Configuration(@transient var value: HadoopConfiguration)
  extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = {
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = {
    value = new HadoopConfiguration(false)
    value.readFields(in)
  }
}

object Configuration
  extends Registrar {

  def apply(loadDefaults: Boolean = true): Configuration =
    new HadoopConfiguration(loadDefaults)

  def apply(conf: HadoopConfiguration): Configuration =
    new Configuration(conf)

  implicit def wrapConfiguration(conf: HadoopConfiguration): Configuration =
    apply(conf)

  implicit def unwrapConfiguration(conf: Configuration): HadoopConfiguration =
    conf.value

  implicit def unwrapConfigurationBroadcast(confBroadcast: Broadcast[Configuration]): Configuration =
    confBroadcast.value

  implicit def sparkContextToHadoopConfiguration(sc: SparkContext): Configuration =
    sc.hadoopConfiguration

  implicit class Ops(val conf: HadoopConfiguration) extends AnyVal {
    def serializable: Configuration = conf
  }

  register(
    cls[conf.Configuration] → new WritableSerializer[conf.Configuration],
    cls[Configuration] → serializeAs[Configuration, conf.Configuration]
  )
} 
Example 21
Source File: SerializableSerializerTest.scala    From spark-util   with Apache License 2.0
package org.hammerlab.hadoop.kryo

import java.io.{ ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream }

import com.esotericsoftware.kryo.Kryo
import com.esotericsoftware.kryo.io.{ Input, Output }
import org.hammerlab.test.Suite

class SerializableSerializerTest
  extends Suite {
  test("serde") {
    val kryo = new Kryo()
    kryo.setRegistrationRequired(true)
    val baos = new ByteArrayOutputStream()
    val output = new Output(baos)

    val foo = new Foo
    foo.n = 123
    foo.s = "abc"

    intercept[IllegalArgumentException] {
      kryo.writeClassAndObject(output, foo)
    }
    .getMessage should startWith("Class is not registered: org.hammerlab.hadoop.kryo.Foo")

    kryo.register(classOf[Foo], SerializableSerializer[Foo]())

    kryo.writeClassAndObject(output, foo)

    output.close()

    val bytes = baos.toByteArray
    bytes.length should be(93)

    val bais = new ByteArrayInputStream(bytes)

    val input = new Input(bais)
    val after = kryo.readClassAndObject(input).asInstanceOf[Foo]

    after.n should be(foo.n)
    after.s should be(foo.s)
  }
}

class Foo
  extends Serializable {

  var n = 0
  var s = ""

  private def writeObject(out: ObjectOutputStream): Unit = {
    out.writeInt(n)
    out.writeUTF(s)
  }

  private def readObject(in: ObjectInputStream): Unit = {
    n = in.readInt()
    s = in.readUTF()
  }
} 
Example 22
Source File: DesignSerializationTest.scala    From airframe   with Apache License 2.0
package wvlet.airframe

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import DesignTest._
import wvlet.airspec.AirSpec

object DesignSerializationTest {
  def serialize(d: Design): Array[Byte] = {
    val b  = new ByteArrayOutputStream()
    val oo = new ObjectOutputStream(b)
    oo.writeObject(d)
    oo.close()
    b.toByteArray
  }

  def deserialize(b: Array[Byte]): Design = {
    val in  = new ByteArrayInputStream(b)
    val oi  = new ObjectInputStream(in)
    val obj = oi.readObject().asInstanceOf[Design]
    obj.asInstanceOf[Design]
  }
}


class DesignSerializationTest extends AirSpec {
  import DesignSerializationTest._

  def `be serializable`: Unit = {
    val b   = serialize(d1)
    val d1s = deserialize(b)
    d1s shouldBe (d1)
  }

  def `serialize instance binding`: Unit = {
    val d  = Design.blanc.bind[Message].toInstance(Hello("world"))
    val b  = serialize(d)
    val ds = deserialize(b)
    ds shouldBe (d)
  }
} 
Example 23
Source File: SerializationTest.scala    From airframe   with Apache License 2.0
package wvlet.log

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import wvlet.log.io.IOUtil

object SerializationTest {
  trait A extends LogSupport {
    debug("new A")
    def hello = debug("hello")
  }
}


class SerializationTest extends Spec {
  import SerializationTest._

  def `logger should be serializable`: Unit = {
    val a = new A {}
    val b = new ByteArrayOutputStream()
    IOUtil.withResource(new ObjectOutputStream(b)) { out => out.writeObject(a) }
    val ser = b.toByteArray
    IOUtil.withResource(new ObjectInputStream(new ByteArrayInputStream(ser))) { in =>
      debug("deserialization")
      val a = in.readObject().asInstanceOf[A]
      a.hello
    }
  }
} 
Example 24
Source File: ObjectStreamUtil.scala    From milan   with Apache License 2.0
package com.amazon.milan.compiler.flink.testutil

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

object ObjectStreamUtil {
  def serializeAndDeserialize[T](value: T): T = {
    val outputStream = new ByteArrayOutputStream()
    val objectOutputStream = new ObjectOutputStream(outputStream)
    objectOutputStream.writeObject(value)

    val bytes = outputStream.toByteArray
    val objectInputStream = new ObjectInputStream(new ByteArrayInputStream(bytes))
    objectInputStream.readObject().asInstanceOf[T]
  }
} 
Example 25
Source File: Inputs.scala    From perf_tester   with Apache License 2.0
package org.perftester.process

import java.io.ObjectOutputStream

sealed trait Inputs {
  def writeTo(outputStream: ObjectOutputStream) = outputStream.synchronized {
    outputStream.writeObject(this)
  }
}

case class Run(className: String, args: Seq[String]) extends Inputs

case class ScalacGlobalConfig(id: String,
                              outputDirectory: Option[String],
                              classPath: Option[Seq[String]],
                              otherParams: Option[List[String]],
                              files: Option[List[String]])
    extends Inputs {
  override def toString =
    s"ScalacGlobalConfig[$id] ${option("outputDirectory", outputDirectory)}${option(
      "classPath",
      classPath)}${option("otherParams", otherParams)}${option("files", files)}"

  def option(name: String, value: Option[Any]) = value match {
    case None    => ""
    case Some(x) => s"$name=$x"
  }
}

case class ScalacRun(id: String) extends Inputs

case class ScalacRetire(id: String) extends Inputs

case object Gc extends Inputs

case object Exit extends Inputs 
Example 26
Source File: Rule.scala    From jigg   with Apache License 2.0
package jigg.nlp.ccg.parser



import jigg.nlp.ccg.lexicon.{Category, Derivation, Point, UnaryChildPoint, BinaryChildrenPoints, AppliedRule}

import scala.collection.mutable.{HashMap, HashSet}
import java.io.{ObjectOutputStream, ObjectInputStream}

trait Rule {
  def unify(left:Category, right:Category): Option[Array[(Category, String)]]
  def raise(child:Category): Option[Array[(Category, String)]]
  def headFinder:HeadFinder
}

// rules are restricted to CFG rules extracted from the training CCGBank
case class CFGRule(val binaryRules:Map[(Int,Int), Array[(Category, String)]], // category ids -> (category, ruleType)
                   val unaryRules:Map[Int, Array[(Category, String)]],
                   override val headFinder:HeadFinder) extends Rule {
  def unify(left:Category, right:Category):Option[Array[(Category, String)]] = binaryRules.get((left.id, right.id))
  def raise(child:Category):Option[Array[(Category, String)]] = unaryRules.get(child.id)
}

object CFGRule {
  def extractRulesFromDerivations(derivations: Array[Derivation], headFinder:HeadFinder): CFGRule = {
    val binaryRules = new HashMap[(Int, Int), HashSet[(Category, String)]]
    val unaryRules = new HashMap[Int, HashSet[(Category, String)]]

    derivations.foreach { deriv =>
      deriv.foreachPoint({ point:Point => deriv.get(point) match {
        case Some(AppliedRule(UnaryChildPoint(child), ruleType)) =>
          val parents = unaryRules.getOrElseUpdate(child.category.id, new HashSet[(Category, String)])
          parents += ((point.category, ruleType))
        case Some(AppliedRule(BinaryChildrenPoints(left, right), ruleType)) =>
          val parents = binaryRules.getOrElseUpdate((left.category.id, right.category.id), new HashSet[(Category, String)])
          parents += ((point.category, ruleType))
        case _ =>
      }})
    }
    new CFGRule(binaryRules.map { case (k, v) => k -> v.toArray }.toMap,
                unaryRules.map { case (k, v) => k -> v.toArray }.toMap,
                headFinder)
  }
} 
Example 27
Source File: HadoopCheckpointStoreFactory.scala    From incubator-retired-gearpump   with Apache License 2.0
package org.apache.gearpump.streaming.hadoop

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.gearpump.streaming.hadoop.lib.HadoopUtil
import org.apache.gearpump.streaming.hadoop.lib.rotation.{FileSizeRotation, Rotation}
import org.apache.gearpump.streaming.transaction.api.{CheckpointStore, CheckpointStoreFactory}

object HadoopCheckpointStoreFactory {
  val VERSION = 1
}

class HadoopCheckpointStoreFactory(
    dir: String,
    @transient private var hadoopConfig: Configuration,
    rotation: Rotation = new FileSizeRotation(128 * Math.pow(2, 20).toLong))
  extends CheckpointStoreFactory {
  import org.apache.gearpump.streaming.hadoop.HadoopCheckpointStoreFactory._

  // Counterpart to readObject below: write the default (non-transient) fields,
  // then the Hadoop configuration itself, since Configuration is Writable but
  // not java.io.Serializable.
  private def writeObject(out: ObjectOutputStream): Unit = {
    out.defaultWriteObject()
    hadoopConfig.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = {
    in.defaultReadObject()
    hadoopConfig = new Configuration(false)
    hadoopConfig.readFields(in)
  }

  override def getCheckpointStore(name: String): CheckpointStore = {
    val dirPath = new Path(dir + Path.SEPARATOR + s"v$VERSION", name)
    val fs = HadoopUtil.getFileSystemForPath(dirPath, hadoopConfig)
    new HadoopCheckpointStore(dirPath, fs, hadoopConfig, rotation)
  }
} 
Example 28
Source File: QueueInputDStream.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.streaming.dstream

import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.{StreamingContext, Time}

private[streaming]
class QueueInputDStream[T: ClassTag](
    ssc: StreamingContext,
    val queue: Queue[RDD[T]],
    oneAtATime: Boolean,
    defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  override def start() { }

  override def stop() { }

  private def readObject(in: ObjectInputStream): Unit = {
    throw new NotSerializableException("queueStream doesn't support checkpointing. " +
      "Please don't use queueStream when checkpointing is enabled.")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        buffer ++= queue
        queue.clear()
      }
    }
    if (buffer.nonEmpty) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        Some(new UnionRDD(context.sc, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }

} 
Example 29
Source File: SerializableConfiguration.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.conf.Configuration

private[spark]
class SerializableConfiguration(@transient var value: Configuration) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new Configuration(false)
    value.readFields(in)
  }
} 
Example 30
Source File: SerializableJobConf.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.mapred.JobConf

private[spark]
class SerializableJobConf(@transient var value: JobConf) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new JobConf(false)
    value.readFields(in)
  }
} 
Example 31
Source File: SerializableBuffer.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels


private[spark]
class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
} 
Example 32
Source File: CartesianRDD.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.util.Utils

private[spark]
class CartesianPartition(
    idx: Int,
    @transient private val rdd1: RDD[_],
    @transient private val rdd2: RDD[_],
    s1Index: Int,
    s2Index: Int
  ) extends Partition {
  var s1 = rdd1.partitions(s1Index)
  var s2 = rdd2.partitions(s2Index)
  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    s1 = rdd1.partitions(s1Index)
    s2 = rdd2.partitions(s2Index)
    oos.defaultWriteObject()
  }
}

private[spark]
class CartesianRDD[T: ClassTag, U: ClassTag](
    sc: SparkContext,
    var rdd1 : RDD[T],
    var rdd2 : RDD[U])
  extends RDD[(T, U)](sc, Nil)
  with Serializable {

  val numPartitionsInRdd2 = rdd2.partitions.length

  override def getPartitions: Array[Partition] = {
    // create the cross product split
    val array = new Array[Partition](rdd1.partitions.length * rdd2.partitions.length)
    for (s1 <- rdd1.partitions; s2 <- rdd2.partitions) {
      val idx = s1.index * numPartitionsInRdd2 + s2.index
      array(idx) = new CartesianPartition(idx, rdd1, rdd2, s1.index, s2.index)
    }
    array
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    (rdd1.preferredLocations(currSplit.s1) ++ rdd2.preferredLocations(currSplit.s2)).distinct
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(T, U)] = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    for (x <- rdd1.iterator(currSplit.s1, context);
         y <- rdd2.iterator(currSplit.s2, context)) yield (x, y)
  }

  override def getDependencies: Seq[Dependency[_]] = List(
    new NarrowDependency(rdd1) {
      def getParents(id: Int): Seq[Int] = List(id / numPartitionsInRdd2)
    },
    new NarrowDependency(rdd2) {
      def getParents(id: Int): Seq[Int] = List(id % numPartitionsInRdd2)
    }
  )

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
  }
} 
Example 33
Source File: UnionRDD.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 34
Source File: PartitionerAwareUnionRDD.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils


private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.nonEmpty)
  require(rdds.forall(_.partitioner.isDefined))
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  override val partitioner = rdds.head.partitioner

  override def getPartitions: Array[Partition] = {
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map { index =>
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }.toArray
  }

  // Get the location where most of the partitions of parent RDDs are located
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    val locations = rdds.zip(parentPartitions).flatMap {
      case (rdd, part) =>
        val parentLocations = currPrefLocs(rdd, part)
        logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations)
        parentLocations
    }
    val location = if (locations.isEmpty) {
      None
    } else {
      // Find the location that maximum number of parent partitions prefer
      Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
    }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host)
  }
} 
Example 35
Source File: exercise10.scala    From scala-for-the-Impatient   with MIT License 5 votes vote down vote up
import collection.mutable.ArrayBuffer
import java.io.{ObjectInputStream, FileOutputStream, FileInputStream, ObjectOutputStream}

class Person(var name: String) extends Serializable {

  val friends = new ArrayBuffer[Person]()

  def addFriend(friend: Person): Unit = {
    friends += friend
  }

  override def toString: String =
    "My name is " + name + " and my friends are " + friends.map(_.name).mkString(", ")
}


object Test extends App {
  val p1 = new Person("Ivan")
  val p2 = new Person("F2")
  val p3 = new Person("F3")

  p1.addFriend(p2)
  p1.addFriend(p3)
  println(p1)

  val out = new ObjectOutputStream(new FileOutputStream("person.obj"))
  out.writeObject(p1)
  out.close()

  val in = new ObjectInputStream(new FileInputStream("person.obj"))
  val p = in.readObject().asInstanceOf[Person]
  println(p)
}
Example 36
Source File: FileUtils.scala    From eidos   with Apache License 2.0 5 votes vote down vote up
package org.clulab.wm.wmexchanger.utils

import java.io.BufferedInputStream
import java.io.BufferedOutputStream
import java.io.File
import java.io.FileInputStream
import java.io.FileOutputStream
import java.io.FilenameFilter
import java.io.ObjectInputStream
import java.io.ObjectOutputStream
import java.io.PrintWriter

import org.clulab.wm.wmexchanger.utils.Closer.AutoCloser

import scala.io.Source

object FileUtils {

  def appendingPrintWriterFromFile(file: File): PrintWriter = Sinker.printWriterFromFile(file, append = true)

  def appendingPrintWriterFromFile(path: String): PrintWriter = Sinker.printWriterFromFile(path, append = true)

  def printWriterFromFile(file: File): PrintWriter = Sinker.printWriterFromFile(file, append = false)

  def printWriterFromFile(path: String): PrintWriter = Sinker.printWriterFromFile(path, append = false)

  // Output
  def newBufferedOutputStream(file: File): BufferedOutputStream =
    new BufferedOutputStream(new FileOutputStream(file))

  def newBufferedOutputStream(filename: String): BufferedOutputStream =
    newBufferedOutputStream(new File(filename))

  def newAppendingBufferedOutputStream(file: File): BufferedOutputStream =
    new BufferedOutputStream(new FileOutputStream(file, true))

  def newAppendingBufferedOutputStream(filename: String): BufferedOutputStream =
    newAppendingBufferedOutputStream(new File(filename))

  def newObjectOutputStream(filename: String): ObjectOutputStream =
    new ObjectOutputStream(newBufferedOutputStream(filename))

  // Input
  def newBufferedInputStream(file: File): BufferedInputStream =
    new BufferedInputStream(new FileInputStream(file))

  def newBufferedInputStream(filename: String): BufferedInputStream =
    newBufferedInputStream(new File(filename))

  def newObjectInputStream(filename: String): ObjectInputStream =
    new ObjectInputStream(newBufferedInputStream(filename))

  def findFiles(collectionDir: String, extension: String): Seq[File] = {
    val dir = new File(collectionDir)
    val filter = new FilenameFilter {
      def accept(dir: File, name: String): Boolean = name.endsWith(extension)
    }

    val result = Option(dir.listFiles(filter))
        .getOrElse(throw Sourcer.newFileNotFoundException(collectionDir))
    result
  }

  protected def getTextFromSource(source: Source): String = source.mkString

  def getTextFromFile(file: File): String =
    Sourcer.sourceFromFile(file).autoClose { source =>
      getTextFromSource(source)
    }
} 
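For reference, a minimal usage sketch (not part of the original project) showing how the newObjectOutputStream and newObjectInputStream helpers above can be paired to round-trip a value; the file name "model.ser" and the List payload are hypothetical.

object FileUtilsSketch extends App {
  import org.clulab.wm.wmexchanger.utils.FileUtils

  // Write a serializable value through a buffered ObjectOutputStream.
  val oos = FileUtils.newObjectOutputStream("model.ser")
  try oos.writeObject(List(1, 2, 3)) finally oos.close()

  // Read it back through a buffered ObjectInputStream.
  val ois = FileUtils.newObjectInputStream("model.ser")
  try println(ois.readObject().asInstanceOf[List[Int]]) finally ois.close()
}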
Example 37
Source File: TestObjSerialization.scala    From eidos   with Apache License 2.0 5 votes vote down vote up
package org.clulab.wm.eidos.serialization.obj

import java.io.ByteArrayOutputStream
import java.io.ObjectOutputStream

import org.clulab.odin.EventMention
import org.clulab.odin.Mention
import org.clulab.odin.TextBoundMention
import org.clulab.wm.eidos.EidosSystem
import org.clulab.wm.eidos.test.TestUtils.Test
import org.clulab.wm.eidos.utils.Closer.AutoCloser
import org.clulab.wm.eidos.utils.FileUtils

class TestSerialization extends Test {
  val config = this.defaultConfig
  val reader = new EidosSystem(config)

  def serialize(original: Any, index: Int): Unit = {
    val copy = (new ByteArrayOutputStream()).autoClose { streamOut =>
      (new ObjectOutputStream(streamOut)).autoClose { encoder =>
        encoder.writeObject(original)
      }

      val bytes = streamOut.toByteArray

      FileUtils.load[Any](bytes, this)
    }

    if (original.isInstanceOf[Mention])
      require(original == copy)
  }
  
  behavior of "Standard Serializer"

  it should "serialize and deserialize mentions" in {
    val text = "Water trucking has decreased due to the cost of fuel last week." // "last week" added for time
    //val text = "Food shortages cause hunger."
    val annotatedDocument = reader.extractFromText(text)
    val mentionsOut = annotatedDocument.odinMentions

    mentionsOut.foreach {
        case eventMention: EventMention =>
          var index = 0 // test

          serialize(eventMention.labels, index)
          index += 1
          serialize(eventMention.tokenInterval, index)
          index += 1
          serialize(eventMention.trigger, index)
          index += 1
          serialize(eventMention.arguments, index)
          index += 1
          serialize(eventMention.paths, index)
          index += 1
          serialize(eventMention.sentence, index)
          index += 1
          serialize(eventMention.document, index)
          index += 1
          serialize(eventMention.keep, index)
          index += 1
          serialize(eventMention.foundBy, index)

          eventMention.attachments.foreach { attachment =>
            serialize(attachment, index); index += 1
          }

          serialize(eventMention.attachments, index)
          index += 1
          serialize(eventMention, index)
          index += 1
        case textBoundMention: TextBoundMention =>
          var index = 0 // test

          textBoundMention.attachments.foreach { attachment =>
            serialize(attachment, index); index += 1
          }

          serialize(textBoundMention.attachments, index)
          index += 1
          val hash = textBoundMention.hashCode
          val mention2 = textBoundMention.copy()
          val hash2 = mention2.hashCode
          serialize(mention2, index)
          index += 1
    }
  }
} 
Example 38
Source File: SerializableBuffer.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels


private[spark]
class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
} 
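A minimal usage sketch (not part of the original file). SerializableBuffer is private[spark], so the sketch is placed in the same package; the byte contents are hypothetical.

package org.apache.spark.util

import java.nio.ByteBuffer

object SerializableBufferSketch {
  def main(args: Array[String]): Unit = {
    // Wrap a ByteBuffer so it can pass through Java serialization.
    val wrapped = new SerializableBuffer(ByteBuffer.wrap(Array[Byte](1, 2, 3)))
    // writeObject records the buffer's limit followed by its bytes;
    // readObject allocates a fresh buffer of that size and refills it.
    println(wrapped.value.limit()) // 3
  }
}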
Example 39
Source File: CartesianRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.util.Utils

private[spark]
class CartesianPartition(
    idx: Int,
    @transient rdd1: RDD[_],
    @transient rdd2: RDD[_],
    s1Index: Int,
    s2Index: Int
  ) extends Partition {
  var s1 = rdd1.partitions(s1Index)
  var s2 = rdd2.partitions(s2Index)
  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    s1 = rdd1.partitions(s1Index)
    s2 = rdd2.partitions(s2Index)
    oos.defaultWriteObject()
  }
}

private[spark]
class CartesianRDD[T: ClassTag, U: ClassTag](
    sc: SparkContext,
    var rdd1 : RDD[T],
    var rdd2 : RDD[U])
  extends RDD[Pair[T, U]](sc, Nil)
  with Serializable {

  val numPartitionsInRdd2 = rdd2.partitions.size

  override def getPartitions: Array[Partition] = {
    // create the cross product split
    val array = new Array[Partition](rdd1.partitions.size * rdd2.partitions.size)
    for (s1 <- rdd1.partitions; s2 <- rdd2.partitions) {
      val idx = s1.index * numPartitionsInRdd2 + s2.index
      array(idx) = new CartesianPartition(idx, rdd1, rdd2, s1.index, s2.index)
    }
    array
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    (rdd1.preferredLocations(currSplit.s1) ++ rdd2.preferredLocations(currSplit.s2)).distinct
  }

  override def compute(split: Partition, context: TaskContext) = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    for (x <- rdd1.iterator(currSplit.s1, context);
         y <- rdd2.iterator(currSplit.s2, context)) yield (x, y)
  }

  override def getDependencies: Seq[Dependency[_]] = List(
    new NarrowDependency(rdd1) {
      def getParents(id: Int): Seq[Int] = List(id / numPartitionsInRdd2)
    },
    new NarrowDependency(rdd2) {
      def getParents(id: Int): Seq[Int] = List(id % numPartitionsInRdd2)
    }
  )

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
  }
} 
Example 40
Source File: UnionRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient rdd: RDD[T],
    val parentRddIndex: Int,
    @transient parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations() = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](rdds.map(_.partitions.size).sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.size)
      pos += rdd.partitions.size
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 41
Source File: PartitionerAwareUnionRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils


private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.length > 0)
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  override val partitioner = rdds.head.partitioner

  override def getPartitions: Array[Partition] = {
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map(index => {
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }).toArray
  }

  // Get the location where most of the partitions of parent RDDs are located
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    val locations = rdds.zip(parentPartitions).flatMap {
      case (rdd, part) => {
        val parentLocations = currPrefLocs(rdd, part)
        logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations)
        parentLocations
      }
    }
    val location = if (locations.isEmpty) {
      None
    } else  {
      // Find the location that maximum number of parent partitions prefer
      Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
    }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host)
  }
} 
Example 42
Source File: NotSerializableFakeTask.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.scheduler

import java.io.{ObjectInputStream, ObjectOutputStream, IOException}

import org.apache.spark.TaskContext


private[spark] class NotSerializableFakeTask(myId: Int, stageId: Int) extends Task[Array[Byte]](stageId, 0) {
  override def runTask(context: TaskContext): Array[Byte] = Array.empty[Byte]
  override def preferredLocations: Seq[TaskLocation] = Seq[TaskLocation]()

  @throws(classOf[IOException])
  private def writeObject(out: ObjectOutputStream): Unit = {
    if (stageId == 0) {
      throw new IllegalStateException("Cannot serialize")
    }
  }

  @throws(classOf[IOException])
  private def readObject(in: ObjectInputStream): Unit = {}
} 
Example 43
Source File: SerializableUtils.scala    From protobuf-generic   with Apache License 2.0 5 votes vote down vote up
package me.lyh.protobuf.generic.test

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

object SerializableUtils {
  private def serializeToByteArray(value: Serializable): Array[Byte] = {
    val buffer = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(buffer)
    oos.writeObject(value)
    buffer.toByteArray
  }

  private def deserializeFromByteArray(encodedValue: Array[Byte]): AnyRef = {
    val ois = new ObjectInputStream(new ByteArrayInputStream(encodedValue))
    ois.readObject()
  }

  def ensureSerializable[T <: Serializable](value: T): T =
    deserializeFromByteArray(serializeToByteArray(value)).asInstanceOf[T]
} 
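A minimal usage sketch (not from the original project); the Point case class is hypothetical.

object SerializableUtilsSketch {
  // Case classes are Serializable, so they satisfy the T <: Serializable bound.
  final case class Point(x: Int, y: Int)

  def main(args: Array[String]): Unit = {
    val copy = SerializableUtils.ensureSerializable(Point(1, 2))
    assert(copy == Point(1, 2)) // structural equality survives the round trip
  }
}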
Example 45
Source File: GenericReader.scala    From protobuf-generic   with Apache License 2.0 5 votes vote down vote up
package me.lyh.protobuf.generic

import java.io.{InputStream, ObjectInputStream, ObjectOutputStream, OutputStream}
import java.nio.ByteBuffer
import java.util.{ArrayList => JArrayList, LinkedHashMap => JLinkedHashMap, TreeMap => JTreeMap}

import com.google.protobuf.Descriptors.FieldDescriptor.Type
import com.google.protobuf.{CodedInputStream, WireFormat}

import scala.collection.JavaConverters._

object GenericReader {
  def of(schema: Schema): GenericReader = new GenericReader(schema)
}

class GenericReader(val schema: Schema) extends Serializable {
  def read(buf: Array[Byte]): GenericRecord =
    read(CodedInputStream.newInstance(buf), schema.root)

  def read(buf: ByteBuffer): GenericRecord =
    read(CodedInputStream.newInstance(buf), schema.root)

  def read(input: InputStream): GenericRecord =
    read(CodedInputStream.newInstance(input), schema.root)

  private def read(input: CodedInputStream, messageSchema: MessageSchema): GenericRecord = {
    val map = new JTreeMap[java.lang.Integer, Any]()
    while (!input.isAtEnd) {
      val tag = input.readTag()
      val id = WireFormat.getTagFieldNumber(tag)
      val field = messageSchema.fields(id)

      if (field.label == Label.REPEATED) {
        if (!map.containsKey(id)) {
          map.put(id, new JArrayList[Any]())
        }
        val list = map.get(id).asInstanceOf[java.util.ArrayList[Any]]
        if (field.packed) {
          val bytesIn = CodedInputStream.newInstance(input.readByteBuffer())
          while (!bytesIn.isAtEnd) {
            list.add(readValue(bytesIn, field))
          }
        } else {
          list.add(readValue(input, field))
        }
      } else {
        map.put(id, readValue(input, field))
      }
    }

    val result = new JLinkedHashMap[String, Any]()
    map.asScala.foreach(kv => result.put(messageSchema.fields(kv._1).name, kv._2))
    messageSchema.fields.valuesIterator.foreach { f =>
      if (f.default.isDefined && !result.containsKey(f.name)) {
        result.put(f.name, f.default.get)
      }
    }
    result
  }

  private def readValue(in: CodedInputStream, field: Field): Any = field.`type` match {
    case Type.FLOAT    => in.readFloat()
    case Type.DOUBLE   => in.readDouble()
    case Type.FIXED32  => in.readFixed32()
    case Type.FIXED64  => in.readFixed64()
    case Type.INT32    => in.readInt32()
    case Type.INT64    => in.readInt64()
    case Type.UINT32   => in.readUInt32()
    case Type.UINT64   => in.readUInt64()
    case Type.SFIXED32 => in.readSFixed32()
    case Type.SFIXED64 => in.readSFixed64()
    case Type.SINT32   => in.readSInt32()
    case Type.SINT64   => in.readSInt64()
    case Type.BOOL     => in.readBool()
    case Type.STRING   => in.readString()
    case Type.BYTES    => Base64.encode(in.readByteArray())
    case Type.ENUM     => schema.enums(field.schema.get).values(in.readEnum())
    case Type.MESSAGE =>
      val nestedIn = CodedInputStream.newInstance(in.readByteBuffer())
      read(nestedIn, schema.messages(field.schema.get))
    case Type.GROUP => throw new IllegalArgumentException("Unsupported type: GROUP")
  }

  private def readObject(in: ObjectInputStream): Unit = {
    val schema = Schema.fromJson(in.readUTF())

    val schemaField = getClass.getDeclaredField("schema")
    schemaField.setAccessible(true)
    schemaField.set(this, schema)
  }

  private def writeObject(out: ObjectOutputStream): Unit =
    out.writeUTF(schema.toJson)
} 
Example 46
Source File: TemplateSpec.scala    From cluster-broccoli   with Apache License 2.0 5 votes vote down vote up
package de.frosner.broccoli.models

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import org.specs2.mutable.Specification
import play.api.libs.json.Json

import Template.{templateApiWrites, templatePersistenceReads}

class TemplateSpec extends Specification {

  "A template" should {

    "extract only parameters specified in the parameters" in {
      Template("test",
               "Hallo {{id}}. I like {{person_name}}.",
               "desc",
               Map("id" -> ParameterInfo("id", None, None, None, ParameterType.Raw, None))).parameters === Set("id")
    }

    "not automatically extract parameters from a template" in {
      Template("test", "Hallo {{id}}, how is {{object}}", "desc", Map.empty).parameters === Set.empty
    }

    "create the template version correctly in" in {
      Template("test", "template JSON", "desc", Map.empty).version === "889df4c8118c30a28ed4f51674a0f19d"
    }

    "result in different template versions if the template JSON differs" in {
      Template("test", "template JSON", "desc", Map.empty).version !== Template("test",
                                                                                "template JSONs",
                                                                                "desc",
                                                                                Map.empty).version
    }

    "result in different template versions if the template parameter info differs" in {
      Template(
        id = "test",
        template = "template JSON {{id}}",
        description = "desc",
        parameterInfos = Map.empty
      ).version !== Template(
        id = "test",
        template = "template JSON {{id}}",
        description = "desc",
        parameterInfos = Map(
          "id" -> ParameterInfo("id",
                                None,
                                None,
                                secret = Some(false),
                                `type` = ParameterType.String,
                                orderIndex = None)
        )
      ).version
    }

  }

  "Template serialization" should {

    "work correctly" in {
      val originalTemplate = Template("test", "Hallo {{name}}", "desc", Map.empty)
      val bos = new ByteArrayOutputStream()
      val oos = new ObjectOutputStream(bos)
      oos.writeObject(originalTemplate)
      oos.close()

      val ois = new ObjectInputStream(new ByteArrayInputStream(bos.toByteArray))
      val deserializedTemplate = ois.readObject()
      ois.close()

      originalTemplate === deserializedTemplate
    }

  }

  "Template back-end JSON serialization" should {

    "work" in {
      val template = Template(
        id = "t",
        template = "{{id}}",
        description = "d",
        parameterInfos = Map.empty
      )
      Json
        .fromJson(Json.toJson(template)(Template.templatePersistenceWrites))(Template.templatePersistenceReads)
        .get === template
    }

  }

} 
Example 47
Source File: JavaSerializationBenchmark.scala    From scala-commons   with MIT License 5 votes vote down vote up
package com.avsystem.commons
package rpc.akka.serialization

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import org.openjdk.jmh.annotations.{Benchmark, BenchmarkMode, Fork, Measurement, Mode, Scope, State, Warmup}
import org.openjdk.jmh.infra.Blackhole


@Warmup(iterations = 5)
@Measurement(iterations = 20)
@Fork(1)
@BenchmarkMode(Array(Mode.Throughput))
@State(Scope.Thread)
class JavaSerializationBenchmark {

  val something = Something(42, Nested(4 :: 8 :: 15 :: 16 :: 23 :: 42 :: Nil, 0), "lol")
  val array = {
    val baos = new ByteArrayOutputStream()
    val o = new ObjectOutputStream(baos)

    o.writeObject(something)
    o.close()

    baos.toByteArray
  }

  @Benchmark
  def byteStringOutput(): Something = {
    val baos = new ByteArrayOutputStream()
    val o = new ObjectOutputStream(baos)

    o.writeObject(something)
    o.close()

    val array = baos.toByteArray

    new ObjectInputStream(new ByteArrayInputStream(array)).readObject().asInstanceOf[Something]
  }

  @Benchmark
  def writeTest(): Array[Byte] = {
    val baos = new ByteArrayOutputStream()
    val o = new ObjectOutputStream(baos)

    o.writeObject(something)
    o.close()

    baos.toByteArray
  }

  @Benchmark
  def readTest(): Something = {
    new ObjectInputStream(new ByteArrayInputStream(array)).readObject().asInstanceOf[Something]
  }
} 
Example 48
Source File: MessageSerializationSuite.scala    From bahir   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.pubnub

import java.io.ByteArrayInputStream
import java.io.ByteArrayOutputStream
import java.io.ObjectInputStream
import java.io.ObjectOutputStream

import com.google.gson.JsonParser
import com.pubnub.api.models.consumer.pubsub.PNMessageResult

import org.apache.spark.SparkFunSuite

class MessageSerializationSuite extends SparkFunSuite {
  test("Full example") {
    checkMessageSerialization(
      "{\"message\":\"Hello, World!\"}", "channel1",
      "publisher1", "subscription1", System.currentTimeMillis * 10000
    )
  }

  test("Message from channel") {
    checkMessageSerialization("{\"message\":\"Hello, World!\"}", "c", "p", null, 13534398158620385L)
  }

  test("Message from subscription") {
    checkMessageSerialization("{\"message\":\"Hello, World!\"}", null, "p", "s", 13534397812467596L)
  }

  def checkMessageSerialization(payload: String, channel: String,
      publisher: String, subscription: String, timestamp: Long): Unit = {
    val builder = PNMessageResult.builder
      .message(if (payload != null) new JsonParser().parse(payload) else null)
      .channel(channel)
      .publisher(publisher)
      .subscription(subscription)
      .timetoken(timestamp)
    val pubNubMessage = builder.build()
    val sparkMessage = new SparkPubNubMessage
    sparkMessage.message = pubNubMessage

    // serializer
    val byteOutStream = new ByteArrayOutputStream
    val outputStream = new ObjectOutputStream(byteOutStream)
    outputStream.writeObject(sparkMessage)
    outputStream.flush()
    outputStream.close()
    byteOutStream.close()
    val serializedBytes = byteOutStream.toByteArray

    // deserialize
    val byteInStream = new ByteArrayInputStream(serializedBytes)
    val inputStream = new ObjectInputStream(byteInStream)
    val deserializedMessage = inputStream.readObject().asInstanceOf[SparkPubNubMessage]
    inputStream.close()
    byteInStream.close()

    assert(payload.equals(deserializedMessage.getPayload))
    if (channel != null) {
      assert(channel.equals(deserializedMessage.getChannel))
    } else {
      assert(deserializedMessage.getChannel == null)
    }
    if (subscription != null) {
      assert(subscription.equals(deserializedMessage.getSubscription))
    } else {
      assert(deserializedMessage.getSubscription == null)
    }
    assert(publisher.equals(deserializedMessage.getPublisher))
    val unixTimestamp = Math.ceil(timestamp / 10000).longValue()
    assert(unixTimestamp.equals(deserializedMessage.getTimestamp))
  }
} 
Example 49
Source File: TestSerializationAndLazy.scala    From incubator-daffodil   with Apache License 2.0 5 votes vote down vote up
package org.apache.daffodil.util

import org.junit.Assert._
import java.io.ByteArrayOutputStream
import java.io.ObjectOutputStream
import java.io.ByteArrayInputStream
import java.io.ObjectInputStream
import org.junit.Test

class ToSerialize extends Serializable {

  val v = 5
  var lazyValWasEvaluated = false
  lazy val x = {
    // println("v is " + v)
    lazyValWasEvaluated = true
    2 * v
  }

}


class TestSerializationAndLazy {

  @Test
  def testSerializeBeforeLazyEval(): Unit = {
    val instance = new ToSerialize
    val baos = new ByteArrayOutputStream
    val stream = new ObjectOutputStream(baos)
    stream.writeObject(instance)
    stream.flush()
    stream.close()
    assertFalse(instance.lazyValWasEvaluated)
    val ba = baos.toByteArray()
    val bais = new ByteArrayInputStream(ba)
    val istream = new ObjectInputStream(bais)
    val restoredInstance = istream.readObject()
    istream.close()
    assertTrue(restoredInstance.isInstanceOf[ToSerialize])
    val ts = restoredInstance.asInstanceOf[ToSerialize]
    assertFalse(ts.lazyValWasEvaluated)
    ts.x
    assertTrue(ts.lazyValWasEvaluated)
  }

} 
Example 50
Source File: TMNodesWriter.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.nlp.annotators.btm

import java.io.{ByteArrayOutputStream, ObjectOutputStream}

import com.johnsnowlabs.storage.{RocksDBConnection, StorageBatchWriter}

class TMNodesWriter(
                    override protected val connection: RocksDBConnection
                  ) extends StorageBatchWriter[TrieNode] {

  def toBytes(content: TrieNode): Array[Byte] = {
    val stream: ByteArrayOutputStream = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(stream)
    oos.writeObject(content)
    oos.close()
    stream.toByteArray
  }

  def add(word: Int, value: TrieNode): Unit = {
    super.add(word.toString, value)
  }

  override protected def writeBufferSize: Int = 10000
} 
Example 51
Source File: SerializableHadoopConfiguration.scala    From hail   with MIT License 5 votes vote down vote up
package is.hail.utils

import java.io.{ObjectInputStream, ObjectOutputStream, Serializable}

import org.apache.hadoop

class SerializableHadoopConfiguration(@transient var value: hadoop.conf.Configuration) extends Serializable {
  private def writeObject(out: ObjectOutputStream) {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream) {
    value = new hadoop.conf.Configuration(false)
    value.readFields(in)
  }
} 
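A minimal usage sketch (not from the original file) of wrapping a Hadoop Configuration so it can be captured by serializable closures; the configuration key and value are hypothetical.

import org.apache.hadoop.conf.Configuration
import is.hail.utils.SerializableHadoopConfiguration

object SerializableHadoopConfigurationSketch {
  def main(args: Array[String]): Unit = {
    val conf = new Configuration()
    conf.set("fs.defaultFS", "hdfs://namenode:8020") // hypothetical setting
    val wrapped = new SerializableHadoopConfiguration(conf)
    // wrapped can be written with an ObjectOutputStream; on readObject the
    // Configuration is rebuilt via readFields, so wrapped.value is usable again.
  }
}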
Example 52
Source File: SpillingCollectIterator.scala    From hail   with MIT License 5 votes vote down vote up
package is.hail.utils

import java.io.{ObjectInputStream, ObjectOutputStream}

import is.hail.backend.spark.SparkBackend
import is.hail.expr.ir.ExecuteContext
import is.hail.io.fs.FS
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag
import scala.reflect.classTag

object SpillingCollectIterator {
  def apply[T: ClassTag](localTmpdir: String, fs: FS, rdd: RDD[T], sizeLimit: Int): SpillingCollectIterator[T] = {
    val nPartitions = rdd.partitions.length
    val x = new SpillingCollectIterator(localTmpdir, fs, nPartitions, sizeLimit)
    val ctc = classTag[T]
    SparkBackend.sparkContext("SpillingCollectIterator.apply").runJob(
      rdd,
      (_, it: Iterator[T]) => it.toArray(ctc),
      0 until nPartitions,
      x.append _)
    x
  }
}

class SpillingCollectIterator[T: ClassTag] private (localTmpdir: String, fs: FS, nPartitions: Int, sizeLimit: Int) extends Iterator[T] {
  private[this] val files: Array[(String, Long)] = new Array(nPartitions)
  private[this] val buf: Array[Array[T]] = new Array(nPartitions)
  private[this] var _size: Long = 0L
  private[this] var i: Int = 0
  private[this] var it: Iterator[T] = null

  private def append(partition: Int, a: Array[T]): Unit = synchronized {
    assert(buf(partition) == null)
    buf(partition) = a
    _size += a.length
    if (_size > sizeLimit) {
      val file = ExecuteContext.createTmpPathNoCleanup(localTmpdir, s"spilling-collect-iterator-$partition")
      using(fs.createNoCompression(file)) { os =>
        var k = 0
        while (k < buf.length) {
          val vals = buf(k)
          if (vals != null) {
            buf(k) = null
            val pos = os.getPosition
            val oos = new ObjectOutputStream(os)
            oos.writeInt(vals.length)
            var j = 0
            while (j < vals.length) {
              oos.writeObject(vals(j))
              j += 1
            }
            files(k) = (file, pos)
            oos.flush()
          }
          k += 1
        }
      }
      _size = 0
    }
  }

  def hasNext: Boolean = {
    if (it == null || !it.hasNext) {
      if (i >= files.length) {
        it = null
        return false
      } else if (files(i) == null) {
        assert(buf(i) != null)
        it = buf(i).iterator
        buf(i) = null
      } else {
        val (filename, pos) = files(i)
        using(fs.openNoCompression(filename)) { is =>
          is.seek(pos)
          using(new ObjectInputStream(is)) { ois =>
            val length = ois.readInt()
            val arr = new Array[T](length)
            var j = 0
            while (j < length) {
              arr(j) = ois.readObject().asInstanceOf[T]
              j += 1
            }
            it = arr.iterator
          }
        }
      }
      i += 1
    }
    it.hasNext
  }

  def next: T = {
    hasNext
    it.next
  }
} 
Example 53
Source File: QueueInputDStream.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.dstream

import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.{StreamingContext, Time}

private[streaming]
class QueueInputDStream[T: ClassTag](
    ssc: StreamingContext,
    val queue: Queue[RDD[T]],
    oneAtATime: Boolean,
    defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  override def start() { }

  override def stop() { }

  private def readObject(in: ObjectInputStream): Unit = {
    throw new NotSerializableException("queueStream doesn't support checkpointing. " +
      "Please don't use queueStream when checkpointing is enabled.")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        buffer ++= queue
        queue.clear()
      }
    }
    if (buffer.nonEmpty) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        Some(new UnionRDD(context.sc, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }

} 
Example 54
Source File: SerializableConfiguration.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.conf.Configuration

private[spark]
class SerializableConfiguration(@transient var value: Configuration) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new Configuration(false)
    value.readFields(in)
  }
} 
Example 55
Source File: SerializableJobConf.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.mapred.JobConf

private[spark]
class SerializableJobConf(@transient var value: JobConf) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new JobConf(false)
    value.readFields(in)
  }
} 
Example 56
Source File: SerializableBuffer.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels


private[spark]
class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
} 
Example 57
Source File: CartesianRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.util.Utils

private[spark]
class CartesianPartition(
    idx: Int,
    @transient private val rdd1: RDD[_],
    @transient private val rdd2: RDD[_],
    s1Index: Int,
    s2Index: Int
  ) extends Partition {
  var s1 = rdd1.partitions(s1Index)
  var s2 = rdd2.partitions(s2Index)
  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    s1 = rdd1.partitions(s1Index)
    s2 = rdd2.partitions(s2Index)
    oos.defaultWriteObject()
  }
}

private[spark]
class CartesianRDD[T: ClassTag, U: ClassTag](
    sc: SparkContext,
    var rdd1 : RDD[T],
    var rdd2 : RDD[U])
  extends RDD[(T, U)](sc, Nil)
  with Serializable {

  val numPartitionsInRdd2 = rdd2.partitions.length

  override def getPartitions: Array[Partition] = {
    // create the cross product split
    val array = new Array[Partition](rdd1.partitions.length * rdd2.partitions.length)
    for (s1 <- rdd1.partitions; s2 <- rdd2.partitions) {
      val idx = s1.index * numPartitionsInRdd2 + s2.index
      array(idx) = new CartesianPartition(idx, rdd1, rdd2, s1.index, s2.index)
    }
    array
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    (rdd1.preferredLocations(currSplit.s1) ++ rdd2.preferredLocations(currSplit.s2)).distinct
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(T, U)] = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    for (x <- rdd1.iterator(currSplit.s1, context);
         y <- rdd2.iterator(currSplit.s2, context)) yield (x, y)
  }

  override def getDependencies: Seq[Dependency[_]] = List(
    new NarrowDependency(rdd1) {
      def getParents(id: Int): Seq[Int] = List(id / numPartitionsInRdd2)
    },
    new NarrowDependency(rdd2) {
      def getParents(id: Int): Seq[Int] = List(id % numPartitionsInRdd2)
    }
  )

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
  }
} 
Example 58
Source File: UnionRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 59
Source File: PartitionerAwareUnionRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils


private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.nonEmpty)
  require(rdds.forall(_.partitioner.isDefined))
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  override val partitioner = rdds.head.partitioner

  override def getPartitions: Array[Partition] = {
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map { index =>
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }.toArray
  }

  // Get the location where most of the partitions of parent RDDs are located
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    val locations = rdds.zip(parentPartitions).flatMap {
      case (rdd, part) =>
        val parentLocations = currPrefLocs(rdd, part)
        logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations)
        parentLocations
    }
    val location = if (locations.isEmpty) {
      None
    } else {
      // Find the location that maximum number of parent partitions prefer
      Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
    }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host)
  }
} 
Example 60
Source File: MqttConfig.scala    From akka-iot-mqtt-v2   with GNU Lesser General Public License v3.0 5 votes vote down vote up
package akkaiot

import scala.concurrent.duration._

import java.io.Serializable
import java.io.ByteArrayInputStream
import java.io.ByteArrayOutputStream
import java.io.ObjectInputStream
import java.io.ObjectOutputStream

import com.sandinh.paho.akka._
import com.sandinh.paho.akka.MqttPubSub._

object MqttConfig {
  val topic = "akka-iot-mqtt-topic"

  // Pub-Sub config
  val psConfig = PSConfig(
    brokerUrl = "tcp://test.mosquitto.org:1883",
    userName = null,
    password = null,
    stashTimeToLive = 1.minute,
    stashCapacity = 8000,
    reconnectDelayMin = 10.millis,
    reconnectDelayMax = 30.seconds,
    cleanSession = false
  )

  // Serialize object to byte array
  def writeToByteArray(obj: Any): Array[Byte] = {
    val baos = new ByteArrayOutputStream
    val oos = new ObjectOutputStream(baos)
    try {
      oos.writeObject(obj)
      baos.toByteArray
    } finally {
      try {
        oos.close
      } catch {
        case _: Throwable => // Do nothing
      }
    }
  }

  // Deserialize object from byte array
  def readFromByteArray[A](bytes: Array[Byte]): A = {
    val bais = new ByteArrayInputStream(bytes)
    val ois = new ObjectInputStream(bais)
    try {
      val obj = ois.readObject
      obj.asInstanceOf[A]
    } finally {
      try {
        ois.close
      } catch {
        case _: Throwable => // Do nothing
      }
    }
  }
} 
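A minimal round-trip sketch (not part of the original file) for the writeToByteArray and readFromByteArray helpers above; the WorkResult payload is hypothetical.

import akkaiot.MqttConfig

object MqttConfigSketch {
  final case class WorkResult(deviceId: String, value: Double)

  def main(args: Array[String]): Unit = {
    // Serialize to bytes (e.g. before publishing over MQTT) and back.
    val bytes = MqttConfig.writeToByteArray(WorkResult("device-1", 42.0))
    val restored = MqttConfig.readFromByteArray[WorkResult](bytes)
    assert(restored == WorkResult("device-1", 42.0))
  }
}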
Example 61
Source File: QueueInputDStream.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.dstream

import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.{Time, StreamingContext}

private[streaming]
class QueueInputDStream[T: ClassTag](
    @transient ssc: StreamingContext,
    val queue: Queue[RDD[T]],
    oneAtATime: Boolean,
    defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  override def start() { }

  override def stop() { }

  private def readObject(in: ObjectInputStream): Unit = {
    throw new NotSerializableException("queueStream doesn't support checkpointing. " +
      "Please don't use queueStream when checkpointing is enabled.")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    if (oneAtATime && queue.size > 0) {
      buffer += queue.dequeue()
    } else {
      buffer ++= queue.dequeueAll(_ => true)
    }
    if (buffer.size > 0) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        Some(new UnionRDD(ssc.sc, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      None
    }
  }

} 
Example 62
Source File: SerializableBuffer.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels


private[spark]
class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
} 
Example 63
Source File: CartesianRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.util.Utils

private[spark]
class CartesianPartition(
    idx: Int,
    @transient rdd1: RDD[_],
    @transient rdd2: RDD[_],
    s1Index: Int,
    s2Index: Int
  ) extends Partition {
  var s1 = rdd1.partitions(s1Index)
  var s2 = rdd2.partitions(s2Index)
  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    s1 = rdd1.partitions(s1Index)
    s2 = rdd2.partitions(s2Index)
    oos.defaultWriteObject()
  }
}

private[spark]
class CartesianRDD[T: ClassTag, U: ClassTag](
    sc: SparkContext,
    var rdd1 : RDD[T],
    var rdd2 : RDD[U])
  extends RDD[Pair[T, U]](sc, Nil)
  with Serializable {

  val numPartitionsInRdd2 = rdd2.partitions.length

  override def getPartitions: Array[Partition] = {
    // create the cross product split
    val array = new Array[Partition](rdd1.partitions.length * rdd2.partitions.length)
    for (s1 <- rdd1.partitions; s2 <- rdd2.partitions) {
      val idx = s1.index * numPartitionsInRdd2 + s2.index
      array(idx) = new CartesianPartition(idx, rdd1, rdd2, s1.index, s2.index)
    }
    array
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    (rdd1.preferredLocations(currSplit.s1) ++ rdd2.preferredLocations(currSplit.s2)).distinct
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(T, U)] = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    for (x <- rdd1.iterator(currSplit.s1, context);
         y <- rdd2.iterator(currSplit.s2, context)) yield (x, y)
  }

  override def getDependencies: Seq[Dependency[_]] = List(
    new NarrowDependency(rdd1) {
      def getParents(id: Int): Seq[Int] = List(id / numPartitionsInRdd2)
    },
    new NarrowDependency(rdd2) {
      def getParents(id: Int): Seq[Int] = List(id % numPartitionsInRdd2)
    }
  )

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
  }
} 
Example 64
Source File: UnionRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient rdd: RDD[T],
    val parentRddIndex: Int,
    @transient parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](rdds.map(_.partitions.length).sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 65
Source File: PartitionerAwareUnionRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils


private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.length > 0)
  require(rdds.forall(_.partitioner.isDefined))
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  override val partitioner = rdds.head.partitioner

  override def getPartitions: Array[Partition] = {
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map(index => {
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }).toArray
  }

  // Get the location where most of the partitions of parent RDDs are located
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    val locations = rdds.zip(parentPartitions).flatMap {
      case (rdd, part) => {
        val parentLocations = currPrefLocs(rdd, part)
        logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations)
        parentLocations
      }
    }
    val location = if (locations.isEmpty) {
      None
    } else {
      // Find the location that maximum number of parent partitions prefer
      Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
    }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host)
  }
} 
Example 66
Source File: Serialization.scala    From seahorse   with Apache License 2.0 5 votes vote down vote up
package ai.deepsense.commons.serialization

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

trait Serialization {

  def deserialize[T](bytes: Array[Byte]): T = {
    val bufferIn = new ByteArrayInputStream(bytes)
    val streamIn = new ObjectInputStream(bufferIn)
    try {
      streamIn.readObject().asInstanceOf[T]
    } finally {
      streamIn.close()
    }
  }

  def serialize[T](objectToSerialize: T): Array[Byte] = {
    val byteArrayOutputStream: ByteArrayOutputStream = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(byteArrayOutputStream)
    try {
      oos.writeObject(objectToSerialize)
      oos.flush()
      byteArrayOutputStream.toByteArray
    } finally {
      oos.close()
    }
  }

  def serializeDeserialize[T](obj: T): T = deserialize[T](serialize[T](obj))
}

object Serialization extends Serialization 
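A usage sketch for the trait above; User is a hypothetical case class (case classes are java.io.Serializable), and Serialization is the companion object defined in the example:

import ai.deepsense.commons.serialization.Serialization

case class User(name: String, age: Int)

object SerializationDemo extends App {
  val bytes = Serialization.serialize(User("ada", 36))
  val copy  = Serialization.deserialize[User](bytes)
  assert(copy == User("ada", 36))   // full Java-serialization round trip
}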
Example 67
Source File: JavaSerde.scala    From affinity   with Apache License 2.0 5 votes vote down vote up
package io.amient.affinity.core.serde

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectOutputStream}

import akka.actor.ExtendedActorSystem
import akka.serialization.JavaSerializer
import akka.util.ClassLoaderObjectInputStream

class JavaSerde(system: ExtendedActorSystem) extends Serde[AnyRef] {

  override def identifier: Int = 101

  override def close(): Unit = ()

  override def fromBytes(bytes: Array[Byte]): AnyRef = {
    val in = new ClassLoaderObjectInputStream(system.dynamicAccess.classLoader, new ByteArrayInputStream(bytes))
    val obj = JavaSerializer.currentSystem.withValue(system) { in.readObject }
    in.close()
    obj
  }

  override def toBytes(o: AnyRef): Array[Byte] = {
    val bos = new ByteArrayOutputStream
    val out = new ObjectOutputStream(bos)
    JavaSerializer.currentSystem.withValue(system) { out.writeObject(o) }
    out.close()
    bos.toByteArray
  }

} 
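A sketch of exercising the serde above on its own. It assumes only the constructor shown (an ExtendedActorSystem); the cast works because systems created via ActorSystem(...) are ExtendedActorSystem instances at runtime:

import akka.actor.{ActorSystem, ExtendedActorSystem}
import io.amient.affinity.core.serde.JavaSerde

object JavaSerdeDemo extends App {
  val system = ActorSystem("serde-demo").asInstanceOf[ExtendedActorSystem]
  val serde  = new JavaSerde(system)

  val bytes = serde.toBytes(Vector(1, 2, 3))   // plain Java serialization of a Serializable value
  println(serde.fromBytes(bytes))              // Vector(1, 2, 3)

  system.terminate()
}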
Example 68
Source File: QueueInputDStream.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.dstream

import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.{Time, StreamingContext}

private[streaming]
class QueueInputDStream[T: ClassTag](
    @transient ssc: StreamingContext,
    val queue: Queue[RDD[T]],
    oneAtATime: Boolean,
    defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  override def start() { }

  override def stop() { }

  private def readObject(in: ObjectInputStream): Unit = {
    throw new NotSerializableException("queueStream doesn't support checkpointing. " +
      "Please don't use queueStream when checkpointing is enabled.")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    if (oneAtATime && queue.size > 0) {
      buffer += queue.dequeue()
    } else {
      buffer ++= queue.dequeueAll(_ => true)
    }
    if (buffer.size > 0) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        Some(new UnionRDD(ssc.sc, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      None
    }
  }

} 
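QueueInputDStream is private[streaming]; applications create it through StreamingContext.queueStream. A minimal sketch, assuming a local master and illustrative names:

import scala.collection.mutable.Queue

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

object QueueStreamDemo {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setMaster("local[2]").setAppName("queue-demo"), Seconds(1))
    val queue = Queue[RDD[Int]]()
    // queueStream wraps the queue in the QueueInputDStream shown above;
    // oneAtATime = true dequeues at most one RDD per batch interval.
    val stream = ssc.queueStream(queue, oneAtATime = true)
    stream.count().print()

    ssc.start()
    queue += ssc.sparkContext.parallelize(1 to 100)
    ssc.awaitTerminationOrTimeout(5000)
    ssc.stop()
  }
}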
Example 69
Source File: SerializableConfiguration.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.conf.Configuration

private[spark]
class SerializableConfiguration(@transient var value: Configuration) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new Configuration(false)
    value.readFields(in)
  }
} 
Example 70
Source File: SerializableJobConf.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.mapred.JobConf

private[spark]
class SerializableJobConf(@transient var value: JobConf) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new JobConf(false)
    value.readFields(in)
  }
} 
Example 71
Source File: SerializableBuffer.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels


private[spark]
class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    // ByteBuffer.allocate: a buffer must be allocated with the static allocate() method before it can be read or written
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
} 
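SerializableBuffer itself is private[spark], but the technique it relies on (a length prefix followed by channel reads and writes against the object streams) can be sketched standalone. An illustrative round trip, not Spark API:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels

object BufferRoundTrip extends App {
  val src = ByteBuffer.wrap("hello".getBytes("UTF-8"))

  // Write: length prefix, then the buffer contents through a channel (as in writeObject above).
  val bos = new ByteArrayOutputStream()
  val oos = new ObjectOutputStream(bos)
  oos.writeInt(src.limit())
  Channels.newChannel(oos).write(src)
  oos.flush()

  // Read: allocate from the prefix, then fill from the channel until full (as in readObject above).
  val ois = new ObjectInputStream(new ByteArrayInputStream(bos.toByteArray))
  val dst = ByteBuffer.allocate(ois.readInt())
  val channel = Channels.newChannel(ois)
  while (dst.hasRemaining) {
    if (channel.read(dst) == -1) sys.error("stream ended before the buffer was filled")
  }
  dst.rewind()
  println(new String(dst.array(), "UTF-8"))   // hello
}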
Example 72
Source File: CartesianRDD.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.util.Utils

private[spark]
class CartesianPartition(
    idx: Int,
    @transient rdd1: RDD[_],
    @transient rdd2: RDD[_],
    s1Index: Int,
    s2Index: Int
  ) extends Partition {
  var s1 = rdd1.partitions(s1Index)
  var s2 = rdd2.partitions(s2Index)
  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    s1 = rdd1.partitions(s1Index)
    s2 = rdd2.partitions(s2Index)
    oos.defaultWriteObject()
  }
}

private[spark]
class CartesianRDD[T: ClassTag, U: ClassTag](
    sc: SparkContext,
    var rdd1 : RDD[T],
    var rdd2 : RDD[U])
  extends RDD[Pair[T, U]](sc, Nil)
  with Serializable {

  val numPartitionsInRdd2 = rdd2.partitions.length

  override def getPartitions: Array[Partition] = {
    // create the cross product split
    val array = new Array[Partition](rdd1.partitions.length * rdd2.partitions.length)
    for (s1 <- rdd1.partitions; s2 <- rdd2.partitions) {
      val idx = s1.index * numPartitionsInRdd2 + s2.index
      array(idx) = new CartesianPartition(idx, rdd1, rdd2, s1.index, s2.index)
    }
    array
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    (rdd1.preferredLocations(currSplit.s1) ++ rdd2.preferredLocations(currSplit.s2)).distinct
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(T, U)] = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    for (x <- rdd1.iterator(currSplit.s1, context);
         y <- rdd2.iterator(currSplit.s2, context)) yield (x, y)
  }

  override def getDependencies: Seq[Dependency[_]] = List(
    new NarrowDependency(rdd1) {
      def getParents(id: Int): Seq[Int] = List(id / numPartitionsInRdd2)
    },
    new NarrowDependency(rdd2) {
      def getParents(id: Int): Seq[Int] = List(id % numPartitionsInRdd2)
    }
  )

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
  }
} 
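As with UnionRDD, applications reach CartesianRDD through the RDD API rather than directly. A minimal sketch using RDD.cartesian on a local context; names are illustrative:

import org.apache.spark.{SparkConf, SparkContext}

object CartesianDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("cartesian-demo"))
    val letters = sc.parallelize(Seq("a", "b"), 2)
    val numbers = sc.parallelize(Seq(1, 2, 3), 3)
    // RDD.cartesian builds the CartesianRDD above: 2 * 3 = 6 partitions, one per pair of parent partitions.
    val pairs = letters.cartesian(numbers)
    println(pairs.partitions.length)          // 6
    println(pairs.collect().mkString(", "))   // (a,1), (a,2), (a,3), (b,1), ...
    sc.stop()
  }
}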
Example 73
Source File: UnionRDD.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient rdd: RDD[T],
    val parentRddIndex: Int,
    @transient parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](rdds.map(_.partitions.length).sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 74
Source File: Github415.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s.github

import java.io.{FileOutputStream, ObjectOutputStream}

import com.sksamuel.avro4s.Encoder
import com.sksamuel.avro4s.github.Github415.PlaybackSession
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.must.Matchers

class Github415 extends AnyFunSuite with Matchers {

  test("github 415") {
    val fileOut = new FileOutputStream("remove_me")
    val out = new ObjectOutputStream(fileOut)
    out.writeObject(Encoder[PlaybackSession])
  }
}

object Github415 {
  object Rebuffers {
    case class Metrics(count: Int)
    case class EarlyLate(early: Metrics)
    case class Stats(session: Option[EarlyLate])
  }

  case class Rebuffers(network: Option[Rebuffers.Stats])

  case class PlaybackSession(rebuffers: Option[Rebuffers])
} 
Example 75
Source File: GithubIssue485.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s.github

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import com.sksamuel.avro4s.record.decoder.CPWrapper
import com.sksamuel.avro4s.{AvroSchema, Decoder, DefaultFieldMapper}
import org.apache.avro.generic.GenericData
import org.apache.avro.util.Utf8
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers
import shapeless.Coproduct

class GithubIssue485 extends AnyFunSuite with Matchers {

  test("Serializable Coproduct Decoder #485") {
    val baos = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(baos)
    oos.writeObject(Decoder[CPWrapper])
    oos.close()

    val decoder =
      new ObjectInputStream(new ByteArrayInputStream(baos.toByteArray)).readObject().asInstanceOf[Decoder[CPWrapper]]

    val schema = AvroSchema[CPWrapper]
    val record = new GenericData.Record(schema)
    record.put("u", new Utf8("wibble"))
    decoder.decode(record) shouldBe CPWrapper(Coproduct[CPWrapper.ISBG]("wibble"))
  }
} 
Example 76
Source File: GithubIssue484.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s.github

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import com.sksamuel.avro4s.record.decoder.ScalaEnumClass
import com.sksamuel.avro4s.schema.Colours
import com.sksamuel.avro4s.{AvroSchema, Decoder, DefaultFieldMapper}
import org.apache.avro.generic.GenericData
import org.apache.avro.generic.GenericData.EnumSymbol
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

class GithubIssue484 extends AnyFunSuite with Matchers {

  test("Serializable Scala Enum Decoder #484") {
    val baos = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(baos)
    oos.writeObject(Decoder[ScalaEnumClass])
    oos.close()

    val decoder = new ObjectInputStream(new ByteArrayInputStream(baos.toByteArray))
      .readObject()
      .asInstanceOf[Decoder[ScalaEnumClass]]

    val schema = AvroSchema[ScalaEnumClass]
    val record = new GenericData.Record(schema)
    record.put("colour", new EnumSymbol(schema.getField("colour").schema(), "Green"))
    decoder.decode(record) shouldBe ScalaEnumClass(Colours.Green)
  }
} 
Example 77
Source File: GithubIssue432.scala    From avro4s   with Apache License 2.0 5 votes vote down vote up
package com.sksamuel.avro4s.github

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

import com.sksamuel.avro4s.Encoder
import org.scalatest.{FunSuite, Matchers}

class GithubIssue432 extends FunSuite with Matchers {

  test("Serializable Encoder[BigDecimal] #432") {
    val oos = new ObjectOutputStream(new ByteArrayOutputStream())
    oos.writeObject(Encoder.bigDecimalEncoder)
    oos.close()
  }

  test("Deserialized Encoder[BigDecimal] works") {
    val baos = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(baos)
    oos.writeObject(Encoder.bigDecimalEncoder)
    oos.close()

    val ois = new ObjectInputStream(new ByteArrayInputStream(baos.toByteArray))
    val encoder = ois.readObject().asInstanceOf[Encoder[BigDecimal]]

    encoder.encode(12.34)
  }
} 
Example 78
Source File: JsDataSpec.scala    From mist   with Apache License 2.0 5 votes vote down vote up
package mist.api.data

import java.io.{ByteArrayOutputStream, ObjectOutputStream}
import java.util
import mist.api.encoding.defaultEncoders._
import mist.api.encoding.JsSyntax._

import org.scalatest._
import org.scalatest.prop.TableDrivenPropertyChecks._

class JsDataSpec extends FunSpec with Matchers {
  import java.{lang => jl, util => ju}
  val rawToData = Table(
    ("raw", "data"),
    (1, JsNumber(1)),
    ("str", JsString("str")),
    (1.2, JsNumber(1.2)),
    (List(1, 2), JsList(Seq(JsNumber(1), JsNumber(2)))),
    (Array(1, 2), JsList(Seq(JsNumber(1), JsNumber(2)))),
    (Map("key" -> "value"), JsMap(Map("key" -> JsString("value"))))
  )

  val javaMap: ju.Map[String, jl.Integer] = {
    val m = new ju.HashMap[String, jl.Integer](1)
    m.put("test", new jl.Integer(42))
    m
  }

  val javaRawToData = Table(
    ("raw", "data"),
    (new jl.Integer(42), JsNumber(42)),
    (new jl.Double(42.0), JsNumber(42.0)),
    (ju.Arrays.asList(new jl.Integer(42)), JsList(Seq(JsNumber(42)))),
    (javaMap, JsMap(Map("test"-> JsNumber(42))))
  )


  it("should parse raw any structure") {
    forAll(rawToData) { (raw: Any, jsLike: JsData) =>
      JsData.fromScala(raw) shouldBe jsLike
    }
  }
  it("should parse raw any java structure") {
    forAll(javaRawToData){ (raw: Any, jsLike: JsData) =>
      JsData.fromJava(raw) shouldBe jsLike
    }
  }

  describe("JsLikeMap") {

    // problem with MapLike - akka can't serialize it
    // scala.collection.immutable.MapLike$$anon$2
    //    java.io.NotSerializableException: scala.collection.immutable.MapLike$$anon$2
    it("JsLikeMap should be serializable") {
      val map = Map("1" -> 1, "2" -> 2).mapValues(i => JsNumber(i))
      val jslikeMap = JsMap(map)

      val bos = new ByteArrayOutputStream
      val out = new ObjectOutputStream(bos)
      out.writeObject(jslikeMap)
      out.close()
    }
  }

  it("should return untyped map") {
    val js = JsMap(
      "a" -> 1.js,
      "b" -> false.js,
      "c" -> JsList(Seq(
        JsMap("x" -> "y".js)
      ))
    )
    val exp = Map(
      "a" -> 1,
      "b" -> false,
      "c" -> Seq(
        Map("x" -> "y")
      )
    )
    JsData.untyped(js) shouldBe exp
  }

} 
Example 79
Source File: QueueInputDStream.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.dstream

import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.{StreamingContext, Time}

private[streaming]
class QueueInputDStream[T: ClassTag](
    ssc: StreamingContext,
    val queue: Queue[RDD[T]],
    oneAtATime: Boolean,
    defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  override def start() { }

  override def stop() { }

  private def readObject(in: ObjectInputStream): Unit = {
    throw new NotSerializableException("queueStream doesn't support checkpointing. " +
      "Please don't use queueStream when checkpointing is enabled.")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        buffer ++= queue
        queue.clear()
      }
    }
    if (buffer.nonEmpty) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        Some(new UnionRDD(context.sc, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }

} 
Example 80
Source File: SerializableConfiguration.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.conf.Configuration

private[spark]
class SerializableConfiguration(@transient var value: Configuration) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new Configuration(false)
    value.readFields(in)
  }
} 
Example 81
Source File: SerializableJobConf.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.mapred.JobConf

private[spark]
class SerializableJobConf(@transient var value: JobConf) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new JobConf(false)
    value.readFields(in)
  }
} 
Example 82
Source File: SerializableBuffer.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels


private[spark]
class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
} 
Example 83
Source File: CartesianRDD.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.util.Utils

private[spark]
class CartesianPartition(
    idx: Int,
    @transient private val rdd1: RDD[_],
    @transient private val rdd2: RDD[_],
    s1Index: Int,
    s2Index: Int
  ) extends Partition {
  var s1 = rdd1.partitions(s1Index)
  var s2 = rdd2.partitions(s2Index)
  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    s1 = rdd1.partitions(s1Index)
    s2 = rdd2.partitions(s2Index)
    oos.defaultWriteObject()
  }
}

private[spark]
class CartesianRDD[T: ClassTag, U: ClassTag](
    sc: SparkContext,
    var rdd1 : RDD[T],
    var rdd2 : RDD[U])
  extends RDD[(T, U)](sc, Nil)
  with Serializable {

  val numPartitionsInRdd2 = rdd2.partitions.length

  override def getPartitions: Array[Partition] = {
    // create the cross product split
    val array = new Array[Partition](rdd1.partitions.length * rdd2.partitions.length)
    for (s1 <- rdd1.partitions; s2 <- rdd2.partitions) {
      val idx = s1.index * numPartitionsInRdd2 + s2.index
      array(idx) = new CartesianPartition(idx, rdd1, rdd2, s1.index, s2.index)
    }
    array
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    (rdd1.preferredLocations(currSplit.s1) ++ rdd2.preferredLocations(currSplit.s2)).distinct
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(T, U)] = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    for (x <- rdd1.iterator(currSplit.s1, context);
         y <- rdd2.iterator(currSplit.s2, context)) yield (x, y)
  }

  override def getDependencies: Seq[Dependency[_]] = List(
    new NarrowDependency(rdd1) {
      def getParents(id: Int): Seq[Int] = List(id / numPartitionsInRdd2)
    },
    new NarrowDependency(rdd2) {
      def getParents(id: Int): Seq[Int] = List(id % numPartitionsInRdd2)
    }
  )

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
  }
} 
Example 84
Source File: UnionRDD.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
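This variant lists parent partitions in parallel once the number of unioned RDDs exceeds spark.rdd.parallelListingThreshold (default 10, read in isPartitionListingParallel above). A sketch of lowering the threshold; the values are illustrative:

import org.apache.spark.{SparkConf, SparkContext}

object ParallelListingDemo extends App {
  val conf = new SparkConf()
    .setMaster("local[*]")
    .setAppName("parallel-listing-demo")
    .set("spark.rdd.parallelListingThreshold", "4")   // parallel partition listing kicks in above 4 parents
  val sc = new SparkContext(conf)
  val union = sc.union((1 to 8).map(i => sc.parallelize(Seq(i))))
  println(union.partitions.length)
  sc.stop()
}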
Example 85
Source File: PartitionerAwareUnionRDD.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils


private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.nonEmpty)
  require(rdds.forall(_.partitioner.isDefined))
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  override val partitioner = rdds.head.partitioner

  override def getPartitions: Array[Partition] = {
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map { index =>
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }.toArray
  }

  // Get the location where most of the partitions of parent RDDs are located
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    val locations = rdds.zip(parentPartitions).flatMap {
      case (rdd, part) =>
        val parentLocations = currPrefLocs(rdd, part)
        logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations)
        parentLocations
    }
    val location = if (locations.isEmpty) {
      None
    } else {
      // Find the location that maximum number of parent partitions prefer
      Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
    }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host)
  }
} 
Example 86
Source File: Estimator.scala    From doddle-model   with Apache License 2.0 5 votes vote down vote up
package io.picnicml.doddlemodel.typeclasses

import java.io.{FileOutputStream, ObjectOutputStream}

// evidence needs to be serializable because it is persisted along with the actual
// estimators within the io.picnicml.doddlemodel.pipeline.Pipeline
trait Estimator[A] extends Serializable {

  def isFitted(model: A): Boolean

  def save(model: A, filePath: String): Unit = {
    val outputStream = new ObjectOutputStream(new FileOutputStream(filePath))
    outputStream.writeObject(model)
    outputStream.close()
  }
} 
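A sketch of providing this type class for a toy model, assuming the trait has only the members shown here; MeanModel and its estimator instance are hypothetical:

import io.picnicml.doddlemodel.typeclasses.Estimator

// Hypothetical model type; case classes are java.io.Serializable, which writeObject requires.
case class MeanModel(mean: Option[Double])

object MeanModelEstimator extends Estimator[MeanModel] {
  override def isFitted(model: MeanModel): Boolean = model.mean.isDefined
}

object EstimatorDemo extends App {
  // Persists the model using the default save implementation from the trait above.
  MeanModelEstimator.save(MeanModel(Some(1.5)), "/tmp/mean-model.bin")
}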
Example 87
Source File: Terminator.scala    From ingraph   with Eclipse Public License 1.0 5 votes vote down vote up
package ingraph.ire.messages

import java.io.{IOException, ObjectInputStream, ObjectOutputStream}

import akka.actor.ActorRef
import ingraph.ire.datatypes.Tuple
import ingraph.ire.util.AtomicUniqueCounter

import scala.collection.mutable
import scala.concurrent.{Future, Promise}

class Terminator private(terminatorID: Int, val inputs: Iterable[ReteMessage => Unit], production: ActorRef) extends ReteMessage with Serializable {
  var lastMessageID = -1

  def send(): Future[Iterable[Tuple]] = {
    val messageID = Terminator.idCounter.getNext
    lastMessageID = messageID
    val promise = Promise[Iterable[Tuple]]
    production ! ExpectTerminator(terminatorID, messageID, promise)
    val future = promise.future
    inputs.foreach(input => {
      input(Pause(messageID))
      input(TerminatorMessage(terminatorID, messageID))
    })
    future
  }

  def resend(): Future[Iterable[Tuple]] = {
    val promise = Promise[Iterable[Tuple]]
    production ! ExpectTerminator(terminatorID, lastMessageID, promise)
    val future = promise.future
    inputs.foreach(input => {
      input(TerminatorMessage(terminatorID, lastMessageID))
    })
    future
  }

  @throws(classOf[IOException])
  private def writeObject(out: ObjectOutputStream): Unit = {}

  @throws(classOf[IOException])
  private def readObject(in: ObjectInputStream): Unit = {}
}

object Terminator {
  val idCounter = new AtomicUniqueCounter

  def apply(inputs: Iterable[ReteMessage => Unit], productionNode: ActorRef): Terminator = {
    val id = idCounter.getNext
    productionNode ! ExpectMoreTerminators(id, inputs)
    new Terminator(id, inputs, productionNode)
  }
}


trait TerminatorHandler {
  val expectedTerminatorCount: Int
  val terminatorCount = new mutable.HashMap[Int, Int]

  def forward(terminator: TerminatorMessage)

  def handleTerminator(terminator: TerminatorMessage): Unit = {
    val count = terminatorCount.getOrElse(terminator.messageID, 0) + 1
    if (count >= expectedTerminatorCount) {
      forward(terminator)
      terminatorCount -= terminator.messageID
    }
    terminatorCount(terminator.messageID) = count
  }
} 
Example 88
Source File: QueueInputDStream.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.dstream

import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.{Time, StreamingContext}

private[streaming]
class QueueInputDStream[T: ClassTag](
    ssc: StreamingContext,
    val queue: Queue[RDD[T]],
    oneAtATime: Boolean,
    defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  override def start() { }

  override def stop() { }

  private def readObject(in: ObjectInputStream): Unit = {
    throw new NotSerializableException("queueStream doesn't support checkpointing. " +
      "Please don't use queueStream when checkpointing is enabled.")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    if (oneAtATime && queue.size > 0) {
      buffer += queue.dequeue()
    } else {
      buffer ++= queue.dequeueAll(_ => true)
    }
    if (buffer.size > 0) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        Some(new UnionRDD(context.sc, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }

} 
Example 89
Source File: SerializableConfiguration.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.conf.Configuration

private[spark]
class SerializableConfiguration(@transient var value: Configuration) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new Configuration(false)
    value.readFields(in)
  }
} 
Example 90
Source File: SerializableJobConf.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.mapred.JobConf

private[spark]
class SerializableJobConf(@transient var value: JobConf) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    value = new JobConf(false)
    value.readFields(in)
  }
} 
Example 91
Source File: SerializableBuffer.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.util

import java.io.{EOFException, IOException, ObjectInputStream, ObjectOutputStream}
import java.nio.ByteBuffer
import java.nio.channels.Channels


private[spark]
class SerializableBuffer(@transient var buffer: ByteBuffer) extends Serializable {
  def value: ByteBuffer = buffer

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    val length = in.readInt()
    buffer = ByteBuffer.allocate(length)
    var amountRead = 0
    val channel = Channels.newChannel(in)
    while (amountRead < length) {
      val ret = channel.read(buffer)
      if (ret == -1) {
        throw new EOFException("End of file before fully reading buffer")
      }
      amountRead += ret
    }
    buffer.rewind() // Allow us to read it later
  }

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.writeInt(buffer.limit())
    if (Channels.newChannel(out).write(buffer) != buffer.limit()) {
      throw new IOException("Could not fully write buffer to output stream")
    }
    buffer.rewind() // Allow us to write it again later
  }
} 
Example 92
Source File: CartesianRDD.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.util.Utils

private[spark]
class CartesianPartition(
    idx: Int,
    @transient private val rdd1: RDD[_],
    @transient private val rdd2: RDD[_],
    s1Index: Int,
    s2Index: Int
  ) extends Partition {
  var s1 = rdd1.partitions(s1Index)
  var s2 = rdd2.partitions(s2Index)
  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    s1 = rdd1.partitions(s1Index)
    s2 = rdd2.partitions(s2Index)
    oos.defaultWriteObject()
  }
}

private[spark]
class CartesianRDD[T: ClassTag, U: ClassTag](
    sc: SparkContext,
    var rdd1 : RDD[T],
    var rdd2 : RDD[U])
  extends RDD[Pair[T, U]](sc, Nil)
  with Serializable {

  val numPartitionsInRdd2 = rdd2.partitions.length

  override def getPartitions: Array[Partition] = {
    // create the cross product split
    val array = new Array[Partition](rdd1.partitions.length * rdd2.partitions.length)
    for (s1 <- rdd1.partitions; s2 <- rdd2.partitions) {
      val idx = s1.index * numPartitionsInRdd2 + s2.index
      array(idx) = new CartesianPartition(idx, rdd1, rdd2, s1.index, s2.index)
    }
    array
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    (rdd1.preferredLocations(currSplit.s1) ++ rdd2.preferredLocations(currSplit.s2)).distinct
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(T, U)] = {
    val currSplit = split.asInstanceOf[CartesianPartition]
    for (x <- rdd1.iterator(currSplit.s1, context);
         y <- rdd2.iterator(currSplit.s2, context)) yield (x, y)
  }

  override def getDependencies: Seq[Dependency[_]] = List(
    new NarrowDependency(rdd1) {
      def getParents(id: Int): Seq[Int] = List(id / numPartitionsInRdd2)
    },
    new NarrowDependency(rdd2) {
      def getParents(id: Int): Seq[Int] = List(id % numPartitionsInRdd2)
    }
  )

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
  }
} 
Example 93
Source File: UnionRDD.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](rdds.map(_.partitions.length).sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 94
Source File: PartitionerAwareUnionRDD.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils


private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.length > 0)
  require(rdds.forall(_.partitioner.isDefined))
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  override val partitioner = rdds.head.partitioner

  override def getPartitions: Array[Partition] = {
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map(index => {
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }).toArray
  }

  // Get the location where most of the partitions of parent RDDs are located
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    val locations = rdds.zip(parentPartitions).flatMap {
      case (rdd, part) => {
        val parentLocations = currPrefLocs(rdd, part)
        logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations)
        parentLocations
      }
    }
    val location = if (locations.isEmpty) {
      None
    } else {
      // Find the location that maximum number of parent partitions prefer
      Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
    }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host)
  }
} 
Example 95
Source File: SerializationTestHelper.scala    From xmlconfect   with Apache License 2.0 5 votes vote down vote up
package com.mthaler.xmlconfect

import java.io.{ ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream }

object SerializationTestHelper {

  
  def serializeDeserialize[T](obj: T): T = {
    val bout = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(bout)
    out.writeObject(obj)
    val bin = new ByteArrayInputStream(bout.toByteArray)
    val in = new ObjectInputStream(bin)
    in.readObject().asInstanceOf[T]
  }
} 
Example 96
Source File: SerializableConfiguration.scala    From spark-acid   with Apache License 2.0 5 votes vote down vote up
package com.qubole.spark.hiveacid.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.conf.Configuration


private[hiveacid] class SerializableConfiguration(@transient var value: Configuration)
  extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = Util.tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Util.tryOrIOException {
    value = new Configuration(false)
    value.readFields(in)
  }
} 
Example 97
Source File: DefineMacroCmd.scala    From piglet   with Apache License 2.0 5 votes vote down vote up
package dbis.piglet.op.cmd

import java.io.{ObjectInputStream, ByteArrayInputStream, ObjectOutputStream, ByteArrayOutputStream}
import dbis.piglet.plan.DataflowPlan
import scala.collection.mutable.ListBuffer
import dbis.piglet.op.{Pipe,PigOperator}


case class DefineMacroCmd(
    out: Pipe, 
    macroName: String, 
    params: Option[List[String]], 
    stmts: List[PigOperator]
  ) extends PigOperator(out) {

  var subPlan: Option[DataflowPlan] = None
  var inPipes = List[Pipe]()

  def deepClone(): DefineMacroCmd = {
      val baos = new ByteArrayOutputStream()
      val oos = new ObjectOutputStream(baos)
      oos.writeObject(this)
      val bais = new ByteArrayInputStream(baos.toByteArray())
      val ois = new ObjectInputStream(bais)
      ois.readObject().asInstanceOf[DefineMacroCmd]
  }

  override def preparePlan: Unit = {
    // method body elided in this excerpt
  }

  def pipeParamPositions(): List[Int] = {
    val l = ListBuffer[Int]()
    inPipes.foreach(i => {
      val pos = params.get.indexOf(i.name.substring(1))
      if (pos >= 0) l += pos
    })
    l.toList
  }
} 
Example 98
Source File: SerializableConfiguration.scala    From hbase-connectors   with Apache License 2.0 5 votes vote down vote up
package org.apache.hadoop.hbase.spark.datasources

import java.io.{IOException, ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.yetus.audience.InterfaceAudience;

import scala.util.control.NonFatal

@InterfaceAudience.Private
class SerializableConfiguration(@transient var value: Configuration) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = tryOrIOException {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = tryOrIOException {
    value = new Configuration(false)
    value.readFields(in)
  }

  def tryOrIOException(block: => Unit) {
    try {
      block
    } catch {
      case e: IOException => throw e
      case NonFatal(t) => throw new IOException(t)
    }
  }
} 
Example 99
Source File: GBDTModel.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.tree.gbdt

import java.io.{FileInputStream, FileOutputStream, ObjectInputStream, ObjectOutputStream}

import com.tencent.angel.sona.tree.gbdt.tree.{GBDTParam, GBTNode}
import com.tencent.angel.sona.tree.regression.RegTree
import org.apache.spark.ml.linalg.Vector

import scala.collection.mutable.ArrayBuffer

object GBDTModel {
  type GBTTree = RegTree[GBTNode]

  def save(model: GBDTModel, path: String): Unit = {
    val oos = new ObjectOutputStream(new FileOutputStream(path))
    oos.writeObject(model)
    oos.close()
  }

  def load(path: String): GBDTModel = {
    val ois = new ObjectInputStream(new FileInputStream(path))
    ois.readObject().asInstanceOf[GBDTModel]
  }
}

import GBDTModel._
class GBDTModel(val param: GBDTParam) extends Serializable {
  private var forest: ArrayBuffer[GBTTree] = ArrayBuffer[GBTTree]()
  private var weights: ArrayBuffer[Float] = ArrayBuffer[Float]()

  def predict(instance: Vector): Array[Float] = {
    if (param.isRegression || param.numClass == 2) {
      var pred = 0.0f
      for (i <- forest.indices)
        pred += weights(i) * forest(i).predictBinary(instance)
      Array(pred)
    } else if (param.multiTree) {
      val preds = Array.ofDim[Float](param.numClass)
      for (i <- forest.indices)
        preds(i % param.numClass) += weights(i) *
          forest(i).predictBinary(instance)
      preds
    } else {
      val preds = Array.ofDim[Float](param.numClass)
      for (i <- forest.indices) {
        val p = forest(i).predictMulti(instance)
        val w = weights(i)
        for (k <- 0 until param.numClass)
          preds(k) += w * p(k)
      }
      preds
    }
  }

  def predict(instances: Array[Vector]): Array[Array[Float]] = {
    instances.map(predict)
  }

  def get(treeId: Int): GBTTree = forest(treeId)

  def add(tree: GBTTree, weight: Float): Unit = {
    forest += tree
    weights += weight
  }

  def keepFirstTrees(num: Int): Unit = {
    forest = forest.slice(0, num)
    weights = weights.slice(0, num)
  }

  def numTree: Int = forest.size
} 
Example 100
Source File: SerializableConfiguration.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow.sql.util

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.conf.Configuration

class SerializableConfiguration(@transient var value: Configuration) extends Serializable {
  private def writeObject(out: ObjectOutputStream): Unit = {
    out.defaultWriteObject()
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = {
    value = new Configuration(false)
    value.readFields(in)
  }
} 
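A usage sketch for the wrapper above: Hadoop's Configuration is not Serializable, so broadcasting the wrapper is a common way to make it available inside tasks. The master, app name and configuration key are illustrative:

import org.apache.hadoop.conf.Configuration
import org.apache.spark.{SparkConf, SparkContext}

import io.projectglow.sql.util.SerializableConfiguration

object BroadcastConfDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("conf-demo"))
    val hadoopConf = new Configuration()
    hadoopConf.set("fs.defaultFS", "file:///")

    // writeObject/readObject above rebuild the Configuration on the executor side.
    val broadcastConf = sc.broadcast(new SerializableConfiguration(hadoopConf))
    sc.parallelize(1 to 3).foreach { _ =>
      require(broadcastConf.value.value.get("fs.defaultFS") == "file:///")
    }
    sc.stop()
  }
}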
Example 101
Source File: ModelSerializationTestHelper.scala    From aloha   with MIT License 5 votes vote down vote up
package com.eharmony.aloha

import java.io.{ObjectInputStream, ByteArrayInputStream, ByteArrayOutputStream, ObjectOutputStream}


trait ModelSerializationTestHelper {
  def serializeDeserializeRoundTrip[A <: java.io.Serializable](a: A): A = {
    val baos = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(baos)
    oos.writeObject(a)
    val bais = new ByteArrayInputStream(baos.toByteArray)
    val ois = new ObjectInputStream(bais)
    val out = ois.readObject()
    out.asInstanceOf[A]
  }
} 
Example 102
Source File: TensorFlowModel.scala    From model-serving-tutorial   with Apache License 2.0 5 votes vote down vote up
package com.lightbend.modelserving.model.tensorflow

import java.io.{IOException, ObjectInputStream, ObjectOutputStream}

import com.lightbend.model.modeldescriptor.ModelDescriptor
import com.lightbend.modelserving.model.Model
import org.tensorflow.{Graph, Session}


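  // Note: the surrounding class declaration (apparently a TensorFlowModel[RECORD, RESULT] implementing
  // the imported Model trait, with bytes, graph, session and inputStream members) is elided in this
  // excerpt; only the serialization-related members are shown below.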
  override def getType: ModelDescriptor.ModelType = ModelDescriptor.ModelType.TENSORFLOW

  override def equals(obj: Any): Boolean = {
    obj match {
      case tfModel: TensorFlowModel[RECORD,RESULT] =>
        tfModel.toBytes.toList == inputStream.toList
      case _ => false
    }
  }

  private def writeObject(output: ObjectOutputStream): Unit = {
    val start = System.currentTimeMillis()
    output.writeObject(bytes)
    println(s"TensorFlow java serialization in ${System.currentTimeMillis() - start} ms")
  }

  private def readObject(input: ObjectInputStream): Unit = {
    val start = System.currentTimeMillis()
    bytes = input.readObject().asInstanceOf[Array[Byte]]
    try{
      graph = new Graph
      graph.importGraphDef(bytes)
      session = new Session(graph)
      println(s"TensorFlow java deserialization in ${System.currentTimeMillis() - start} ms")
    }
    catch {
      case t: Throwable =>
        t.printStackTrace
        println(s"TensorFlow java deserialization failed in ${System.currentTimeMillis() - start} ms")
        println(s"Restored TensorFlow ${new String(bytes)}")
    }
  }
} 
Example 103
Source File: Serialization.scala    From seahorse-workflow-executor   with Apache License 2.0 5 votes vote down vote up
package io.deepsense.commons.serialization

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

trait Serialization {

  def deserialize[T](bytes: Array[Byte]): T = {
    val bufferIn = new ByteArrayInputStream(bytes)
    val streamIn = new ObjectInputStream(bufferIn)
    try {
      streamIn.readObject().asInstanceOf[T]
    } finally {
      streamIn.close()
    }
  }

  def serialize[T](objectToSerialize: T): Array[Byte] = {
    val byteArrayOutputStream: ByteArrayOutputStream = new ByteArrayOutputStream()
    val oos = new ObjectOutputStream(byteArrayOutputStream)
    try {
      oos.writeObject(objectToSerialize)
      oos.flush()
      byteArrayOutputStream.toByteArray
    } finally {
      oos.close()
    }
  }

  def serializeDeserialize[T](obj: T): T = deserialize[T](serialize[T](obj))
}

object Serialization extends Serialization 
Example 104
Source File: IncrementalCache.scala    From sbt-idea-plugin   with Apache License 2.0 5 votes vote down vote up
package org.jetbrains.sbtidea.packaging.artifact

import java.io.{BufferedOutputStream, ByteArrayInputStream, ObjectInputStream, ObjectOutputStream}
import java.nio.file.{Files, Path}

import sbt.Keys.TaskStreams

import scala.collection.mutable

trait IncrementalCache extends AutoCloseable {
  def fileChanged(in: Path): Boolean
}

class DumbIncrementalCache extends IncrementalCache {
  override def fileChanged(in: Path): Boolean = true
  override def close(): Unit = ()
}

class PersistentIncrementalCache(private val root: Path)(implicit private val streams: TaskStreams) extends IncrementalCache {

  private val FILENAME = "sbtidea.cache"
  private val myFile   = root.resolve(FILENAME)
  private val myData   = loadOrCreate()

  type Data = mutable.HashMap[String, Long]

  private def loadFromDisk(): Either[String, Data] = {
    if (!Files.exists(myFile) || Files.size(myFile) <= 0)
      return Left("Cache file is empty or doesn't exist")
    val data = Files.readAllBytes(myFile)
    using(new ObjectInputStream(new ByteArrayInputStream(data))) { stream =>
      Right(stream.readObject().asInstanceOf[Data])
    }
  }

  private def loadOrCreate(): Data = loadFromDisk() match {
    case Left(message) =>
      streams.log.info(message)
      new Data()
    case Right(value) => value
  }

  private def saveToDisk(): Unit = {
    import java.nio.file.StandardOpenOption._
    if (!Files.exists(myFile.getParent)) {
      Files.createDirectories(myFile.getParent)
      Files.createFile(myFile)
    }
    using(new ObjectOutputStream(
          new BufferedOutputStream(
            Files.newOutputStream(myFile, CREATE, WRITE, TRUNCATE_EXISTING)))) { stream =>
      stream.writeObject(myData)
    }
  }

  override def close(): Unit = saveToDisk()

  override def fileChanged(in: Path): Boolean = {
    val newTimestamp = Files.getLastModifiedTime(in).toMillis
    val inStr = in.toString
    val lastTimestamp = myData.getOrElseUpdate(inStr, newTimestamp)
    val result = newTimestamp > lastTimestamp
    myData.put(inStr, newTimestamp)
    result
  }
}
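The cache above relies on a using helper defined elsewhere in the sbt-idea-plugin packaging package. A minimal loan-pattern sketch of such a helper, stated as an assumption about its shape rather than the plugin's actual code:

def using[R <: AutoCloseable, T](resource: R)(block: R => T): T =
  try block(resource) finally resource.close()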