org.apache.spark.Partition Scala Examples

The following examples show how to use org.apache.spark.Partition. Each example is drawn from an open-source project; the project and source file it comes from are noted in the heading above it.
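All of the examples below follow the same pattern: a Partition subclass carries the per-split metadata, getPartitions enumerates the splits, and compute turns one split into an iterator of records. For orientation, here is a minimal self-contained sketch of that pattern (hypothetical names, not taken from any project below):

import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD

// Each partition holds a contiguous range of integers.
class RangePartition(override val index: Int, val from: Int, val until: Int) extends Partition

class RangeRDD(sc: SparkContext, max: Int, numSlices: Int) extends RDD[Int](sc, Nil) {

  override protected def getPartitions: Array[Partition] = {
    val step = math.ceil(max.toDouble / numSlices).toInt
    (0 until numSlices)
      .map(i => new RangePartition(i, i * step, math.min((i + 1) * step, max)))
      .toArray
  }

  override def compute(split: Partition, context: TaskContext): Iterator[Int] = {
    val p = split.asInstanceOf[RangePartition]
    (p.from until p.until).iterator
  }
}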
Example 1
Source File: AnalyticsRDD.scala    From couchbase-spark-connector   with Apache License 2.0
package com.couchbase.spark.rdd

import com.couchbase.client.core.message.cluster.{GetClusterConfigRequest, GetClusterConfigResponse}
import com.couchbase.client.core.service.ServiceType
import com.couchbase.client.java.analytics.AnalyticsQuery
import com.couchbase.client.java.query.N1qlQuery
import com.couchbase.spark.connection._
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, SparkContext, TaskContext}
import rx.lang.scala.JavaConversions.toScalaObservable

import scala.concurrent.duration.Duration


class AnalyticsRDD(@transient private val sc: SparkContext, query: AnalyticsQuery,
                   bucketName: String = null,
                   timeout: Option[Duration] = None)
  extends RDD[CouchbaseAnalyticsRow](sc, Nil) {

  private val cbConfig = CouchbaseConfig(sc.getConf)

  override def compute(split: Partition, context: TaskContext): Iterator[CouchbaseAnalyticsRow] =
    new AnalyticsAccessor(cbConfig, Seq(query), bucketName, timeout).compute()

  override protected def getPartitions: Array[Partition] = {
    // Try to run the query on a Spark worker co-located on a Couchbase analytics node
    val addressesWithAnalyticsService = RDDSupport.couchbaseNodesWithService(cbConfig,
      bucketName,
      ServiceType.ANALYTICS)

    // A single query can only run on one node, so return one partition
    Array(new QueryPartition(0, addressesWithAnalyticsService))
  }

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    RDDSupport.getPreferredLocations(split)
  }
} 
Example 2
Source File: SplitRDD.scala    From spark-bam   with Apache License 2.0
package org.hammerlab.bam.spark.load

import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input
import org.apache.spark.{ Partition, SparkContext, TaskContext }
import org.apache.spark.rdd.RDD

import scala.collection.JavaConverters._

case class FileSplitPartition(index: Int,
                              start: Long,
                              end: Long,
                              locations: Array[String])
  extends Partition

class SplitRDD private(@transient override val getPartitions: Array[Partition])(implicit sc: SparkContext)
  extends RDD[(Long, Long)](sc, Nil) {
  override def compute(split: Partition, context: TaskContext) =
    Iterator(
      split.asInstanceOf[FileSplitPartition]
    )
    .map(
      fs ⇒
        fs.start → fs.end
    )

  override protected def getPreferredLocations(split: Partition) =
    split
      .asInstanceOf[FileSplitPartition]
      .locations
}

object SplitRDD {
  def apply(splits: java.util.List[InputSplit])(implicit sc: SparkContext): SplitRDD =
    new SplitRDD(
      splits
        .iterator()
        .asScala
        .map(_.asInstanceOf[input.FileSplit])
        .zipWithIndex
        .map {
          case (fs, idx) ⇒
            FileSplitPartition(
              idx,
              fs.getStart,
              fs.getStart + fs.getLength,
              fs.getLocations
            )
        }
        .toArray
    )
} 
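A hypothetical way to construct this RDD from a single Hadoop FileSplit, assuming a SparkContext sc is in scope (the path and host are made up):

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.FileSplit
import scala.collection.JavaConverters._

val splits: java.util.List[InputSplit] = List[InputSplit](
  new FileSplit(new Path("hdfs:///data/sample.bam"), 0L, 1024L, Array("host1"))
).asJava

val offsets = SplitRDD(splits)(sc)  // RDD[(Long, Long)] of (start, end) byte offsets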
Example 3
Source File: Scanner.scala    From spark-vector   with Apache License 2.0
package com.actian.spark_vector.datastream.reader

import scala.language.reflectiveCalls

import org.apache.spark.{ Partition, SparkContext, TaskContext }
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow

import com.actian.spark_vector.datastream.VectorEndpointConf


// Note: the enclosing class declaration was elided in this listing; roughly, the members
// below belong to an RDD that reads the configured Vector datastream endpoints:
class Scanner(@transient private val sc: SparkContext,
              val readConf: VectorEndpointConf,
              val reader: DataStreamReader)
  extends RDD[InternalRow](sc, Nil) {

  @volatile private var it: RowReader = _
  
  override protected def getPartitions = (0 until readConf.size).map(idx => new Partition { def index = idx }).toArray

  override protected def getPreferredLocations(split: Partition) = Seq(readConf.vectorEndpoints(split.index).host)
  
  override def compute(split: Partition, taskContext: TaskContext): Iterator[InternalRow] = {
    taskContext.addTaskCompletionListener { _ => closeAll() }
    taskContext.addTaskFailureListener { (_, e) => closeAll(Option(e)) }
    logDebug("Computing partition " + split.index)
    try {
      it = reader.read(split.index)
      it
    } catch { case e: Exception => 
      logDebug("Exception occurred when attempting to read from stream. If termination was abnormal an additional exception will be thrown.", e)
      Iterator.empty
    }
  }
  
  def touchDatastreams(parts: List[Int] = List.empty): Unit = {
    val untouched = List.range(0, readConf.size).diff(parts)
    untouched.foreach ( p =>
      try {
        reader.touch(p) //Need to ensure all the streams have been closed except the one used by this instance
        logDebug(s"Closed partition $p Vector transfer datastream")
      } catch {
        case e: Exception => logDebug("Exception while closing unused Vector transfer datastream " + e.toString())
      }
    )
  }
  
  def closeAll(failure: Option[Throwable] = None): Unit = {
    failure.foreach(logError("Failure during task completion, closing RowReader", _))
    if (it != null) {
      close(it, "RowReader")
      it = null
    }
  }
  
  private def close[T <: { def close() }](c: T, resourceName: String): Unit = if (c != null) {
    try { c.close } catch { case e: Exception => logWarning(s"Exception closing $resourceName", e) }
  }
  
} 
Example 4
Source File: InsertRDD.scala    From spark-vector   with Apache License 2.0
package com.actian.spark_vector.datastream.writer

import scala.annotation.tailrec
import scala.reflect.ClassTag

import org.apache.spark.{ OneToOneDependency, NarrowDependency, Partition, TaskContext }
import org.apache.spark.rdd.RDD

import com.actian.spark_vector.datastream.{ DataStreamPartition, DataStreamPartitionAssignment, VectorEndpointConf }


// Note: the enclosing class declaration and a small tail-recursive helper were elided in
// this listing; both are reconstructed here, roughly, for completeness. The helper walks
// one-to-one dependencies up the lineage to find a parent partition's preferred locations.
class InsertRDD[R: ClassTag](rdd: RDD[R], writeConf: VectorEndpointConf)
  extends RDD[R](rdd.context, Nil) {

  private val partitionsPerDataStreamToPrint = 10

  @tailrec
  private def getPreferredLocationsRec(rdd: RDD[_], partition: Partition): Seq[String] = {
    val locations = rdd.preferredLocations(partition)
    rdd.dependencies.headOption match {
      case Some(dep: OneToOneDependency[_]) if locations.isEmpty =>
        getPreferredLocationsRec(dep.rdd, dep.rdd.partitions(partition.index))
      case _ => locations
    }
  }

  private val endPointsToParentPartitionsMap = {
    val affinities = rdd.partitions.map(getPreferredLocationsRec(rdd, _))

    val ret = DataStreamPartitionAssignment(affinities, writeConf.vectorEndpoints)
    logDebug(s"Computed endPointsToParentPartitionsMap and got: ${
      (0 until ret.length).map {
        case idx =>
          val vals = ret(idx)
          s"Datastream $idx -> RDD partitions ${vals.length}: ${vals.take(partitionsPerDataStreamToPrint).mkString(",")} ${if (vals.length > partitionsPerDataStreamToPrint) "..." else ""}"
      }
    }")
    ret.map(_.map(rdd.partitions(_).index))
  }

  override protected def getPartitions = (0 until writeConf.size).map(idx =>
    DataStreamPartition(idx, rdd, endPointsToParentPartitionsMap(idx))).toArray

  override protected def getPreferredLocations(split: Partition) = {
    logDebug(s"getPreferredLocations is called for partition ${split.index} and we are returning ${writeConf.vectorEndpoints(split.index).host}")
    Seq(writeConf.vectorEndpoints(split.index).host)
  }

  override def compute(split: Partition, taskContext: TaskContext): Iterator[R] =
    split.asInstanceOf[DataStreamPartition].parents.toIterator.flatMap(firstParent[R].iterator(_, taskContext))

  override def getDependencies: Seq[NarrowDependency[_]] = Seq(new NarrowDependency(rdd) {
    def getParents(partitionId: Int) = endPointsToParentPartitionsMap(partitionId)
  })
} 
Example 5
Source File: PartitionwiseWeightedSampledRDD.scala    From sona   with Apache License 2.0
package com.tencent.angel.sona.graph.utils
import java.util.Random

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, Partitioner, TaskContext}

import scala.reflect.ClassTag
import scala.util.{Random => ScalaRandom}

class PartitionwiseWeightedSampledRDDPartition(val prev: Partition, val seed: Long, val fraction: Double)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


class PartitionwiseWeightedSampledRDD[T: ClassTag, U: ClassTag](
                                                                 prev: RDD[(T, Float)],
                                                                 sampler: WeightedRandomSampler[T, U],
                                                                 fractions: Map[Int, Double],
                                                                 preservesPartitioning: Boolean,
                                                                 @transient private val seed: Long = ScalaRandom.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner: Option[Partitioner] = {
    if (preservesPartitioning) prev.partitioner else None
  }

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[(T, Float)].partitions.map { x =>
      new PartitionwiseWeightedSampledRDDPartition(x, random.nextLong(), fractions.getOrElse(x.index, 0.0))
    }
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    firstParent[(T, Float)].preferredLocations(
      split.asInstanceOf[PartitionwiseWeightedSampledRDDPartition].prev
    )
  }

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseWeightedSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.setFraction(split.fraction)
    thisSampler.sample(firstParent[(T, Float)].iterator(split.prev, context))
  }
} 
Example 6
Source File: PartitionwiseSampledRDD.scala    From sona   with Apache License 2.0
package com.tencent.angel.sona.ml.rdd

import java.util.Random

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.util.random.RandomSampler

import scala.reflect.ClassTag

private[sona] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

/**
  * An RDD sampled from its parent RDD partition-wise. For each partition of the parent RDD,
  * a user-specified [[org.apache.spark.util.random.RandomSampler]] instance is used to obtain
  * a random sample of the records in the partition. The random seeds assigned to the samplers
  * are guaranteed to have different values.
  *
  * @param prev                  RDD to be sampled
  * @param sampler               a random sampler
  * @param preservesPartitioning whether the sampler preserves the partitioner of the parent RDD
  * @param seed                  random seed
  * @tparam T input RDD item type
  * @tparam U sampled RDD item type
  */
private[sona] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
                                                                       prev: RDD[T],
                                                                       sampler: RandomSampler[T, U],
                                                                       preservesPartitioning: Boolean,
                                                                       @transient private val seed: Long = (new Random).nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
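This class is essentially a vendored copy of Spark's own PartitionwiseSampledRDD, which in stock Spark sits behind RDD.sample; ordinary user code reaches the same machinery like this:

val data = sc.parallelize(1 to 1000, 4)
// withReplacement = false uses a Bernoulli sampler per partition under the hood.
val sampled = data.sample(withReplacement = false, fraction = 0.1, seed = 42L)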
Example 7
Source File: SlidingRDD.scala    From sona   with Apache License 2.0
package com.tencent.angel.sona.ml.rdd

import scala.collection.mutable
import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD

private[angel]
class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T], val offset: Int)
  extends Partition with Serializable {
  override val index: Int = idx
}

/**
 * Represents an RDD from grouping items of its parent RDD in fixed size blocks by passing a sliding
 * window over them. The ordering is first based on the partition index and then the ordering of
 * items within each partition. This is similar to sliding in Scala collections, except that it
 * becomes an empty RDD if the window size is greater than the total number of items. It needs to
 * trigger a Spark job if the parent RDD has more than one partition. To make this operation
 * efficient, the number of items per partition should be larger than the window size and the
 * window size should be small, e.g., 2.
 *
 * @param parent the parent RDD
 * @param windowSize the window size, must be greater than 1
 * @param step step size for windows
 *
 * @see `org.apache.spark.ml.rdd.RDDFunctions.sliding(Int, Int)*`
 * @see `scala.collection.IterableLike.sliding(Int, Int)*`
 */
private[angel]
class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int, val step: Int)
  extends RDD[Array[T]](parent) {

  require(windowSize > 0 && step > 0 && !(windowSize == 1 && step == 1),
    "Window size and step must be greater than 0, " +
      s"and they cannot be both 1, but got windowSize = $windowSize and step = $step.")

  override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = {
    val part = split.asInstanceOf[SlidingRDDPartition[T]]
    (firstParent[T].iterator(part.prev, context) ++ part.tail)
      .drop(part.offset)
      .sliding(windowSize, step)
      .withPartial(false)
      .map(_.toArray)
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev)

  override def getPartitions: Array[Partition] = {
    val parentPartitions = parent.partitions
    val n = parentPartitions.length
    if (n == 0) {
      Array.empty
    } else if (n == 1) {
      Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty, 0))
    } else {
      val w1 = windowSize - 1
      // Get partition sizes and first w1 elements.
      val (sizes, heads) = parent.mapPartitions { iter =>
        val w1Array = iter.take(w1).toArray
        Iterator.single((w1Array.length + iter.length, w1Array))
      }.collect().unzip
      val partitions = mutable.ArrayBuffer.empty[SlidingRDDPartition[T]]
      var i = 0
      var cumSize = 0
      var partitionIndex = 0
      while (i < n) {
        val mod = cumSize % step
        val offset = if (mod == 0) 0 else step - mod
        val size = sizes(i)
        if (offset < size) {
          val tail = mutable.ListBuffer.empty[T]
          // Keep appending to the current tail until it has w1 elements.
          var j = i + 1
          while (j < n && tail.length < w1) {
            tail ++= heads(j).take(w1 - tail.length)
            j += 1
          }
          if (sizes(i) + tail.length >= offset + windowSize) {
            partitions +=
              new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail, offset)
            partitionIndex += 1
          }
        }
        cumSize += size
        i += 1
      }
      partitions.toArray
    }
  }

  // TODO: Override methods such as aggregate, which only requires one Spark job.
} 
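A worked example of the windowing behavior (ignoring the private[angel] qualifier for illustration): sliding over 1..5 with windowSize = 3 and step = 1 yields only full windows, because withPartial(false) drops the trailing partial ones.

val rdd = sc.parallelize(1 to 5, 2)
new SlidingRDD(rdd, 3, 1).collect()
// Array(Array(1, 2, 3), Array(2, 3, 4), Array(3, 4, 5))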
Example 8
Source File: KeyValueRDD.scala    From couchbase-spark-connector   with Apache License 2.0
package com.couchbase.spark.rdd


import java.net.InetAddress
import java.util.zip.CRC32

import com.couchbase.client.core.config.CouchbaseBucketConfig
import com.couchbase.client.core.message.cluster.{GetClusterConfigRequest, GetClusterConfigResponse}
import com.couchbase.client.java.document.Document
import com.couchbase.spark.Logging
import com.couchbase.spark.connection.{CouchbaseConfig, CouchbaseConnection, KeyValueAccessor}
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, SparkContext, TaskContext}

import scala.reflect.ClassTag
import rx.lang.scala.JavaConversions._

import scala.concurrent.duration.Duration

class KeyValuePartition(id: Int, docIds: Seq[String], loc: Option[InetAddress]) extends Partition {
  override def index: Int = id
  def ids: Seq[String] = docIds
  def location: Option[InetAddress] = loc
  override def toString = s"KeyValuePartition($id, $docIds, $loc)"
}

class KeyValueRDD[D <: Document[_]]
  (@transient private val sc: SparkContext, ids: Seq[String], bname: String = null,
   timeout: Option[Duration] = None)
  (implicit ct: ClassTag[D])
  extends RDD[D](sc, Nil) {

  private val cbConfig = CouchbaseConfig(sc.getConf)
  private val bucketName = Option(bname).getOrElse(cbConfig.buckets.head.name)

  override def compute(split: Partition, context: TaskContext): Iterator[D] = {
    val p = split.asInstanceOf[KeyValuePartition]
    new KeyValueAccessor[D](cbConfig, p.ids, bucketName, timeout).compute()
  }

  override protected def getPartitions: Array[Partition] = {
    val core = CouchbaseConnection().bucket(cbConfig, bucketName).core()

    val req = new GetClusterConfigRequest()
    val config = toScalaObservable(core.send[GetClusterConfigResponse](req))
      .map(c => {
        logWarning(c.config().bucketConfigs().toString)
        logWarning(bucketName)
        c.config().bucketConfig(bucketName)
      })
      .toBlocking
      .single

    val parts = config match {
      case bucketConfig: CouchbaseBucketConfig =>
        val numPartitions = bucketConfig.numberOfPartitions()
        var partitionIndex = 0
        ids.groupBy(id => {
          val crc32 = new CRC32()
          crc32.update(id.getBytes("UTF-8"))
          val rv = (crc32.getValue >> 16) & 0x7fff
          rv.toInt & numPartitions - 1
        }).map(grouped => {
          val hostname = Some(
            bucketConfig.nodeAtIndex(bucketConfig.nodeIndexForMaster(grouped._1, false)).hostname()
          )
          val currentIdx = partitionIndex
          partitionIndex += 1
          new KeyValuePartition(currentIdx, grouped._2,
            Some(InetAddress.getByName(hostname.get)))
        }).toArray
      case _ =>
        logWarning("Memcached preferred locations currently not supported.")
        Array(new KeyValuePartition(0, ids, None))
    }

    parts.asInstanceOf[Array[Partition]]
  }

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    val p = split.asInstanceOf[KeyValuePartition]
    if (p.location.isDefined) {
      Seq(p.location.get.getHostName, p.location.get.getHostAddress)
    } else {
      Nil
    }
  }

} 
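The grouping key computed in getPartitions above is the standard Couchbase vbucket id: a CRC32 over the document id, shifted and masked. Extracted as a standalone sketch:

import java.util.zip.CRC32

def vbucketOf(id: String, numPartitions: Int): Int = {
  val crc32 = new CRC32()
  crc32.update(id.getBytes("UTF-8"))
  // & binds looser than -, so this masks with (numPartitions - 1), exactly as in the RDD above.
  ((crc32.getValue >> 16) & 0x7fff).toInt & numPartitions - 1
}

vbucketOf("user::123", 1024)  // a stable vbucket id in [0, 1024)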
Example 9
Source File: SpatialViewRDD.scala    From couchbase-spark-connector   with Apache License 2.0
package com.couchbase.spark.rdd

import com.couchbase.client.java.document.json.JsonObject
import com.couchbase.client.java.view.SpatialViewQuery
import com.couchbase.spark.connection.{CouchbaseConfig, SpatialViewAccessor}
import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD

import scala.concurrent.duration.Duration

case class CouchbaseSpatialViewRow(id: String, key: Any, value: Any, geometry: JsonObject)

class SpatialViewRDD
  (@transient private val sc: SparkContext, viewQuery: SpatialViewQuery, bucketName: String = null,
   timeout: Option[Duration] = None)
  extends RDD[CouchbaseSpatialViewRow](sc, Nil) {

  private val cbConfig = CouchbaseConfig(sc.getConf)

  override def compute(split: Partition, context: TaskContext):
    Iterator[CouchbaseSpatialViewRow] =
    new SpatialViewAccessor(cbConfig, Seq(viewQuery), bucketName, timeout).compute()

  override protected def getPartitions: Array[Partition] = Array(new CouchbasePartition(0))

}

object SpatialViewRDD {
  def apply(sc: SparkContext, bucketName: String, viewQuery: SpatialViewQuery,
            timeout: Option[Duration] = None) =
    new SpatialViewRDD(sc, viewQuery, bucketName, timeout)
} 
Example 10
Source File: QueryRDD.scala    From couchbase-spark-connector   with Apache License 2.0
package com.couchbase.spark.rdd

import com.couchbase.client.core.message.cluster.{GetClusterConfigRequest, GetClusterConfigResponse}
import com.couchbase.client.core.service.ServiceType
import com.couchbase.client.java.document.json.JsonObject
import com.couchbase.client.java.query.N1qlQuery
import com.couchbase.spark.connection.{CouchbaseConfig, CouchbaseConnection, QueryAccessor}
import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD
import rx.lang.scala.JavaConversions.toScalaObservable

import scala.concurrent.duration.Duration

case class CouchbaseQueryRow(value: JsonObject)

class QueryPartition(val index: Int, val hostnames: Seq[String]) extends Partition {
  override def toString = s"QueryPartition($index, $hostnames)"
}

class QueryRDD(@transient private val sc: SparkContext, query: N1qlQuery,
               bucketName: String = null,
               timeout: Option[Duration] = None)
  extends RDD[CouchbaseQueryRow](sc, Nil) {

  private val cbConfig = CouchbaseConfig(sc.getConf)

  override def compute(split: Partition, context: TaskContext): Iterator[CouchbaseQueryRow] =
    new QueryAccessor(cbConfig, Seq(query), bucketName, timeout).compute()

  override protected def getPartitions: Array[Partition] = {
    // Try to run the query on a Spark worker co-located on a Couchbase query node
    val addressesWithQueryService = RDDSupport.couchbaseNodesWithService(cbConfig,
      bucketName,
      ServiceType.QUERY)

    // A single query can only run on one node, so return one partition
    Array(new QueryPartition(0, addressesWithQueryService))
  }

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    RDDSupport.getPreferredLocations(split)
  }
}

object QueryRDD {

  def apply(sc: SparkContext, bucketName: String, query: N1qlQuery,
            timeout: Option[Duration] = None) =
    new QueryRDD(sc, query, bucketName, timeout)

} 
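A hypothetical construction via the companion object, using the SDK's simple query factory (the bucket name and statement are made up):

import com.couchbase.client.java.query.N1qlQuery

val rows = QueryRDD(sc, "travel-sample",
  N1qlQuery.simple("SELECT name FROM `travel-sample` LIMIT 10"))
rows.collect().foreach(println)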
Example 11
Source File: RDDSupport.scala    From couchbase-spark-connector   with Apache License 2.0
package com.couchbase.spark.rdd

import com.couchbase.client.core.message.cluster.{GetClusterConfigRequest, GetClusterConfigResponse}
import com.couchbase.client.core.service.ServiceType
import com.couchbase.spark.connection.{CouchbaseConfig, CouchbaseConnection}
import org.apache.spark.Partition
import rx.lang.scala.JavaConversions.toScalaObservable


// Note: the enclosing object declaration and its couchbaseNodesWithService helper (which
// returns the hostnames of Couchbase nodes running a given service type) were elided in
// this listing.
object RDDSupport {

  def getPreferredLocations(split: Partition): Seq[String] = {
    val p = split.asInstanceOf[QueryPartition]

    // If the user has co-located Spark worker services on Couchbase nodes, this will get the query
    // to run on a Spark worker running on a relevant Couchbase node, if possible
    val out = if (p.hostnames.nonEmpty) {
      p.hostnames
    } else {
      Nil
    }
    out
  }

} 
Example 12
Source File: SubdocLookupRDD.scala    From couchbase-spark-connector   with Apache License 2.0
package com.couchbase.spark.rdd

import java.net.InetAddress
import java.util.zip.CRC32

import com.couchbase.client.core.config.CouchbaseBucketConfig
import com.couchbase.client.core.message.cluster.{GetClusterConfigRequest, GetClusterConfigResponse}
import com.couchbase.spark.Logging
import com.couchbase.spark.connection._
import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD
import rx.lang.scala.JavaConversions._

import scala.concurrent.duration.Duration


class SubdocLookupPartition(id: Int, specs: Seq[SubdocLookupSpec], loc: Option[InetAddress])
  extends Partition {
  override def index: Int = id
  def ids: Seq[SubdocLookupSpec] = specs
  def location: Option[InetAddress] = loc
  override def toString = s"SubdocLookupPartition($id, $ids, $loc)"
}


class SubdocLookupRDD(@transient private val sc: SparkContext, specs: Seq[SubdocLookupSpec],
                      bname: String = null, timeout: Option[Duration] = None)
  extends RDD[SubdocLookupResult](sc, Nil) {

  private val cbConfig = CouchbaseConfig(sc.getConf)
  private val bucketName = Option(bname).getOrElse(cbConfig.buckets.head.name)


  override def compute(split: Partition, context: TaskContext): Iterator[SubdocLookupResult] = {
    val p = split.asInstanceOf[SubdocLookupPartition]
    new SubdocLookupAccessor(cbConfig, p.ids, bucketName, timeout).compute()
  }

  override protected def getPartitions: Array[Partition] = {
    val core = CouchbaseConnection().bucket(cbConfig, bucketName).core()

    val req = new GetClusterConfigRequest()
    val config = toScalaObservable(core.send[GetClusterConfigResponse](req))
      .map(c => {
        logWarning(c.config().bucketConfigs().toString)
        logWarning(bucketName)
        c.config().bucketConfig(bucketName)
      })
      .toBlocking
      .single

    val parts = config match {
      case bucketConfig: CouchbaseBucketConfig =>
        val numPartitions = bucketConfig.numberOfPartitions()
        var partitionIndex = 0
        specs.groupBy(spec => {
          val crc32 = new CRC32()
          crc32.update(spec.id.getBytes("UTF-8"))
          val rv = (crc32.getValue >> 16) & 0x7fff
          rv.toInt & numPartitions - 1
        }).map(grouped => {
          val hostname = Some(
            bucketConfig.nodeAtIndex(bucketConfig.nodeIndexForMaster(grouped._1, false)).hostname()
          )
          val currentIdx = partitionIndex
          partitionIndex += 1
          new SubdocLookupPartition(currentIdx, grouped._2,
            Some(InetAddress.getByName(hostname.get)))
        }).toArray
      case _ =>
        logWarning("Memcached preferred locations currently not supported.")
        Array(new SubdocLookupPartition(0, specs, None))
    }

    parts.asInstanceOf[Array[Partition]]
  }

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    val p = split.asInstanceOf[SubdocLookupPartition]
    if (p.location.isDefined) {
      Seq(p.location.get.getHostName, p.location.get.getHostAddress)
    } else {
      Nil
    }
  }

} 
Example 13
Source File: BinaryFileRDD.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.hadoop.conf.{ Configurable, Configuration }
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._
import org.apache.spark.input.StreamFileInputFormat
import org.apache.spark.{ Partition, SparkContext }

private[spark] class BinaryFileRDD[T](
    sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = newJobContext(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
} 
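BinaryFileRDD is the RDD behind SparkContext.binaryFiles, the usual public entry point (the path below is hypothetical):

// Each record is (path, PortableDataStream).
val files = sc.binaryFiles("hdfs:///data/images", minPartitions = 8)
files.mapValues(_.toArray()).take(1)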
Example 14
Source File: SubdocMutateRDD.scala    From couchbase-spark-connector   with Apache License 2.0
package com.couchbase.spark.rdd

import java.net.InetAddress
import java.util.zip.CRC32

import com.couchbase.client.core.config.CouchbaseBucketConfig
import com.couchbase.client.core.message.cluster.{GetClusterConfigRequest, GetClusterConfigResponse}
import com.couchbase.spark.connection._
import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD
import rx.lang.scala.JavaConversions._

import scala.concurrent.duration.Duration

class SubdocMutationPartition(id: Int, specs: Seq[SubdocMutationSpec], loc: Option[InetAddress])
  extends Partition {
  override def index: Int = id
  def ids: Seq[SubdocMutationSpec] = specs
  def location: Option[InetAddress] = loc
  override def toString = s"SubdocMutatePartition($id, $ids, $loc)"
}

class SubdocMutateRDD(@transient private val sc: SparkContext, specs: Seq[SubdocMutationSpec],
                      bname: String = null, timeout: Option[Duration] = None)
  extends RDD[SubdocMutationResult](sc, Nil) {

  private val cbConfig = CouchbaseConfig(sc.getConf)
  private val bucketName = Option(bname).getOrElse(cbConfig.buckets.head.name)


  override def compute(split: Partition, context: TaskContext): Iterator[SubdocMutationResult] = {
    val p = split.asInstanceOf[SubdocMutationPartition]
    new SubdocMutationAccessor(cbConfig, p.ids, bucketName, timeout).compute()
  }

  override protected def getPartitions: Array[Partition] = {
    val core = CouchbaseConnection().bucket(cbConfig, bucketName).core()

    val req = new GetClusterConfigRequest()
    val config = toScalaObservable(core.send[GetClusterConfigResponse](req))
      .map(c => {
        logWarning(c.config().bucketConfigs().toString)
        logWarning(bucketName)
        c.config().bucketConfig(bucketName)
      })
      .toBlocking
      .single

    val parts = config match {
      case bucketConfig: CouchbaseBucketConfig =>
        val numPartitions = bucketConfig.numberOfPartitions()
        var partitionIndex = 0
        specs.groupBy(spec => {
          val crc32 = new CRC32()
          crc32.update(spec.id.getBytes("UTF-8"))
          val rv = (crc32.getValue >> 16) & 0x7fff
          rv.toInt & numPartitions - 1
        }).map(grouped => {
          val hostname = Some(
            bucketConfig.nodeAtIndex(bucketConfig.nodeIndexForMaster(grouped._1, false)).hostname()
          )
          val currentIdx = partitionIndex
          partitionIndex += 1
          new SubdocMutationPartition(currentIdx, grouped._2,
            Some(InetAddress.getByName(hostname.get)))
        }).toArray
      case _ =>
        logWarning("Memcached preferred locations currently not supported.")
        Array(new SubdocMutationPartition(0, specs, None))
    }

    parts.asInstanceOf[Array[Partition]]
  }

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    val p = split.asInstanceOf[SubdocMutationPartition]
    if (p.location.isDefined) {
      Seq(p.location.get.getHostName, p.location.get.getHostAddress)
    } else {
      Nil
    }
  }

} 
Example 15
Source File: NewHBaseRDD.scala    From hbase-connectors   with Apache License 2.0
package org.apache.hadoop.hbase.spark

import org.apache.hadoop.conf.Configuration
import org.apache.yetus.audience.InterfaceAudience
import org.apache.hadoop.mapreduce.InputFormat
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}

@InterfaceAudience.Public
class NewHBaseRDD[K,V](@transient val sc : SparkContext,
                       @transient val inputFormatClass: Class[_ <: InputFormat[K, V]],
                       @transient val keyClass: Class[K],
                       @transient val valueClass: Class[V],
                       @transient private val __conf: Configuration,
                       val hBaseContext: HBaseContext) extends NewHadoopRDD(sc, inputFormatClass, keyClass, valueClass, __conf) {

  override def compute(theSplit: Partition, context: TaskContext): InterruptibleIterator[(K, V)] = {
    hBaseContext.applyCreds()
    super.compute(theSplit, context)
  }
} 
Example 16
Source File: GDBRDD.scala    From spark-gdb   with Apache License 2.0
package com.esri.gdb

import org.apache.hadoop.conf.Configuration
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.{Logging, Partition, SparkContext, TaskContext}


case class GDBRDD(@transient sc: SparkContext, gdbPath: String, gdbName: String, numPartitions: Int) extends RDD[Row](sc, Nil) with Logging {

  @DeveloperApi
  override def compute(partition: Partition, context: TaskContext): Iterator[Row] = {
    val part = partition.asInstanceOf[GDBPartition]
    val hadoopConf = if (sc == null) new Configuration() else sc.hadoopConfiguration
    val index = GDBIndex(gdbPath, part.hexName, hadoopConf)
    val table = GDBTable(gdbPath, part.hexName, hadoopConf)
    context.addTaskCompletionListener(context => {
      table.close()
      index.close()
    })
    table.rowIterator(index, part.startAtRow, part.numRowsToRead)
  }

  override protected def getPartitions: Array[Partition] = {
    val hadoopConf = if (sc == null) new Configuration() else sc.hadoopConfiguration
    GDBTable.findTable(gdbPath, gdbName, hadoopConf) match {
      case Some(catTab) => {
        val index = GDBIndex(gdbPath, catTab.hexName, hadoopConf)
        try {
          val numRows = index.numRows
          val numRowsPerPartition = (numRows.toDouble / numPartitions).ceil.toInt
          var startAtRow = 0
          (0 until numPartitions).map(i => {
            val endAtRow = startAtRow + numRowsPerPartition
            val numRowsToRead = if (endAtRow <= numRows) numRowsPerPartition else numRows - startAtRow
            val gdbPartition = GDBPartition(i, catTab.hexName, startAtRow, numRowsToRead)
            startAtRow += numRowsToRead
            gdbPartition
          }).toArray
        } finally {
          index.close()
        }
      }
      case _ => {
        log.error(s"Cannot find '$gdbName' in $gdbPath, creating an empty array of Partitions !")
        Array.empty[Partition]
      }
    }
  }
}

private[this] case class GDBPartition(m_index: Int,
                                      val hexName: String,
                                      val startAtRow: Int,
                                      val numRowsToRead: Int
                                     ) extends Partition {
  override def index = m_index
} 
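A worked example of the row-range split in getPartitions: with numRows = 10 and numPartitions = 3, numRowsPerPartition = ceil(10 / 3) = 4, giving partitions that read rows [0, 4), [4, 8) and [8, 10).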
Example 17
Source File: RDDMatcher.scala    From piglet   with Apache License 2.0
package dbis.piglet.cep.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, TaskContext}
import scala.reflect.ClassTag
import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.cep.engines._
import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.backends.{SchemaClass => Event}
import dbis.piglet.cep.ops.MatchCollector
import dbis.piglet.cep.ops.SelectionStrategy

class RDDMatcher[T <: Event: ClassTag](parent: RDD[T], nfa: NFAController[T], sstr: SelectionStrategy = SelectionStrategy.FirstMatch, out: OutputStrategy = Combined) extends RDD[T](parent){
   val collector: MatchCollector[T] = new MatchCollector()
   val engine: CEPEngine[T] = sstr match {
    case SelectionStrategy.FirstMatch        => new FirstMatch(nfa, collector)
    case SelectionStrategy.AllMatches        => new AnyMatch(nfa, collector)
    case SelectionStrategy.NextMatches       => new NextMatch(nfa, collector)
    case SelectionStrategy.ContiguityMatches => new ContiguityMatch(nfa, collector)
    case _                                   => throw new Exception("The Strategy is not supported")

  }
  override def compute(split: Partition, context: TaskContext): Iterator[T] =  {
    firstParent[T].iterator(split, context).foreach (event => engine.runEngine(event))
    collector.convertEventsToArray().iterator
  }
  

  override protected def getPartitions: Array[Partition] = firstParent[Event].partitions
} 
Example 18
Source File: Neo4jRDD.scala    From neo4j-spark-connector   with Apache License 2.0
package org.neo4j.spark.rdd

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.neo4j.spark.{Executor, Neo4jConfig, Partitions}

class Neo4jRDD(@transient sc: SparkContext, val query: String, val parameters: Map[String, Any] = Map.empty, partitions: Partitions = Partitions())
  extends RDD[Row](sc, Nil) {

  val neo4jConfig = Neo4jConfig(sc.getConf)

  override def compute(partition: Partition, context: TaskContext): Iterator[Row] = {
    val neo4jPartition: Neo4jPartition = partition.asInstanceOf[Neo4jPartition]

    Executor.execute(neo4jConfig, query, parameters ++ neo4jPartition.window).sparkRows
  }

  override protected def getPartitions: Array[Partition] = {
    val p = partitions.effective()
    Range(0, p.partitions.toInt).map(idx => new Neo4jPartition(idx, p.skip(idx), p.limit(idx))).toArray
  }

  override def toString(): String = s"Neo4jRDD partitions $partitions $query using $parameters"
} 
Example 19
Source File: PushDownJdbcRDD.scala    From gimel   with Apache License 2.0
package com.paypal.gimel.jdbc.utilities

import java.sql.{Connection, ResultSet}

import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.sql.Row

import com.paypal.gimel.common.utilities.GenericUtils
import com.paypal.gimel.logger.Logger


class PushDownJdbcRDD(sc: SparkContext,
                      getConnection: () => Connection,
                      sql: String,
                      mapRow: ResultSet => Row = PushDownJdbcRDD.resultSetToRow)
  extends JdbcRDD[Row](sc, getConnection, sql, 0, 100, 1, mapRow)
    with Logging {

  override def compute(thePart: Partition,
                       context: TaskContext): Iterator[Row] = {
    val logger = Logger(this.getClass.getName)
    val functionName = s"[QueryHash: ${sql.hashCode}]"
    logger.info(s"Proceeding to execute push down query $functionName: $sql")
    val queryResult: String = GenericUtils.time(functionName, Some(logger)) {
      JDBCConnectionUtility.withResources(getConnection()) { connection =>
        JdbcAuxiliaryUtilities.executeQueryAndReturnResultString(
          sql,
          connection
        )
      }
    }
    Seq(Row(queryResult)).iterator
  }
}

object PushDownJdbcRDD {
  def resultSetToRow(rs: ResultSet): Row = {
    // JDBC ResultSet columns are 1-based.
    Row(rs.getString(1))
  }
} 
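The superclass constructor is given placeholder bounds (0, 100, one partition) only to satisfy JdbcRDD; compute is overridden to ignore them and execute the full query once. A hypothetical usage, with jdbcUrl standing in for a real connection string:

import java.sql.DriverManager

val jdbcUrl = "jdbc:postgresql://host:5432/db"  // hypothetical
val pushDown = new PushDownJdbcRDD(sc, () => DriverManager.getConnection(jdbcUrl),
  "SELECT COUNT(*) FROM orders")
pushDown.collect()  // a single Row containing the query result as a string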
Example 20
Source File: FilteredQueryInfinispanRDD.scala    From infinispan-spark   with Apache License 2.0
package org.infinispan.spark.rdd

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, TaskContext}
import org.infinispan.client.hotrod.Search
import org.infinispan.spark._


class FilteredQueryInfinispanRDD[K, V, R](parent: InfinispanRDD[K, V], filter: QueryFilter) extends RDD[(K, R)](parent.sc, Nil) {

   override def count(): Long = filter match {
      case f: StringQueryFilter =>
         val cache = getCache(parent.configuration, parent.remoteCacheManager)
         Search.getQueryFactory(cache).create(f.queryString).getResultSize
      case f: QueryObjectFilter => f.query.getResultSize
   }


   @DeveloperApi
   override def compute(split: Partition, context: TaskContext): Iterator[(K, R)] = parent.compute(
      split,
      context,
      (a, p) => RemoteCacheManagerBuilder.create(p, a),
      filter.name,
      filter.params.toArray
   )

   override protected def getPartitions: Array[Partition] = parent.getPartitions

} 
Example 21
Source File: Splitter.scala    From infinispan-spark   with Apache License 2.0
package org.infinispan.spark.rdd

import java.net.SocketAddress
import java.util

import org.apache.spark.Partition
import org.infinispan.client.hotrod.CacheTopologyInfo
import org.infinispan.spark.config.ConnectorConfiguration

import scala.collection.JavaConverters._
import scala.collection.mutable



// Note: the Splitter trait was elided in this listing; its single abstract method is
// recoverable from the override below.
trait Splitter {
   def split(cacheTopology: CacheTopologyInfo, properties: ConnectorConfiguration): Array[Partition]
}

class PerServerSplitter extends Splitter {

   override def split(cacheTopology: CacheTopologyInfo, properties: ConnectorConfiguration): Array[Partition] = {
      val segmentsByServer = cacheTopology.getSegmentsPerServer
      if (segmentsByServer.isEmpty) throw new IllegalArgumentException("No servers found to partition")
      if (segmentsByServer.keySet().size == 1 && segmentsByServer.values().asScala.flatten(_.asScala).isEmpty) {
         Array(new SingleServerPartition(segmentsByServer.keySet.asScala.head, properties))
      } else {
         val segmentsByServerSeq = segmentsByServer.asScala.toStream.sortBy { case (_, v) => v.size }
         val segments = segmentsByServerSeq.flatMap { case (_, segs) => segs.asScala.toSeq }.distinct

         val numServers = segmentsByServerSeq.size
         val numSegments = segments.size
         val segmentsPerServer = Math.ceil(numSegments.toFloat / numServers.toFloat).toInt

         val q = mutable.Queue(segments: _*)
         val segmentsByServerIterator = Iterator.continually(segmentsByServerSeq).flatten
         val result = new mutable.HashMap[SocketAddress, collection.mutable.Set[Integer]] with mutable.MultiMap[SocketAddress, Integer]
         while (q.nonEmpty) {
            val (server, segments) = segmentsByServerIterator.next()
            val split = List.fill(segmentsPerServer) {
               q.dequeueFirst(segments.contains)
            }.flatten
            if (split.nonEmpty) {
               split.foreach {
                  result.addBinding(server, _)
               }
            }
         }

         val pps = properties.getServerPartitions
         result.toStream.flatMap { case (a, b) => cut(b.toSeq, pps).map((a, _)) }.zipWithIndex.map { case ((server, segs), idx) =>
            new InfinispanPartition(idx, Location(server), toJavaSet(segs), properties)
         }.toArray
      }
   }

   private def toJavaSet(s: Set[Integer]) = new util.HashSet[Integer](s.asJava)

   private def cut[A](l: Seq[A], parts: Int) = (0 until parts).map { i => l.drop(i).sliding(1, parts).flatten.toSet }.filter(_.nonEmpty)

} 
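The private cut helper deals segments round-robin into at most parts non-empty sets, so each resulting partition gets an even share; for example:

// cut(Seq(1, 2, 3, 4, 5, 6, 7), 3) == Vector(Set(1, 4, 7), Set(2, 5), Set(3, 6))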
Example 22
Source File: RabbitMQPartition.scala    From spark-rabbitmq   with Apache License 2.0
package org.apache.spark.streaming.rabbitmq.distributed

import org.apache.spark.Partition
import org.apache.spark.streaming.rabbitmq.models.ExchangeAndRouting

private[rabbitmq]
class RabbitMQPartition(
                         val index: Int,
                         val queue: String,
                         val exchangeAndRouting: ExchangeAndRouting,
                         val connectionParams: Map[String, String],
                         val withFairDispatch: Boolean
                       ) extends Partition {

  override def toString: String = s"${index.toString},$queue, ${exchangeAndRouting.toString()}," +
    s" ${connectionParams.mkString(" , ")}, ${withFairDispatch.toString}"

  def toStringPretty: String = s"queue: $queue, ${exchangeAndRouting.toStringPretty()}, connectionParams: " +
    s"${connectionParams.mkString(" , ")}, withFairDispatch: $withFairDispatch"
} 
Example 23
Source File: MongodbRDD.scala    From Spark-MongoDB   with Apache License 2.0
package com.stratio.datasource.mongodb.rdd

import com.mongodb.casbah.Imports._
import com.stratio.datasource.mongodb.partitioner.{MongodbPartition, MongodbPartitioner}
import com.stratio.datasource.util.Config
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{Partition, TaskContext}

import com.stratio.datasource.mongodb.query.{NoFilters, FilterSection}


class MongodbRDD(
  sc: SQLContext,
  config: Config,
  partitioner: MongodbPartitioner,
  requiredColumns: Array[String] = Array(),
  filters: FilterSection = NoFilters)
  extends RDD[DBObject](sc.sparkContext, deps = Nil) {

  override def getPartitions: Array[Partition] =
    partitioner.computePartitions().asInstanceOf[Array[Partition]]

  override def getPreferredLocations(split: Partition): Seq[String] =
    split.asInstanceOf[MongodbPartition].hosts.map(new ServerAddress(_).getHost)

  override def compute(
    split: Partition,
    context: TaskContext): MongodbRDDIterator =
    new MongodbRDDIterator(
      context,
      split.asInstanceOf[MongodbPartition],
      config,
      requiredColumns,
      filters)

} 
Example 24
Source File: PartitionPruningRDDSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
} 
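Outside of tests, PartitionPruningRDD.create is the usual entry point; it runs a job on a subset of partitions without computing the rest:

val rdd = sc.parallelize(1 to 100, 10)
// Only the first two parent partitions are ever computed.
val pruned = org.apache.spark.rdd.PartitionPruningRDD.create(rdd, partitionId => partitionId < 2)
pruned.count()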
Example 25
Source File: SnowflakeRDD.scala    From spark-snowflake   with Apache License 2.0
package net.snowflake.spark.snowflake.io

import java.io.InputStream

import net.snowflake.spark.snowflake.io.SupportedFormat.SupportedFormat
import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD

class SnowflakeRDD(sc: SparkContext,
                   fileNames: List[String],
                   format: SupportedFormat,
                   downloadFile: String => InputStream,
                   expectedPartitionCount: Int)
    extends RDD[String](sc, Nil) {

  @transient private val MIN_FILES_PER_PARTITION = 2
  @transient private val MAX_FILES_PER_PARTITION = 10

  override def compute(split: Partition,
                       context: TaskContext): Iterator[String] = {
    val snowflakePartition = split.asInstanceOf[SnowflakePartition]

    val stringIterator = new SFRecordReader(format, snowflakePartition.index)
    stringIterator.setDownloadFunction(downloadFile)

    snowflakePartition.fileNames.foreach(name => {
      stringIterator.addFileName(name)
    })

    logger.info(
      s"""${SnowflakeResultSetRDD.WORKER_LOG_PREFIX}: Start reading
         | partition ID:${snowflakePartition.index}
         | totalFileCount=${snowflakePartition.fileNames.size}
         |""".stripMargin.filter(_ >= ' '))

    stringIterator
  }

  override protected def getPartitions: Array[Partition] = {
    var fileCountPerPartition =
      Math.max(
        MIN_FILES_PER_PARTITION,
        (fileNames.length + expectedPartitionCount / 2) / expectedPartitionCount
      )
    fileCountPerPartition = Math.min(MAX_FILES_PER_PARTITION, fileCountPerPartition)
    val fileCount = fileNames.length
    val partitionCount = (fileCount + fileCountPerPartition - 1) / fileCountPerPartition
    logger.info(s"""${SnowflakeResultSetRDD.MASTER_LOG_PREFIX}: Total statistics:
         | fileCount=$fileCount filePerPartition=$fileCountPerPartition
         | actualPartitionCount=$partitionCount
         | expectedPartitionCount=$expectedPartitionCount
         |""".stripMargin.filter(_ >= ' '))

    if (fileNames.nonEmpty) {
      fileNames
        .grouped(fileCountPerPartition)
        .zipWithIndex
        .map {
          case (names, index) => SnowflakePartition(names, id, index)
        }
        .toArray
    } else {
      // If the result set is empty, put one empty partition to the array.
      Seq[SnowflakePartition]{SnowflakePartition(fileNames, 0, 0)}.toArray
    }
  }

}

private case class SnowflakePartition(fileNames: List[String],
                                      rddId: Int,
                                      index: Int)
    extends Partition {

  override def hashCode(): Int = 31 * (31 + rddId) + index

  override def equals(other: Any): Boolean = super.equals(other)
} 
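A worked example of the partition-sizing arithmetic in getPartitions, assuming 25 result files and expectedPartitionCount = 4:

// fileCountPerPartition = min(10, max(2, (25 + 4 / 2) / 4)) = min(10, max(2, 6)) = 6
// partitionCount        = (25 + 6 - 1) / 6 = 5 partitions of at most 6 files each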
Example 26
Source File: SlidingRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.rdd

import scala.collection.mutable
import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD

private[mllib]
class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T], val offset: Int)
  extends Partition with Serializable {
  override val index: Int = idx
}


private[mllib]
class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int, val step: Int)
  extends RDD[Array[T]](parent) {

  require(windowSize > 0 && step > 0 && !(windowSize == 1 && step == 1),
    "Window size and step must be greater than 0, " +
      s"and they cannot be both 1, but got windowSize = $windowSize and step = $step.")

  override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = {
    val part = split.asInstanceOf[SlidingRDDPartition[T]]
    (firstParent[T].iterator(part.prev, context) ++ part.tail)
      .drop(part.offset)
      .sliding(windowSize, step)
      .withPartial(false)
      .map(_.toArray)
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev)

  override def getPartitions: Array[Partition] = {
    val parentPartitions = parent.partitions
    val n = parentPartitions.length
    if (n == 0) {
      Array.empty
    } else if (n == 1) {
      Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty, 0))
    } else {
      val w1 = windowSize - 1
      // Get partition sizes and first w1 elements.
      val (sizes, heads) = parent.mapPartitions { iter =>
        val w1Array = iter.take(w1).toArray
        Iterator.single((w1Array.length + iter.length, w1Array))
      }.collect().unzip
      val partitions = mutable.ArrayBuffer.empty[SlidingRDDPartition[T]]
      var i = 0
      var cumSize = 0
      var partitionIndex = 0
      while (i < n) {
        val mod = cumSize % step
        val offset = if (mod == 0) 0 else step - mod
        val size = sizes(i)
        if (offset < size) {
          val tail = mutable.ListBuffer.empty[T]
          // Keep appending to the current tail until it has w1 elements.
          var j = i + 1
          while (j < n && tail.length < w1) {
            tail ++= heads(j).take(w1 - tail.length)
            j += 1
          }
          if (sizes(i) + tail.length >= offset + windowSize) {
            partitions +=
              new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail, offset)
            partitionIndex += 1
          }
        }
        cumSize += size
        i += 1
      }
      partitions.toArray
    }
  }

  // TODO: Override methods such as aggregate, which only requires one Spark job.
} 
Example 27
Source File: StateStoreRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.streaming.state

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.internal.SessionState
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration


class StateStoreRDD[T: ClassTag, U: ClassTag](
    dataRDD: RDD[T],
    storeUpdateFunction: (StateStore, Iterator[T]) => Iterator[U],
    checkpointLocation: String,
    operatorId: Long,
    storeVersion: Long,
    keySchema: StructType,
    valueSchema: StructType,
    sessionState: SessionState,
    @transient private val storeCoordinator: Option[StateStoreCoordinatorRef])
  extends RDD[U](dataRDD) {

  private val storeConf = new StateStoreConf(sessionState.conf)

  // A Hadoop Configuration can be about 10 KB, which is pretty big, so broadcast it
  private val confBroadcast = dataRDD.context.broadcast(
    new SerializableConfiguration(sessionState.newHadoopConf()))

  override protected def getPartitions: Array[Partition] = dataRDD.partitions

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val storeId = StateStoreId(checkpointLocation, operatorId, partition.index)
    storeCoordinator.flatMap(_.getLocation(storeId)).toSeq
  }

  override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = {
    var store: StateStore = null
    val storeId = StateStoreId(checkpointLocation, operatorId, partition.index)
    store = StateStore.get(
      storeId, keySchema, valueSchema, storeVersion, storeConf, confBroadcast.value.value)
    val inputIter = dataRDD.iterator(partition, ctxt)
    storeUpdateFunction(store, inputIter)
  }
} 
Example 28
Source File: WholeTextFileRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat


private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
} 
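WholeTextFileRDD backs SparkContext.wholeTextFiles (the path below is hypothetical):

// Each record is (path, entire file contents as a String).
val docs = sc.wholeTextFiles("hdfs:///data/docs", minPartitions = 4)
docs.mapValues(_.length).collect()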
Example 29
Source File: SubtractedRDD.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.rdd

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.Dependency
import org.apache.spark.OneToOneDependency
import org.apache.spark.Partition
import org.apache.spark.Partitioner
import org.apache.spark.ShuffleDependency
import org.apache.spark.SparkEnv
import org.apache.spark.TaskContext


private[spark] class SubtractedRDD[K: ClassTag, V: ClassTag, W: ClassTag](
    @transient var rdd1: RDD[_ <: Product2[K, V]],
    @transient var rdd2: RDD[_ <: Product2[K, W]],
    part: Partitioner)
  extends RDD[(K, V)](rdd1.context, Nil) {


  override def getDependencies: Seq[Dependency[_]] = {
    def rddDependency[T1: ClassTag, T2: ClassTag](rdd: RDD[_ <: Product2[T1, T2]])
      : Dependency[_] = {
      if (rdd.partitioner == Some(part)) {
        logDebug("Adding one-to-one dependency with " + rdd)
        new OneToOneDependency(rdd)
      } else {
        logDebug("Adding shuffle dependency with " + rdd)
        new ShuffleDependency[T1, T2, Any](rdd, part)
      }
    }
    Seq(rddDependency[K, V](rdd1), rddDependency[K, W](rdd2))
  }

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](part.numPartitions)
    for (i <- 0 until array.length) {
      // Each CoGroupPartition will depend on rdd1 and rdd2
      array(i) = new CoGroupPartition(i, Seq(rdd1, rdd2).zipWithIndex.map { case (rdd, j) =>
        dependencies(j) match {
          case s: ShuffleDependency[_, _, _] =>
            None
          case _ =>
            Some(new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i)))
        }
      }.toArray)
    }
    array
  }

  override val partitioner = Some(part)

  override def compute(p: Partition, context: TaskContext): Iterator[(K, V)] = {
    val partition = p.asInstanceOf[CoGroupPartition]
    val map = new JHashMap[K, ArrayBuffer[V]]
    def getSeq(k: K): ArrayBuffer[V] = {
      val seq = map.get(k)
      if (seq != null) {
        seq
      } else {
        val seq = new ArrayBuffer[V]()
        map.put(k, seq)
        seq
      }
    }
    def integrate(depNum: Int, op: Product2[K, V] => Unit): Unit = {
      dependencies(depNum) match {
        case oneToOneDependency: OneToOneDependency[_] =>
          val dependencyPartition = partition.narrowDeps(depNum).get.split
          oneToOneDependency.rdd.iterator(dependencyPartition, context)
            .asInstanceOf[Iterator[Product2[K, V]]].foreach(op)

        case shuffleDependency: ShuffleDependency[_, _, _] =>
          val iter = SparkEnv.get.shuffleManager
            .getReader(
              shuffleDependency.shuffleHandle, partition.index, partition.index + 1, context)
            .read()
          iter.foreach(op)
      }
    }

    // the first dep is rdd1; add all values to the map
    integrate(0, t => getSeq(t._1) += t._2)
    // the second dep is rdd2; remove all of its keys
    integrate(1, t => map.remove(t._1))
    map.asScala.iterator.map(t => t._2.iterator.map((t._1, _))).flatten
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
  }

} 
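SubtractedRDD is the machinery behind PairRDDFunctions.subtractByKey: it materializes rdd1's partition into a map and then deletes every key seen in rdd2, which is why rdd1 should be the smaller of the two. For example:

val a = sc.parallelize(Seq("x" -> 1, "y" -> 2, "z" -> 3))
val b = sc.parallelize(Seq("y" -> 99))
a.subtractByKey(b).collect()  // Array((x,1), (z,3)); ordering may vary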
Example 30
Source File: ZippedWithIndexRDD.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.Utils

private[spark]
class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark]
class ZippedWithIndexRDD[T: ClassTag](prev: RDD[T]) extends RDD[(T, Long)](prev) {

  @transient private val startIndices: Array[Long] = {
    val n = prev.partitions.length
    if (n == 0) {
      Array.empty
    } else if (n == 1) {
      Array(0L)
    } else {
      prev.context.runJob(
        prev,
        Utils.getIteratorSize _,
        0 until n - 1 // do not need to count the last partition
      ).scanLeft(0L)(_ + _)
    }
  }

  override def getPartitions: Array[Partition] = {
    firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index)))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = {
    val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition]
    val parentIter = firstParent[T].iterator(split.prev, context)
    Utils.getIteratorZipWithIndex(parentIter, split.startIndex)
  }
} 
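The startIndices computation above runs a job over all but the last partition and scanLefts the sizes into cumulative offsets; each output partition then numbers its elements starting from its own offset. This is what RDD.zipWithIndex exposes:

val rdd = sc.parallelize(Seq("a", "b", "c", "d", "e"), 2)
// Partition sizes are (2, 3), so startIndices = Seq(2L).scanLeft(0L)(_ + _) = Array(0, 2).
rdd.zipWithIndex().collect()
// Array((a,0), (b,1), (c,2), (d,3), (e,4))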
Example 31
Source File: UnionRDD.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
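UnionRDD backs RDD.union (and the ++ operator): each parent contributes a contiguous range of output partitions, recorded as a RangeDependency. For instance:

val a = sc.parallelize(1 to 4, 2)  // becomes union partitions 0-1
val b = sc.parallelize(5 to 8, 3)  // becomes union partitions 2-4
val u = a.union(b)
u.partitions.length  // 5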
Example 32
Source File: PartitionwiseSampledRDD.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
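PartitionwiseSampledRDD underlies RDD.sample: getPartitions derives a per-partition seed from the master seed, so resampling with the same seed is fully reproducible:

val data = sc.parallelize(1 to 1000, 4)
val s1 = data.sample(withReplacement = false, fraction = 0.1, seed = 42L)
val s2 = data.sample(withReplacement = false, fraction = 0.1, seed = 42L)
s1.collect().sameElements(s2.collect())  // true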
Example 33
Source File: PartitionerAwareUnionRDD.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils


private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.nonEmpty)
  require(rdds.forall(_.partitioner.isDefined))
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  override val partitioner = rdds.head.partitioner

  override def getPartitions: Array[Partition] = {
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map { index =>
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }.toArray
  }

  // Get the location where most of the partitions of parent RDDs are located
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    val locations = rdds.zip(parentPartitions).flatMap {
      case (rdd, part) =>
        val parentLocations = currPrefLocs(rdd, part)
        logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations)
        parentLocations
    }
    val location = if (locations.isEmpty) {
      None
    } else {
      // Find the location that maximum number of parent partitions prefer
      Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
    }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host)
  }
} 
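SparkContext.union picks this RDD automatically when every input shares the same partitioner, so a union of co-partitioned RDDs keeps the partitioner instead of widening the partition count:

val p = new org.apache.spark.HashPartitioner(4)
val a = sc.parallelize(Seq(1 -> "a", 2 -> "b")).partitionBy(p)
val b = sc.parallelize(Seq(3 -> "c", 4 -> "d")).partitionBy(p)
val u = a.union(b)
u.partitions.length  // 4, not 8
u.partitioner        // Some(p)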
Example 34
Source File: BinaryFileRDD.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.StreamFileInputFormat

private[spark] class BinaryFileRDD[T](
    sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
} 
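BinaryFileRDD is the RDD behind SparkContext.binaryFiles, which pairs each file path with a PortableDataStream for lazy byte access (the path below is hypothetical):

val bins = sc.binaryFiles("hdfs:///data/images", minPartitions = 8)
bins.mapValues(_.toArray().length).collect()  // (path, size in bytes) pairs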
Example 35
Source File: PartitionPruningRDDSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
} 
Example 36
Source File: RiakKeysPartitioner.scala    From spark-riak-connector   with Apache License 2.0 5 votes vote down vote up
package com.basho.riak.spark.rdd.partitioner

import com.basho.riak.client.core.util.HostAndPort
import com.basho.riak.spark.query.QueryData
import com.basho.riak.spark.rdd.{ RiakPartition, ReadConf }
import org.apache.spark.Partition
import java.math.BigInteger

case class RiakKeysPartition[K](
  index: Int,
  endpoints: Iterable[HostAndPort],
  keys: QueryData[K]) extends RiakPartition

object RiakKeysPartitioner {
  def partitions[K](endpoints: Iterable[HostAndPort], readConf: ReadConf, riakKeys: QueryData[K]): Array[Partition] = {
    riakKeys.keysOrRange match {
      case Some(Left(keys)) =>
        Array(new RiakKeysPartition[K](0, endpoints, riakKeys))

      case Some(Right(ranges: Seq[(K, Option[K])])) =>
        ranges match {
          case (from, to) +: Seq() => {
            val splitRanges = splitRangeIntoSubranges(from, to, readConf.getOrDefaultSplitCount())
            partitionPerRange(splitRanges, endpoints, riakKeys.index)
          }
          case _ => partitionPerRange(ranges, endpoints, riakKeys.index)
        }
    }
  }

  def partitionPerRange[K](ranges: Seq[(K, Option[K])], endpoints: Iterable[HostAndPort], index: Option[String]): Array[Partition] = {
    ranges.zipWithIndex.map {
      case (range, indx) => new RiakKeysPartition[K](indx, endpoints, new QueryData[K](Some(Right(Seq(range))), index))
    }.toArray
  }

  // TODO: move to PartitionUtils
  def calculateRanges[T: Integral](from: T, to: T, splitCount: Int)(implicit num: Integral[T]): Seq[(T, T)] = {
    import num._
    val diff = (to - from) / num.fromInt(splitCount - 1)
    val partitionsCount = if (diff == 0) 1 else splitCount
    val start = (0 to (partitionsCount - 1)).map(x => from + num.fromInt(x) * diff) 
    val end = start.tail.map(_ - num.fromInt(1)) :+ to
    start zip end
  }

  def splitRangeIntoSubranges[K](from: K, to: Option[K], splitCount: Int): Seq[(K, Option[K])] = {
    to match {
      case Some(rangeEnd: Int) => from match {
        case rangeStart: Int => {
          calculateRanges(rangeStart, rangeEnd, splitCount).map(r => (r._1.asInstanceOf[K], Some(r._2.asInstanceOf[K])))
        }
        case _ => throw new IllegalArgumentException
      }
      case Some(rangeEnd: Long) => from match {
        case rangeStart: Long => {
          calculateRanges(rangeStart, rangeEnd, splitCount).map(r => (r._1.asInstanceOf[K], Some(r._2.asInstanceOf[K])))
        }
        case _ => throw new IllegalArgumentException
      }
      case Some(rangeEnd: BigInt) => from match {
        case rangeStart: BigInt => {
          calculateRanges(rangeStart, rangeEnd, splitCount).map(r => (r._1.asInstanceOf[K], Some(r._2.asInstanceOf[K])))
        }
        case _ => throw new IllegalArgumentException
      }
      case _ => Seq((from, to))
    }
  }
} 
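To see how calculateRanges carves a key range into near-equal subranges, trace it with from = 1, to = 100, splitCount = 4:

// diff  = (100 - 1) / (4 - 1) = 33
// start = [1, 34, 67, 100]
// end   = start.tail.map(_ - 1) :+ 100 = [33, 66, 99, 100]
RiakKeysPartitioner.calculateRanges(1, 100, 4)
// Seq((1,33), (34,66), (67,99), (100,100)); note the final single-point range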
Example 37
Source File: Partitioner.scala    From spark-solr   with Apache License 2.0 5 votes vote down vote up
package com.lucidworks.spark

import java.net.InetAddress

import com.lucidworks.spark.rdd.SolrRDD
import com.lucidworks.spark.util.SolrSupport
import org.apache.solr.client.solrj.SolrQuery
import org.apache.spark.Partition

import scala.collection.mutable.ArrayBuffer

// Is there a need to override {@code Partitioner.scala} and define our own partition ids?
object SolrPartitioner {

  def getShardPartitions(shards: List[SolrShard], query: SolrQuery) : Array[Partition] = {
    shards.zipWithIndex.map{ case (shard, i) =>
      // Choose any of the replicas as the active shard to query
      SelectSolrRDDPartition(i, "*", shard, query, SolrRDD.randomReplica(shard))}.toArray
  }

  def getSplitPartitions(
      shards: List[SolrShard],
      query: SolrQuery,
      splitFieldName: String,
      splitsPerShard: Int): Array[Partition] = {
    var splitPartitions = ArrayBuffer.empty[SelectSolrRDDPartition]
    var counter = 0
    shards.foreach(shard => {
      val splits = SolrSupport.getShardSplits(query, shard, splitFieldName, splitsPerShard)
      splits.foreach(split => {
        splitPartitions += SelectSolrRDDPartition(counter, "*", shard, split.query, split.replica)
        counter = counter + 1
      })
    })
    splitPartitions.toArray
  }

  // Workaround for SOLR-10490. TODO: Remove once fixed
  def getExportHandlerPartitions(
      shards: List[SolrShard],
      query: SolrQuery): Array[Partition] = {
    shards.zipWithIndex.map{ case (shard, i) =>
      // Choose any of the replicas as the active shard to query
      ExportHandlerPartition(i, shard, query, SolrRDD.randomReplica(shard), 0, 0)}.toArray
  }

  // Workaround for SOLR-10490. TODO: Remove once fixed
  def getExportHandlerPartitions(
      shards: List[SolrShard],
      query: SolrQuery,
      splitFieldName: String,
      splitsPerShard: Int): Array[Partition] = {
    val splitPartitions = ArrayBuffer.empty[ExportHandlerPartition]
    var counter = 0
    shards.foreach(shard => {
      // Form a continuous iterator list so that we can pick different replicas for different partitions in round-robin mode
      val splits = SolrSupport.getExportHandlerSplits(query, shard, splitFieldName, splitsPerShard)
      splits.foreach(split => {
        splitPartitions += ExportHandlerPartition(counter, shard, split.query, split.replica, split.numWorkers, split.workerId)
        counter = counter+1
      })
    })
    splitPartitions.toArray
  }

}

case class SolrShard(shardName: String, replicas: List[SolrReplica])

case class SolrReplica(
    replicaNumber: Int,
    replicaName: String,
    replicaUrl: String,
    replicaHostName: String,
    locations: Array[InetAddress]) {
  def getHostAndPort(): String = replicaHostName.substring(0, replicaHostName.indexOf('_'))
  override def toString(): String =
    s"SolrReplica(${replicaNumber}) ${replicaName}: url=${replicaUrl}, hostName=${replicaHostName}, locations=" + locations.mkString(",")
} 
Example 38
Source File: SolrRDDPartition.scala    From spark-solr   with Apache License 2.0 5 votes vote down vote up
package com.lucidworks.spark

import org.apache.solr.client.solrj.SolrQuery
import org.apache.solr.common.params.SolrParams
import org.apache.spark.Partition

trait SolrRDDPartition extends Partition {
  val solrShard: SolrShard
  val query: SolrQuery
  var preferredReplica: SolrReplica // Preferred replica to query
}

case class CloudStreamPartition(
    index: Int,
    zkhost: String,
    collection: String,
    params: SolrParams)
  extends Partition

case class SelectSolrRDDPartition(
    index: Int,
    cursorMark: String,
    solrShard: SolrShard,
    query: SolrQuery,
    var preferredReplica: SolrReplica)
  extends SolrRDDPartition

case class ExportHandlerPartition(
    index: Int,
    solrShard: SolrShard,
    query: SolrQuery,
    var preferredReplica: SolrReplica,
    numWorkers: Int,
    workerId: Int)
  extends SolrRDDPartition

case class SolrLimitPartition(
    index: Int = 0,
    zkhost: String,
    collection: String,
    maxRows: Int,
    query: SolrQuery)
  extends Partition 
Example 39
Source File: HBasePartition.scala    From Backup-Repo   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.regionserver.RegionScanner
import org.apache.spark.{Logging, Partition}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.hbase.catalyst.expressions.PartialPredicateOperations._
import org.apache.spark.sql.hbase.types.{HBaseBytesType, Range}


private[hbase] class HBasePartition(
                                     val idx: Int, val mappedIndex: Int,
                                     start: Option[HBaseRawType] = None,
                                     end: Option[HBaseRawType] = None,
                                     val server: Option[String] = None,
                                     val filterPredicates: Option[Expression] = None,
                                     @transient relation: HBaseRelation = null,
                                     @transient val newScanner: RegionScanner = null)
  extends Range[HBaseRawType](start, true, end, false, HBaseBytesType)
  with Partition with IndexMappable with Logging {

  override def index: Int = idx

  override def hashCode(): Int = idx

  @transient lazy val startNative: Seq[Any] = relation.nativeKeyConvert(start)

  @transient lazy val endNative: Seq[Any] = relation.nativeKeyConvert(end)

  
  def computePredicate(relation: HBaseRelation): Option[Expression] = {
    val predicate = if (filterPredicates.isDefined &&
      filterPredicates.get.references.exists(_.exprId == relation.partitionKeys.head.exprId)) {
      val oriPredicate = filterPredicates.get
      val predicateReferences = oriPredicate.references.toSeq
      val boundReference = BindReferences.bindReference(oriPredicate, predicateReferences)
      val row = new GenericMutableRow(predicateReferences.size)
      var rowIndex = 0
      var i = 0
      var range: Range[_] = null
      while (i < relation.keyColumns.size) {
        range = relation.generateRange(this, oriPredicate, i)
        if (range != null) {
          rowIndex = relation.rowIndex(predicateReferences, i)
          if (rowIndex >= 0) row.update(rowIndex, range)
          // if the non-last dimension range is not point, do not proceed to the next dims
          if (i < relation.keyColumns.size - 1 && !range.isPoint) i = relation.keyColumns.size
          else i = i + 1
        } else i = relation.keyColumns.size
      }
      val pr = boundReference.partialReduce(row, predicateReferences)
      pr match {
        case (null, e: Expression) => Some(e)
        case (true, _) => None
        case (false, _) => Some(Literal(false))
      }
    } else filterPredicates
    logInfo(predicate.toString)
    predicate
  }

  override def toString = {
    s"HBasePartition: $idx, $mappedIndex, [$start, $end), $filterPredicates"
  }
} 
Example 40
Source File: PartitionsIterator.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.rdd

import grizzled.slf4j.Logger

import org.apache.spark.rdd.RDD
import org.apache.spark.{ Partition, TaskContext }

protected[flint] object PartitionsIterator {
  val logger = Logger(PartitionsIterator.getClass)

  def apply[T](
    rdd: RDD[T],
    partitions: Seq[Partition],
    context: TaskContext,
    preservesPartitionsOrdering: Boolean = false // FIXME: This is a band-aid which should be fixed.
  ): PartitionsIterator[T] = new PartitionsIterator(rdd, partitions, context, preservesPartitionsOrdering)
}


// Reconstructed declaration (taken from the companion's apply above); the original
// class body was lost in extraction, so only the member shown below is kept and the
// iteration logic is elided.
protected[flint] class PartitionsIterator[T](
    rdd: RDD[T],
    partitions: Seq[Partition],
    context: TaskContext,
    preservesPartitionsOrdering: Boolean = false
  ) extends Iterator[T] {

  // The partition currently being consumed; hasNext/next are elided in this excerpt.
  private var curPart: Partition = partitions.head

  def headPartitionIndex: Int = curPart.index
} 
Example 41
Source File: ParallelCollectionRDD.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.rdd

import org.apache.spark.rdd.RDD
import org.apache.spark.{ Partition, SparkContext, TaskContext }

import scala.reflect.ClassTag



case class ParallelCollectionRDDPartition[T: ClassTag](
  override val index: Int,
  values: Seq[T]
) extends Partition

class ParallelCollectionRDD[T: ClassTag](
  sc: SparkContext,
  @transient data: Seq[Seq[T]]
) extends RDD[T](sc, Nil) {
  override def compute(split: Partition, context: TaskContext): Iterator[T] =
    split.asInstanceOf[ParallelCollectionRDDPartition[T]].values.iterator

  override protected def getPartitions: Array[Partition] =
    data.zipWithIndex.map {
      case (d, index) =>
        ParallelCollectionRDDPartition(index, d)
    }.toArray
} 
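Unlike SparkContext.parallelize, this variant lets the caller dictate exactly which elements land in which partition:

val rdd = new ParallelCollectionRDD(sc, Seq(Seq(1, 2), Seq(3, 4, 5)))
rdd.getNumPartitions  // 2
rdd.glom().collect()  // Array(Array(1, 2), Array(3, 4, 5))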
Example 42
Source File: RangeMergeJoinSpec.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.rdd

import com.twosigma.flint.rdd.function.join.RangeMergeJoin
import org.apache.spark.Partition
import org.scalatest.FlatSpec

class RangeMergeJoinSpec extends FlatSpec {
  val thisSplits = IndexedSeq(
    RangeSplit(Split(0), CloseOpen(1, Some(2))),
    RangeSplit(Split(1), CloseOpen(2, Some(3))),
    RangeSplit(Split(2), CloseOpen(3, Some(4))),
    RangeSplit(Split(3), CloseOpen(4, Some(5))),
    RangeSplit(Split(4), CloseOpen(5, None))
  )

  val thatSplits = IndexedSeq(
    RangeSplit(Split(0), CloseOpen(1, Some(3))),
    RangeSplit(Split(1), CloseOpen(3, Some(7))),
    RangeSplit(Split(2), CloseOpen(7, None))
  )

  "The RangeMergeJoin" should "`mergeSplits` with no tolerance correctly" in {
    val benchmark = List(
      RangeMergeJoin(
        CloseOpen(1, Some(2)),
        List(RangeSplit(Split(0), CloseOpen(1, Some(2)))),
        List(RangeSplit(Split(0), CloseOpen(1, Some(3))))
      ),
      RangeMergeJoin(
        CloseOpen(2, Some(3)),
        List(RangeSplit(Split(1), CloseOpen(2, Some(3)))),
        List(RangeSplit(Split(0), CloseOpen(1, Some(3))))
      ),
      RangeMergeJoin(
        CloseOpen(3, Some(4)),
        List(RangeSplit(Split(2), CloseOpen(3, Some(4)))),
        List(RangeSplit(Split(1), CloseOpen(3, Some(7))))
      ),
      RangeMergeJoin(
        CloseOpen(4, Some(5)),
        List(RangeSplit(Split(3), CloseOpen(4, Some(5)))),
        List(RangeSplit(Split(1), CloseOpen(3, Some(7))))
      ),
      RangeMergeJoin(
        CloseOpen(5, Some(7)),
        List(RangeSplit(Split(4), CloseOpen(5, None))),
        List(RangeSplit(Split(1), CloseOpen(3, Some(7))))
      ),
      RangeMergeJoin(
        CloseOpen(7, None),
        List(RangeSplit(Split(4), CloseOpen(5, None))),
        List(RangeSplit(Split(2), CloseOpen(7, None)))
      )
    )
    assertResult(benchmark) { RangeMergeJoin.mergeSplits(thisSplits, thatSplits) }
  }

  it should "`mergeSplits` with some tolerance correctly" in {
    val benchmark = List(
      RangeMergeJoin(
        CloseOpen(1, Some(2)),
        List(RangeSplit(Split(0), CloseOpen(1, Some(2)))),
        List(RangeSplit(Split(0), CloseOpen(1, Some(3))))
      ),
      RangeMergeJoin(
        CloseOpen(2, Some(3)),
        List(RangeSplit(Split(0), CloseOpen(1, Some(2))), RangeSplit(Split(1), CloseOpen(2, Some(3)))),
        List(RangeSplit(Split(0), CloseOpen(1, Some(3))))
      ),
      RangeMergeJoin(
        CloseOpen(3, Some(4)),
        List(RangeSplit(Split(1), CloseOpen(2, Some(3))), RangeSplit(Split(2), CloseOpen(3, Some(4)))),
        List(RangeSplit(Split(0), CloseOpen(1, Some(3))), RangeSplit(Split(1), CloseOpen(3, Some(7))))
      ),
      RangeMergeJoin(
        CloseOpen(4, Some(5)),
        List(RangeSplit(Split(2), CloseOpen(3, Some(4))), RangeSplit(Split(3), CloseOpen(4, Some(5)))),
        List(RangeSplit(Split(1), CloseOpen(3, Some(7))))
      ),
      RangeMergeJoin(
        CloseOpen(5, Some(7)),
        List(RangeSplit(Split(3), CloseOpen(4, Some(5))), RangeSplit(Split(4), CloseOpen(5, None))),
        List(RangeSplit(Split(1), CloseOpen(3, Some(7))))
      ),
      RangeMergeJoin(
        CloseOpen(7, None),
        List(RangeSplit(Split(4), CloseOpen(5, None))),
        List(RangeSplit(Split(1), CloseOpen(3, Some(7))), RangeSplit(Split(2), CloseOpen(7, None)))
      )
    )

    assertResult(benchmark) { RangeMergeJoin.mergeSplits(thisSplits, thatSplits, { x: Int => x - 1 }) }
  }
} 
Example 43
Source File: TiHandleRDD.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.tispark

import com.pingcap.tikv.meta.TiDAGRequest
import com.pingcap.tikv.util.RangeSplitter
import com.pingcap.tikv.{TiConfiguration, TiSession}
import com.pingcap.tispark.utils.TiUtil
import com.pingcap.tispark.{TiPartition, TiTableReference}
import gnu.trove.list.array.TLongArrayList
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.{Partition, TaskContext, TaskKilledException}

import scala.collection.JavaConversions._
import scala.collection.JavaConverters._


class TiHandleRDD(
    override val dagRequest: TiDAGRequest,
    override val physicalId: Long,
    val output: Seq[Attribute],
    override val tiConf: TiConfiguration,
    override val tableRef: TiTableReference,
    @transient private val session: TiSession,
    @transient private val sparkSession: SparkSession)
    extends TiRDD(dagRequest, physicalId, tiConf, tableRef, session, sparkSession) {

  private val outputTypes = output.map(_.dataType)
  private val converters =
    outputTypes.map(CatalystTypeConverters.createToCatalystConverter)

  override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] =
    new Iterator[InternalRow] {
      checkTimezone()

      private val tiPartition = split.asInstanceOf[TiPartition]
      private val session = TiSession.getInstance(tiConf)
      private val snapshot = session.createSnapshot(dagRequest.getStartTs)
      private[this] val tasks = tiPartition.tasks

      private val handleIterator = snapshot.indexHandleRead(dagRequest, tasks)
      private val regionManager = session.getRegionManager
      private lazy val handleList = {
        val lst = new TLongArrayList()
        handleIterator.asScala.foreach { handle =>
          // Kill the task in case it has been marked as killed. This logic is from
          // InterruptibleIterator, but we inline it here instead of wrapping the iterator in
          // order to avoid performance overhead. Binding the element explicitly also makes
          // the interrupt check run for every element rather than once at closure creation.
          if (context.isInterrupted()) {
            throw new TaskKilledException
          }
          lst.add(handle)
        }
        lst
      }
      // Fetch all handles and group by region id
      private val regionHandleMap = RangeSplitter
        .newSplitter(regionManager)
        .groupByAndSortHandlesByRegionId(physicalId, handleList)
        .map(x => (x._1.first.getId, x._2))

      private val iterator = regionHandleMap.iterator

      override def hasNext: Boolean = {
        // Kill the task in case it has been marked as killed.
        if (context.isInterrupted()) {
          throw new TaskKilledException
        }
        iterator.hasNext
      }

      override def next(): InternalRow = {
        val next = iterator.next
        val regionId = next._1
        val handleList = next._2

        // Returns RegionId:[handle1, handle2, handle3...] K-V pair
        val sparkRow = Row.apply(regionId, handleList.toArray())
        TiUtil.rowToInternalRow(sparkRow, outputTypes, converters)
      }
    }
} 
Example 44
Source File: TiRowRDD.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.tispark

import com.pingcap.tikv._
import com.pingcap.tikv.columnar.TiColumnarBatchHelper
import com.pingcap.tikv.meta.TiDAGRequest
import com.pingcap.tispark.listener.CacheInvalidateListener
import com.pingcap.tispark.{TiPartition, TiTableReference}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.vectorized.ColumnarBatch
import org.apache.spark.{Partition, TaskContext, TaskKilledException}
import org.slf4j.Logger

import scala.collection.JavaConversions._

class TiRowRDD(
    override val dagRequest: TiDAGRequest,
    override val physicalId: Long,
    val chunkBatchSize: Int,
    override val tiConf: TiConfiguration,
    val output: Seq[Attribute],
    override val tableRef: TiTableReference,
    @transient private val session: TiSession,
    @transient private val sparkSession: SparkSession)
    extends TiRDD(dagRequest, physicalId, tiConf, tableRef, session, sparkSession) {

  protected val logger: Logger = log

  // cache invalidation call back function
  // used for driver to update PD cache
  private val callBackFunc = CacheInvalidateListener.getInstance()

  override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] =
    new Iterator[ColumnarBatch] {
      checkTimezone()

      private val tiPartition = split.asInstanceOf[TiPartition]
      private val session = TiSession.getInstance(tiConf)
      session.injectCallBackFunc(callBackFunc)
      private val snapshot = session.createSnapshot(dagRequest.getStartTs)
      private[this] val tasks = tiPartition.tasks

      private val iterator =
        snapshot.tableReadChunk(dagRequest, tasks, chunkBatchSize)

      override def hasNext: Boolean = {
        // Kill the task in case it has been marked as killed. This logic is from
        // InterruptibleIterator, but we inline it here instead of wrapping the iterator in order
        // to avoid performance overhead.
        if (context.isInterrupted()) {
          throw new TaskKilledException
        }
        iterator.hasNext
      }

      override def next(): ColumnarBatch = {
        TiColumnarBatchHelper.createColumnarBatch(iterator.next)
      }
    }.asInstanceOf[Iterator[InternalRow]]

} 
Example 45
Source File: TiRDD.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.tispark

import com.pingcap.tikv._
import com.pingcap.tikv.exception.TiInternalException
import com.pingcap.tikv.meta.TiDAGRequest
import com.pingcap.tikv.types.Converter
import com.pingcap.tikv.util.RangeSplitter
import com.pingcap.tikv.util.RangeSplitter.RegionTask
import com.pingcap.tispark.{TiPartition, TiTableReference}
import org.apache.spark.Partition
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow

import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.collection.mutable.ListBuffer

abstract class TiRDD(
    val dagRequest: TiDAGRequest,
    val physicalId: Long,
    val tiConf: TiConfiguration,
    val tableRef: TiTableReference,
    @transient private val session: TiSession,
    @transient private val sparkSession: SparkSession)
    extends RDD[InternalRow](sparkSession.sparkContext, Nil) {

  private lazy val partitionPerSplit = tiConf.getPartitionPerSplit

  protected def checkTimezone(): Unit = {
    if (!tiConf.getLocalTimeZone.equals(Converter.getLocalTimezone)) {
      throw new TiInternalException(
        "timezones are different! driver: " + tiConf.getLocalTimeZone + " executor: " + Converter.getLocalTimezone +
          "; please set user.timezone in spark.driver.extraJavaOptions and spark.executor.extraJavaOptions")
    }
  }

  override protected def getPartitions: Array[Partition] = {
    val keyWithRegionTasks = RangeSplitter
      .newSplitter(session.getRegionManager)
      .splitRangeByRegion(dagRequest.getRangesByPhysicalId(physicalId), dagRequest.getStoreType)

    val hostTasksMap = new mutable.HashMap[String, mutable.Set[RegionTask]]
      with mutable.MultiMap[String, RegionTask]

    var index = 0
    val result = new ListBuffer[TiPartition]
    for (task <- keyWithRegionTasks) {
      hostTasksMap.addBinding(task.getHost, task)
      val tasks = hostTasksMap(task.getHost)
      if (tasks.size >= partitionPerSplit) {
        result.append(new TiPartition(index, tasks.toSeq, sparkContext.applicationId))
        index += 1
        hostTasksMap.remove(task.getHost)
      }

    }
    // add rest
    for (tasks <- hostTasksMap.values) {
      result.append(new TiPartition(index, tasks.toSeq, sparkContext.applicationId))
      index += 1
    }
    result.toArray
  }

  override protected def getPreferredLocations(split: Partition): Seq[String] =
    split.asInstanceOf[TiPartition].tasks.head.getHost :: Nil
} 
Example 46
Source File: RedisSourceRdd.scala    From spark-redis   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package org.apache.spark.sql.redis.stream

import com.redislabs.provider.redis.RedisConfig
import com.redislabs.provider.redis.util.ConnectionUtils.withConnection
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.redis.stream.RedisSourceTypes.StreamEntry
import org.apache.spark.{Partition, SparkContext, TaskContext}


class RedisSourceRdd(sc: SparkContext, redisConfig: RedisConfig,
                     offsetRanges: Seq[RedisSourceOffsetRange], autoAck: Boolean = true)
  extends RDD[StreamEntry](sc, Nil) {

  override def compute(split: Partition, context: TaskContext): Iterator[StreamEntry] = {
    val partition = split.asInstanceOf[RedisSourceRddPartition]
    val offsetRange = partition.offsetRange
    val streamReader = new RedisStreamReader(redisConfig)
    streamReader.unreadStreamEntries(offsetRange)
  }

  override protected def getPartitions: Array[Partition] = {
    offsetRanges.zipWithIndex.map { case (e, i) => RedisSourceRddPartition(i, e) }
      .toArray
  }
}

case class RedisSourceRddPartition(index: Int, offsetRange: RedisSourceOffsetRange)
  extends Partition 
Example 47
Source File: MultiZippedPartitionRDD.scala    From spark-vlbfgs   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.apache.spark.{Partition, SparkContext, TaskContext}

import scala.reflect.ClassTag

private[spark] class MultiZippedPartitionsRDD[A: ClassTag, V: ClassTag](
    sc: SparkContext,
    var f: (List[Iterator[A]]) => Iterator[V],
    var rddList: List[RDD[A]],
    preservesPartitioning: Boolean = false)
  extends ZippedPartitionsBaseRDD[V](sc, rddList, preservesPartitioning) {

  override def compute(s: Partition, context: TaskContext): Iterator[V] = {
    val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions
    val iterList = rddList.zipWithIndex.map{ case (rdd: RDD[A], index: Int) =>
      rdd.iterator(partitions(index), context)
    }
    f(iterList)
  }

  override def clearDependencies() {
    super.clearDependencies()
    rddList = null
    f = null
  }
} 
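A sketch of how the zip function sees one iterator per parent, partition by partition, assuming two parents with the same partition count; since the class is private[spark], calling code like this would have to live under org.apache.spark.rdd:

val a = sc.parallelize(1 to 6, 3)
val b = sc.parallelize(10 to 60 by 10, 3)
val summed = new MultiZippedPartitionsRDD[Int, Int](
  sc,
  (iters: List[Iterator[Int]]) => iters.reduce((x, y) => x.zip(y).map(t => t._1 + t._2)),
  List(a, b))
summed.collect()  // Array(11, 22, 33, 44, 55, 66)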
Example 48
Source File: DataSourceRDD.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.v2

import scala.reflect.ClassTag

import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.v2.reader.InputPartition

class DataSourceRDDPartition[T : ClassTag](val index: Int, val inputPartition: InputPartition[T])
  extends Partition with Serializable

class DataSourceRDD[T: ClassTag](
    sc: SparkContext,
    @transient private val inputPartitions: Seq[InputPartition[T]])
  extends RDD[T](sc, Nil) {

  override protected def getPartitions: Array[Partition] = {
    inputPartitions.zipWithIndex.map {
      case (inputPartition, index) => new DataSourceRDDPartition(index, inputPartition)
    }.toArray
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    val reader = split.asInstanceOf[DataSourceRDDPartition[T]].inputPartition
        .createPartitionReader()
    context.addTaskCompletionListener[Unit](_ => reader.close())
    val iter = new Iterator[T] {
      private[this] var valuePrepared = false

      override def hasNext: Boolean = {
        if (!valuePrepared) {
          valuePrepared = reader.next()
        }
        valuePrepared
      }

      override def next(): T = {
        if (!hasNext) {
          throw new java.util.NoSuchElementException("End of stream")
        }
        valuePrepared = false
        reader.get()
      }
    }
    new InterruptibleIterator(context, iter)
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    split.asInstanceOf[DataSourceRDDPartition[T]].inputPartition.preferredLocations()
  }
} 
Example 49
Source File: StateStoreRDD.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.streaming.state

import java.util.UUID

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.EpochTracker
import org.apache.spark.sql.internal.SessionState
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration


class StateStoreRDD[T: ClassTag, U: ClassTag](
    dataRDD: RDD[T],
    storeUpdateFunction: (StateStore, Iterator[T]) => Iterator[U],
    checkpointLocation: String,
    queryRunId: UUID,
    operatorId: Long,
    storeVersion: Long,
    keySchema: StructType,
    valueSchema: StructType,
    indexOrdinal: Option[Int],
    sessionState: SessionState,
    @transient private val storeCoordinator: Option[StateStoreCoordinatorRef])
  extends RDD[U](dataRDD) {

  private val storeConf = new StateStoreConf(sessionState.conf)

  // A Hadoop Configuration can be about 10 KB, which is pretty big, so broadcast it
  private val hadoopConfBroadcast = dataRDD.context.broadcast(
    new SerializableConfiguration(sessionState.newHadoopConf()))

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val stateStoreProviderId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index),
      queryRunId)
    storeCoordinator.flatMap(_.getLocation(stateStoreProviderId)).toSeq
  }

  override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = {
    var store: StateStore = null
    val storeProviderId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index),
      queryRunId)

    // If we're in continuous processing mode, we should get the store version for the current
    // epoch rather than the one at planning time.
    val isContinuous = Option(ctxt.getLocalProperty(StreamExecution.IS_CONTINUOUS_PROCESSING))
      .map(_.toBoolean).getOrElse(false)
    val currentVersion = if (isContinuous) {
      val epoch = EpochTracker.getCurrentEpoch
      assert(epoch.isDefined, "Current epoch must be defined for continuous processing streams.")
      epoch.get
    } else {
      storeVersion
    }

    store = StateStore.get(
      storeProviderId, keySchema, valueSchema, indexOrdinal, currentVersion,
      storeConf, hadoopConfBroadcast.value.value)
    val inputIter = dataRDD.iterator(partition, ctxt)
    storeUpdateFunction(store, inputIter)
  }
} 
Example 50
Source File: ContinuousWriteRDD.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.streaming.continuous

import org.apache.spark.{Partition, SparkEnv, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory}
import org.apache.spark.util.Utils


class ContinuousWriteRDD(var prev: RDD[InternalRow], writeTask: DataWriterFactory[InternalRow])
    extends RDD[Unit](prev) {

  override val partitioner = prev.partitioner

  override def getPartitions: Array[Partition] = prev.partitions

  override def compute(split: Partition, context: TaskContext): Iterator[Unit] = {
    val epochCoordinator = EpochCoordinatorRef.get(
      context.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY),
      SparkEnv.get)
    EpochTracker.initializeCurrentEpoch(
      context.getLocalProperty(ContinuousExecution.START_EPOCH_KEY).toLong)
    while (!context.isInterrupted() && !context.isCompleted()) {
      var dataWriter: DataWriter[InternalRow] = null
      // write the data and commit this writer.
      Utils.tryWithSafeFinallyAndFailureCallbacks(block = {
        try {
          val dataIterator = prev.compute(split, context)
          dataWriter = writeTask.createDataWriter(
            context.partitionId(),
            context.taskAttemptId(),
            EpochTracker.getCurrentEpoch.get)
          while (dataIterator.hasNext) {
            dataWriter.write(dataIterator.next())
          }
          logInfo(s"Writer for partition ${context.partitionId()} " +
            s"in epoch ${EpochTracker.getCurrentEpoch.get} is committing.")
          val msg = dataWriter.commit()
          epochCoordinator.send(
            CommitPartitionEpoch(
              context.partitionId(),
              EpochTracker.getCurrentEpoch.get,
              msg)
          )
          logInfo(s"Writer for partition ${context.partitionId()} " +
            s"in epoch ${EpochTracker.getCurrentEpoch.get} committed.")
          EpochTracker.incrementCurrentEpoch()
        } catch {
          case _: InterruptedException =>
          // Continuous shutdown always involves an interrupt. Just finish the task.
        }
      })(catchBlock = {
        // If there is an error, abort this writer. We enter this callback in the middle of
        // rethrowing an exception, so compute() will stop executing at this point.
        logError(s"Writer for partition ${context.partitionId()} is aborting.")
        if (dataWriter != null) dataWriter.abort()
        logError(s"Writer for partition ${context.partitionId()} aborted.")
      })
    }

    Iterator()
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
} 
Example 51
Source File: ContinuousShuffleReadRDD.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.streaming.continuous.shuffle

import java.util.UUID

import org.apache.spark.{Partition, SparkContext, SparkEnv, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.rpc.RpcAddress
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.NextIterator

case class ContinuousShuffleReadPartition(
      index: Int,
      endpointName: String,
      queueSize: Int,
      numShuffleWriters: Int,
      epochIntervalMs: Long)
    extends Partition {
  // Initialized only on the executor, and only once even as we call compute() multiple times.
  lazy val (reader: ContinuousShuffleReader, endpoint) = {
    val env = SparkEnv.get.rpcEnv
    val receiver = new RPCContinuousShuffleReader(
      queueSize, numShuffleWriters, epochIntervalMs, env)
    val endpoint = env.setupEndpoint(endpointName, receiver)

    TaskContext.get().addTaskCompletionListener[Unit] { ctx =>
      env.stop(endpoint)
    }
    (receiver, endpoint)
  }
}


class ContinuousShuffleReadRDD(
    sc: SparkContext,
    numPartitions: Int,
    queueSize: Int = 1024,
    numShuffleWriters: Int = 1,
    epochIntervalMs: Long = 1000,
    val endpointNames: Seq[String] = Seq(s"RPCContinuousShuffleReader-${UUID.randomUUID()}"))
  extends RDD[UnsafeRow](sc, Nil) {

  override protected def getPartitions: Array[Partition] = {
    (0 until numPartitions).map { partIndex =>
      ContinuousShuffleReadPartition(
        partIndex, endpointNames(partIndex), queueSize, numShuffleWriters, epochIntervalMs)
    }.toArray
  }

  override def compute(split: Partition, context: TaskContext): Iterator[UnsafeRow] = {
    split.asInstanceOf[ContinuousShuffleReadPartition].reader.read()
  }
} 
Example 52
Source File: AsHadoopPartition.scala    From spark-util   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.apache.spark.Partition
import org.hammerlab.hadoop.splits.FileSplit
import org.apache.hadoop.mapreduce.lib.input
import org.apache.hadoop.mapred


object AsHadoopPartition {
  def apply(partition: Partition): HadoopPartition =
    partition.asInstanceOf[HadoopPartition]
}

object AsNewHadoopPartition {
  def apply(partition: Partition): NewHadoopPartition =
    partition.asInstanceOf[NewHadoopPartition]
}

object GetFileSplit {
  def apply(partition: Partition): FileSplit =
    partition match {
      case p: HadoopPartition ⇒
        FileSplit(
          p
            .inputSplit
            .value
            .asInstanceOf[mapred.FileSplit]
        )
      case p: NewHadoopPartition ⇒
        FileSplit(
          p
            .serializableHadoopSplit
            .value
            .asInstanceOf[input.FileSplit]
        )
      case _ ⇒
        throw NonHadoopPartition(partition)
    }
}

case class NonHadoopPartition(p: Partition)
  extends IllegalArgumentException 
Example 53
Source File: EventHubsRDD.scala    From azure-event-hubs-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.eventhubs.rdd

import com.microsoft.azure.eventhubs.EventData
import org.apache.spark.eventhubs.EventHubsConf
import org.apache.spark.eventhubs.client.CachedEventHubsReceiver
import org.apache.spark.eventhubs.utils.SimulatedCachedReceiver
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.{ Partition, SparkContext, TaskContext }


private[spark] class EventHubsRDD(sc: SparkContext,
                                  val ehConf: EventHubsConf,
                                  val offsetRanges: Array[OffsetRange])
    extends RDD[EventData](sc, Nil)
    with Logging
    with HasOffsetRanges {

  override def getPartitions: Array[Partition] =
    offsetRanges
      .sortBy(_.partitionId)
      .map(
        o =>
          new EventHubsRDDPartition(
            o.partitionId,
            o.nameAndPartition,
            o.fromSeqNo,
            o.untilSeqNo,
            o.preferredLoc
        ))

  override def count: Long = offsetRanges.map(_.count).sum

  override def isEmpty(): Boolean = count == 0L

  override def take(num: Int): Array[EventData] = {
    val nonEmptyPartitions =
      this.partitions.map(_.asInstanceOf[EventHubsRDDPartition]).filter(_.count > 0)

    if (num < 1 || nonEmptyPartitions.isEmpty) {
      return Array()
    }

    val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) =>
      val remain = num - result.values.sum
      if (remain > 0) {
        val taken = Math.min(remain, part.count)
        result + (part.index -> taken.toInt)

      } else {
        result
      }
    }

    context
      .runJob(
        this,
        (tc: TaskContext, it: Iterator[EventData]) => it.take(parts(tc.partitionId)).toArray,
        parts.keys.toArray
      )
      .flatten
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    val part = split.asInstanceOf[EventHubsRDDPartition]
    part.preferredLoc.map(Seq(_)).getOrElse(Seq.empty)
  }

  private def errBeginAfterEnd(part: EventHubsRDDPartition): String =
    s"The beginning sequence number ${part.fromSeqNo} is larger than the ending " +
      s"sequence number ${part.untilSeqNo} for EventHubs ${part.name} on partition " +
      s"${part.partitionId}."

  override def compute(partition: Partition, context: TaskContext): Iterator[EventData] = {
    val part = partition.asInstanceOf[EventHubsRDDPartition]
    assert(part.fromSeqNo <= part.untilSeqNo, errBeginAfterEnd(part))

    if (part.fromSeqNo == part.untilSeqNo) {
      logInfo(
        s"(TID ${context.taskAttemptId()}) Beginning sequence number ${part.fromSeqNo} is equal to the ending sequence " +
          s"number ${part.untilSeqNo}. Returning empty partition for EH: ${part.name} " +
          s"on partition: ${part.partitionId}")
      Iterator.empty
    } else {
      logInfo(
        s"(TID ${context.taskAttemptId()}) Computing EventHubs ${part.name}, partition ${part.partitionId} " +
          s"sequence numbers ${part.fromSeqNo} => ${part.untilSeqNo}")
      val cachedReceiver = if (ehConf.useSimulatedClient) {
        SimulatedCachedReceiver
      } else {
        CachedEventHubsReceiver
      }
      cachedReceiver.receive(ehConf,
                             part.nameAndPartition,
                             part.fromSeqNo,
                             (part.untilSeqNo - part.fromSeqNo).toInt)
    }
  }
} 
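The foldLeft in take(num) hands each non-empty partition only the remainder still needed. Replaying it in plain Scala with partition counts (3, 5) and num = 6:

val counts = Seq(0 -> 3L, 1 -> 5L)  // (partition index, event count)
val num = 6
val parts = counts.foldLeft(Map[Int, Int]()) { case (result, (idx, count)) =>
  val remain = num - result.values.sum
  if (remain > 0) result + (idx -> math.min(remain, count).toInt) else result
}
// parts == Map(0 -> 3, 1 -> 3); only these partitions are scheduled by runJob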
Example 54
Source File: SequoiadbRDD.scala    From spark-sequoiadb   with Apache License 2.0 5 votes vote down vote up
package com.sequoiadb.spark.rdd

import org.apache.spark.SparkContext
import _root_.com.sequoiadb.spark.SequoiadbConfig
import com.sequoiadb.spark.partitioner._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.Filter
import org.apache.spark.{Partition, TaskContext}
import org.bson.BSONObject
import org.slf4j.{Logger, LoggerFactory}
import scala.collection.mutable.ArrayBuffer
//import java.io.FileOutputStream;  


object SequoiadbRDD {
  // The enclosing object declaration was dropped in extraction and is restored here;
  // the SequoiadbRDD class that this apply constructs is not shown in this excerpt.
  def apply (
    sc: SQLContext,
    config: SequoiadbConfig,
    partitioner: Option[SequoiadbPartitioner] = None,
    requiredColumns: Array[String] = Array(),
    filters: Array[Filter] = Array(),
    queryReturnType: Int = SequoiadbConfig.QUERYRETURNBSON,
    queryLimit: Long = -1) = {
    new SequoiadbRDD ( sc.sparkContext, config, partitioner,
      requiredColumns, filters, queryReturnType, queryLimit)
  }
} 
Example 55
Source File: SlidingRDD.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.rdd

import scala.collection.mutable
import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD

private[mllib]
class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T], val offset: Int)
  extends Partition with Serializable {
  override val index: Int = idx
}


private[mllib]
class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int, val step: Int)
  extends RDD[Array[T]](parent) {

  require(windowSize > 0 && step > 0 && !(windowSize == 1 && step == 1),
    "Window size and step must be greater than 0, " +
      s"and they cannot be both 1, but got windowSize = $windowSize and step = $step.")

  override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = {
    val part = split.asInstanceOf[SlidingRDDPartition[T]]
    (firstParent[T].iterator(part.prev, context) ++ part.tail)
      .drop(part.offset)
      .sliding(windowSize, step)
      .withPartial(false)
      .map(_.toArray)
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev)

  override def getPartitions: Array[Partition] = {
    val parentPartitions = parent.partitions
    val n = parentPartitions.length
    if (n == 0) {
      Array.empty
    } else if (n == 1) {
      Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty, 0))
    } else {
      val w1 = windowSize - 1
      // Get partition sizes and first w1 elements.
      val (sizes, heads) = parent.mapPartitions { iter =>
        val w1Array = iter.take(w1).toArray
        Iterator.single((w1Array.length + iter.length, w1Array))
      }.collect().unzip
      val partitions = mutable.ArrayBuffer.empty[SlidingRDDPartition[T]]
      var i = 0
      var cumSize = 0
      var partitionIndex = 0
      while (i < n) {
        val mod = cumSize % step
        val offset = if (mod == 0) 0 else step - mod
        val size = sizes(i)
        if (offset < size) {
          val tail = mutable.ListBuffer.empty[T]
          // Keep appending to the current tail until it has w1 elements.
          var j = i + 1
          while (j < n && tail.length < w1) {
            tail ++= heads(j).take(w1 - tail.length)
            j += 1
          }
          if (sizes(i) + tail.length >= offset + windowSize) {
            partitions +=
              new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail, offset)
            partitionIndex += 1
          }
        }
        cumSize += size
        i += 1
      }
      partitions.toArray
    }
  }

  // TODO: Override methods such as aggregate, which only requires one Spark job.
} 
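MLlib exposes SlidingRDD through RDDFunctions.sliding, which is handy for windowed statistics over ordered data:

import org.apache.spark.mllib.rdd.RDDFunctions._
val xs = sc.parallelize(1 to 6, 2)
xs.sliding(3).collect()
// Array(Array(1, 2, 3), Array(2, 3, 4), Array(3, 4, 5), Array(4, 5, 6))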
Example 56
Source File: StateStoreRDD.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.streaming.state

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.internal.SessionState
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration


class StateStoreRDD[T: ClassTag, U: ClassTag](
    dataRDD: RDD[T],
    storeUpdateFunction: (StateStore, Iterator[T]) => Iterator[U],
    checkpointLocation: String,
    operatorId: Long,
    storeVersion: Long,
    keySchema: StructType,
    valueSchema: StructType,
    sessionState: SessionState,
    @transient private val storeCoordinator: Option[StateStoreCoordinatorRef])
  extends RDD[U](dataRDD) {

  private val storeConf = new StateStoreConf(sessionState.conf)

  // A Hadoop Configuration can be about 10 KB, which is pretty big, so broadcast it
  private val confBroadcast = dataRDD.context.broadcast(
    new SerializableConfiguration(sessionState.newHadoopConf()))

  override protected def getPartitions: Array[Partition] = dataRDD.partitions

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val storeId = StateStoreId(checkpointLocation, operatorId, partition.index)
    storeCoordinator.flatMap(_.getLocation(storeId)).toSeq
  }

  override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = {
    var store: StateStore = null
    val storeId = StateStoreId(checkpointLocation, operatorId, partition.index)
    store = StateStore.get(
      storeId, keySchema, valueSchema, storeVersion, storeConf, confBroadcast.value.value)
    val inputIter = dataRDD.iterator(partition, ctxt)
    storeUpdateFunction(store, inputIter)
  }
} 
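The storeUpdateFunction supplied to StateStoreRDD receives the partition's StateStore together with the input iterator and produces the output iterator. A minimal sketch of such a function, assuming the Spark 2.1-era StateStore API (UnsafeRow keys and values, commit() returning the new store version); illustrative only, not code from this project:

import org.apache.spark.sql.catalyst.expressions.UnsafeRow

val updateFn: (StateStore, Iterator[(UnsafeRow, UnsafeRow)]) => Iterator[Long] =
  (store, rows) => {
    rows.foreach { case (key, value) => store.put(key, value) }  // stage the updates
    Iterator.single(store.commit())  // persist them as the next store version
  }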
Example 57
Source File: WholeTextFileRDD.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat


private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
} 
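WholeTextFileRDD is the RDD behind SparkContext.wholeTextFiles, which emits one (path, contents) pair per file instead of one record per line. A short sketch (the path is a placeholder):

val files = sc.wholeTextFiles("hdfs:///data/docs", minPartitions = 4)
files.map { case (path, contents) => (path, contents.length) }.collect()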
Example 58
Source File: ZippedWithIndexRDD.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.Utils

private[spark]
class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark]
class ZippedWithIndexRDD[T: ClassTag](prev: RDD[T]) extends RDD[(T, Long)](prev) {

  /** The start index of each partition. */
  @transient private val startIndices: Array[Long] = {
    val n = prev.partitions.length
    if (n == 0) {
      Array.empty
    } else if (n == 1) {
      Array(0L)
    } else {
      prev.context.runJob(
        prev,
        Utils.getIteratorSize _,
        0 until n - 1 // do not need to count the last partition
      ).scanLeft(0L)(_ + _)
    }
  }

  override def getPartitions: Array[Partition] = {
    firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index)))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = {
    val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition]
    val parentIter = firstParent[T].iterator(split.prev, context)
    Utils.getIteratorZipWithIndex(parentIter, split.startIndex)
  }
} 
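This RDD backs RDD.zipWithIndex, which assigns contiguous Long indices across partitions; that is why startIndices above runs an extra job to count every partition except the last. Sketch:

val indexed = sc.parallelize(Seq("a", "b", "c"), 2).zipWithIndex()
indexed.collect()  // Array((a,0), (b,1), (c,2))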
Example 59
Source File: UnionRDD.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
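UnionRDD is what SparkContext.union (and the ++ operator) builds when the inputs do not all share one partitioner; partition listing switches to the parallel ForkJoin pool once the input count exceeds spark.rdd.parallelListingThreshold. Sketch:

val a = sc.parallelize(1 to 3)
val b = sc.parallelize(4 to 6)
(a ++ b).count()  // 6; equivalent to sc.union(a, b)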
Example 60
Source File: PartitionwiseSampledRDD.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
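PartitionwiseSampledRDD underlies RDD.sample: each partition receives its own sampler clone seeded from the RDD-level seed, so a fixed seed gives reproducible samples without a shuffle. Sketch:

val sampled = sc.parallelize(1 to 1000, 4)
  .sample(withReplacement = false, fraction = 0.1, seed = 42L)
sampled.count()  // roughly 100, deterministic for the given seed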
Example 61
Source File: PartitionerAwareUnionRDD.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils


private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.nonEmpty)
  require(rdds.forall(_.partitioner.isDefined))
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  override val partitioner = rdds.head.partitioner

  override def getPartitions: Array[Partition] = {
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map { index =>
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }.toArray
  }

  // Get the location where most of the partitions of parent RDDs are located
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    val locations = rdds.zip(parentPartitions).flatMap {
      case (rdd, part) =>
        val parentLocations = currPrefLocs(rdd, part)
        logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations)
        parentLocations
    }
    val location = if (locations.isEmpty) {
      None
    } else {
      // Find the location that maximum number of parent partitions prefer
      Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
    }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host)
  }
} 
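SparkContext.union prefers this RDD over the plain UnionRDD when every input defines the same partitioner, so the result stays co-partitioned and later joins avoid a shuffle. Sketch:

import org.apache.spark.HashPartitioner

val p = new HashPartitioner(4)
val x = sc.parallelize(Seq(1 -> "a")).partitionBy(p)
val y = sc.parallelize(Seq(2 -> "b")).partitionBy(p)
sc.union(x, y).partitioner  // Some(p): partitioning is preserved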
Example 62
Source File: BinaryFileRDD.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.StreamFileInputFormat

private[spark] class BinaryFileRDD[T](
    @transient private val sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(sc, jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
} 
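BinaryFileRDD backs SparkContext.binaryFiles, which pairs each file path with a PortableDataStream so the bytes are read lazily on the executors. Sketch:

val bins = sc.binaryFiles("hdfs:///data/images")  // RDD[(String, PortableDataStream)]
bins.mapValues(_.toArray().length).collect()  // byte size per file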
Example 63
Source File: PartitionPruningRDDSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
} 
Example 64
Source File: SampledRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.PoissonDistribution

import org.apache.spark.{Partition, TaskContext}

@deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0")
private[spark]
class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable {
  override val index: Int = prev.index
}

@deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0")
private[spark] class SampledRDD[T: ClassTag](
    prev: RDD[T],
    withReplacement: Boolean,
    frac: Double,
    seed: Int)
  extends RDD[T](prev) {

  override def getPartitions: Array[Partition] = {
    val rg = new Random(seed)
    firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = {
    val split = splitIn.asInstanceOf[SampledRDDPartition]
    if (withReplacement) {
      // For large datasets, the expected number of occurrences of each element in a sample with
      // replacement is Poisson(frac). We use that to get a count for each element.
      val poisson = new PoissonDistribution(frac)
      poisson.reseedRandomGenerator(split.seed)

      firstParent[T].iterator(split.prev, context).flatMap { element =>
        val count = poisson.sample()
        if (count == 0) {
          Iterator.empty  // Avoid object allocation when we return 0 items, which is quite often
        } else {
          Iterator.fill(count)(element)
        }
      }
    } else { // Sampling without replacement
      val rand = new Random(split.seed)
      firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac))
    }
  }
} 
Example 65
Source File: SubtractedRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.Dependency
import org.apache.spark.OneToOneDependency
import org.apache.spark.Partition
import org.apache.spark.Partitioner
import org.apache.spark.ShuffleDependency
import org.apache.spark.SparkEnv
import org.apache.spark.TaskContext
import org.apache.spark.serializer.Serializer


private[spark] class SubtractedRDD[K: ClassTag, V: ClassTag, W: ClassTag](
    @transient var rdd1: RDD[_ <: Product2[K, V]],
    @transient var rdd2: RDD[_ <: Product2[K, W]],
    part: Partitioner)
  extends RDD[(K, V)](rdd1.context, Nil) {

  private var serializer: Option[Serializer] = None

  /** Set a serializer for this RDD's shuffle, or null to use the default (spark.serializer). */
  def setSerializer(serializer: Serializer): SubtractedRDD[K, V, W] = {
    this.serializer = Option(serializer)
    this
  }

  override def getDependencies: Seq[Dependency[_]] = {
    Seq(rdd1, rdd2).map { rdd =>
      if (rdd.partitioner == Some(part)) {
        logDebug("Adding one-to-one dependency with " + rdd)
        new OneToOneDependency(rdd)
      } else {
        logDebug("Adding shuffle dependency with " + rdd)
        new ShuffleDependency(rdd, part, serializer)
      }
    }
  }

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](part.numPartitions)
    for (i <- 0 until array.size) {
      // Each CoGroupPartition will depend on rdd1 and rdd2
      array(i) = new CoGroupPartition(i, Seq(rdd1, rdd2).zipWithIndex.map { case (rdd, j) =>
        dependencies(j) match {
          case s: ShuffleDependency[_, _, _] =>
            new ShuffleCoGroupSplitDep(s.shuffleHandle)
          case _ =>
            new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i))
        }
      }.toArray)
    }
    array
  }

  override val partitioner = Some(part)

  override def compute(p: Partition, context: TaskContext): Iterator[(K, V)] = {
    val partition = p.asInstanceOf[CoGroupPartition]
    val map = new JHashMap[K, ArrayBuffer[V]]
    def getSeq(k: K): ArrayBuffer[V] = {
      val seq = map.get(k)
      if (seq != null) {
        seq
      } else {
        val seq = new ArrayBuffer[V]()
        map.put(k, seq)
        seq
      }
    }
    def integrate(dep: CoGroupSplitDep, op: Product2[K, V] => Unit) = dep match {
      case NarrowCoGroupSplitDep(rdd, _, itsSplit) =>
        rdd.iterator(itsSplit, context).asInstanceOf[Iterator[Product2[K, V]]].foreach(op)

      case ShuffleCoGroupSplitDep(handle) =>
        val iter = SparkEnv.get.shuffleManager
          .getReader(handle, partition.index, partition.index + 1, context)
          .read()
        iter.foreach(op)
    }
    // the first dep is rdd1; add all values to the map
    integrate(partition.deps(0), t => getSeq(t._1) += t._2)
    // the second dep is rdd2; remove all of its keys
    integrate(partition.deps(1), t => map.remove(t._1))
    map.iterator.map { t =>  t._2.iterator.map { (t._1, _) } }.flatten
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
  }

} 
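SubtractedRDD implements PairRDDFunctions.subtractByKey: values from rdd1 survive only for keys absent from rdd2, and getDependencies above shuffles a side only when it is not already partitioned by part. Sketch:

val left  = sc.parallelize(Seq(1 -> "a", 2 -> "b", 3 -> "c"))
val right = sc.parallelize(Seq(2 -> true))
left.subtractByKey(right).collect()  // Array((1,a), (3,c))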
Example 66
Source File: ZippedWithIndexRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.Utils

private[spark]
class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark]
class ZippedWithIndexRDD[T: ClassTag](@transient prev: RDD[T]) extends RDD[(T, Long)](prev) {

  /** The start index of each partition. */
  @transient private val startIndices: Array[Long] = {
    val n = prev.partitions.size
    if (n == 0) {
      Array[Long]()
    } else if (n == 1) {
      Array(0L)
    } else {
      prev.context.runJob(
        prev,
        Utils.getIteratorSize _,
        0 until n - 1, // do not need to count the last partition
        allowLocal = false
      ).scanLeft(0L)(_ + _)
    }
  }

  override def getPartitions: Array[Partition] = {
    firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index)))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = {
    val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition]
    firstParent[T].iterator(split.prev, context).zipWithIndex.map { x =>
      (x._1, split.startIndex + x._2)
    }
  }
} 
Example 67
Source File: UnionRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient rdd: RDD[T],
    val parentRddIndex: Int,
    @transient parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations() = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](rdds.map(_.partitions.size).sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.size)
      pos += rdd.partitions.size
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 68
Source File: PartitionwiseSampledRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    @transient preservesPartitioning: Boolean,
    @transient seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
Example 69
Source File: PartitionerAwareUnionRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils


private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.length > 0)
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  override val partitioner = rdds.head.partitioner

  override def getPartitions: Array[Partition] = {
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map(index => {
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }).toArray
  }

  // Get the location where most of the partitions of parent RDDs are located
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    val locations = rdds.zip(parentPartitions).flatMap {
      case (rdd, part) => {
        val parentLocations = currPrefLocs(rdd, part)
        logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations)
        parentLocations
      }
    }
    val location = if (locations.isEmpty) {
      None
    } else  {
      // Find the location that maximum number of parent partitions prefer
      Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
    }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host)
  }
} 
Example 70
Source File: BinaryFileRDD.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.apache.hadoop.conf.{ Configurable, Configuration }
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._
import org.apache.spark.input.StreamFileInputFormat
import org.apache.spark.{ Partition, SparkContext }

private[spark] class BinaryFileRDD[T](
    sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    @transient conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = newJobContext(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
} 
Example 71
Source File: PartitionPruningRDDSuite.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.scalatest.FunSuite

import org.apache.spark.{Partition, SharedSparkContext, TaskContext}

class PartitionPruningRDDSuite extends FunSuite with SharedSparkContext {


  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index = i

  def testValue = this.value

} 
Example 72
Source File: CarbonDropPartitionRDD.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.spark.rdd

import java.util

import scala.collection.JavaConverters._

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.sql.SparkSession

import org.apache.carbondata.core.index.Segment
import org.apache.carbondata.core.indexstore.PartitionSpec
import org.apache.carbondata.core.metadata.SegmentFileStore

case class CarbonDropPartition(rddId: Int, idx: Int, segment: Segment)
  extends Partition {

  override val index: Int = idx

  override def hashCode(): Int = 41 * (41 + rddId) + idx
}


class CarbonDropPartitionRDD(
    @transient private val ss: SparkSession,
    tablePath: String,
    segments: Seq[Segment],
    partitions: util.List[PartitionSpec],
    uniqueId: String)
  extends CarbonRDD[(String, String)](ss, Nil) {

  override def internalGetPartitions: Array[Partition] = {
    segments.zipWithIndex.map {s =>
      CarbonDropPartition(id, s._2, s._1)
    }.toArray
  }

  override def internalCompute(
      theSplit: Partition,
      context: TaskContext): Iterator[(String, String)] = {
    val iter = new Iterator[(String, String)] {
      val split = theSplit.asInstanceOf[CarbonDropPartition]
      logInfo("Dropping partition information from : " + split.segment)
      val toBeDeletedSegments = new util.ArrayList[String]()
      val toBeUpdateSegments = new util.ArrayList[String]()
      new SegmentFileStore(
        tablePath,
        split.segment.getSegmentFileName).dropPartitions(
        split.segment,
        partitions,
        uniqueId,
        toBeDeletedSegments,
        toBeUpdateSegments)

      var finished = false

      override def hasNext: Boolean = {
        !finished
      }

      override def next(): (String, String) = {
        finished = true
        (toBeUpdateSegments.asScala.mkString(","), toBeDeletedSegments.asScala.mkString(","))
      }
    }
    iter
  }

} 
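A hypothetical driver-side invocation (the surrounding command wiring is assumed, not shown here): the job yields one (segmentsToUpdate, segmentsToDelete) pair per segment, which the caller can then apply to the table status.

val results: Array[(String, String)] =
  new CarbonDropPartitionRDD(sparkSession, tablePath, segments, partitionSpecs, uniqueId).collect()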
Example 73
Source File: CarbonDeltaRowScanRDD.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.spark.rdd

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.conf.Configuration
import org.apache.spark.Partition
import org.apache.spark.sql.SparkSession

import org.apache.carbondata.converter.SparkDataTypeConverterImpl
import org.apache.carbondata.core.index.IndexFilter
import org.apache.carbondata.core.indexstore.PartitionSpec
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier
import org.apache.carbondata.core.metadata.schema.table.{CarbonTable, TableInfo}
import org.apache.carbondata.core.mutate.CarbonUpdateUtil
import org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager
import org.apache.carbondata.core.util.DataTypeConverter
import org.apache.carbondata.hadoop.{CarbonMultiBlockSplit, CarbonProjection}
import org.apache.carbondata.hadoop.api.CarbonTableInputFormat
import org.apache.carbondata.hadoop.readsupport.CarbonReadSupport
import org.apache.carbondata.spark.InitInputMetrics


class CarbonDeltaRowScanRDD[T: ClassTag](
    @transient private val spark: SparkSession,
    @transient private val serializedTableInfo: Array[Byte],
    @transient private val tableInfo: TableInfo,
    @transient override val partitionNames: Seq[PartitionSpec],
    override val columnProjection: CarbonProjection,
    var filter: IndexFilter,
    identifier: AbsoluteTableIdentifier,
    inputMetricsStats: InitInputMetrics,
    override val dataTypeConverterClz: Class[_ <: DataTypeConverter] =
    classOf[SparkDataTypeConverterImpl],
    override val readSupportClz: Class[_ <: CarbonReadSupport[_]] =
    SparkReadSupport.readSupportClass,
    deltaVersionToRead: String) extends
  CarbonScanRDD[T](
    spark,
    columnProjection,
    filter,
    identifier,
    serializedTableInfo,
    tableInfo,
    inputMetricsStats,
    partitionNames,
    dataTypeConverterClz,
    readSupportClz) {
  override def internalGetPartitions: Array[Partition] = {
    val table = CarbonTable.buildFromTableInfo(getTableInfo)
    val updateStatusManager = new SegmentUpdateStatusManager(table, deltaVersionToRead)

    val parts = super.internalGetPartitions
    parts.map { p =>
      val partition = p.asInstanceOf[CarbonSparkPartition]
      val splits = partition.multiBlockSplit.getAllSplits.asScala.filter { s =>
        updateStatusManager.getDetailsForABlock(
          CarbonUpdateUtil.getSegmentBlockNameKey(s.getSegmentId,
            s.getBlockPath,
            table.isHivePartitionTable)) != null
      }.asJava
      new CarbonSparkPartition(partition.rddId, partition.index,
        new CarbonMultiBlockSplit(splits, partition.multiBlockSplit.getLocations))
    }.filter(p => p.multiBlockSplit.getAllSplits.size() > 0).zipWithIndex.map{ case (p, index) =>
      new CarbonSparkPartition(p.rddId, index, p.multiBlockSplit)
    }.asInstanceOf[Array[Partition]]
  }

  override def createInputFormat(conf: Configuration): CarbonTableInputFormat[Object] = {
    val format = super.createInputFormat(conf)
    conf.set("updateDeltaVersion", deltaVersionToRead)
    conf.set("readDeltaOnly", "true")
    format
  }
} 
Example 74
Source File: CarbonRDD.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.spark.rdd

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.conf.Configuration
import org.apache.spark.{Dependency, OneToOneDependency, Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.util.SparkSQLUtil

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.metadata.schema.table.TableInfo
import org.apache.carbondata.core.util._


abstract class CarbonRDDWithTableInfo[T: ClassTag](
    @transient private val ss: SparkSession,
    @transient private var deps: Seq[Dependency[_]],
    serializedTableInfo: Array[Byte]) extends CarbonRDD[T](ss, deps) {

  def this(@transient sparkSession: SparkSession, @transient oneParent: RDD[_],
      serializedTableInfo: Array[Byte]) = {
    this (sparkSession, List(new OneToOneDependency(oneParent)), serializedTableInfo)
  }

  def getTableInfo: TableInfo = TableInfo.deserialize(serializedTableInfo)
} 
Example 75
Source File: QueryTaskCompletionListener.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.spark.rdd

import scala.collection.JavaConverters._

import org.apache.hadoop.mapreduce.RecordReader
import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.sql.carbondata.execution.datasources.tasklisteners.CarbonQueryTaskCompletionListener
import org.apache.spark.sql.profiler.{Profiler, QueryTaskEnd}

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.memory.UnsafeMemoryManager
import org.apache.carbondata.core.stats.{QueryStatistic, QueryStatisticsConstants, QueryStatisticsRecorder}
import org.apache.carbondata.core.util.{DataTypeUtil, TaskMetricsMap, ThreadLocalTaskInfo}
import org.apache.carbondata.spark.InitInputMetrics

class QueryTaskCompletionListener(freeMemory: Boolean,
    var reader: RecordReader[Void, Object],
    inputMetricsStats: InitInputMetrics, executionId: String, taskId: Int, queryStartTime: Long,
    queryStatisticsRecorder: QueryStatisticsRecorder, split: Partition, queryId: String)
  extends CarbonQueryTaskCompletionListener {
  override def onTaskCompletion(context: TaskContext): Unit = {
    if (reader != null) {
      try {
        reader.close()
      } catch {
        case e: Exception =>
          LogServiceFactory.getLogService(this.getClass.getCanonicalName).error(e)
      }
      reader = null
    }
    TaskMetricsMap.getInstance().updateReadBytes(Thread.currentThread().getId)
    inputMetricsStats.updateAndClose()
    logStatistics(executionId, taskId, queryStartTime, queryStatisticsRecorder, split)
    if (freeMemory) {
      UnsafeMemoryManager.INSTANCE
        .freeMemoryAll(ThreadLocalTaskInfo.getCarbonTaskInfo.getTaskId)
      ThreadLocalTaskInfo.clearCarbonTaskInfo()
      DataTypeUtil.clearFormatter()
    }
  }

  def logStatistics(
      executionId: String,
      taskId: Long,
      queryStartTime: Long,
      recorder: QueryStatisticsRecorder,
      split: Partition
  ): Unit = {
    if (null != recorder) {
      val queryStatistic = new QueryStatistic()
      queryStatistic.addFixedTimeStatistic(QueryStatisticsConstants.EXECUTOR_PART,
        System.currentTimeMillis - queryStartTime)
      recorder.recordStatistics(queryStatistic)
      // print executor query statistics for each task_id
      val statistics = recorder.statisticsForTask(taskId, queryStartTime)
      if (statistics != null && executionId != null) {
        Profiler.invokeIfEnable {
          val inputSplit = split.asInstanceOf[CarbonSparkPartition].split.value
          inputSplit.calculateLength()
          val size = inputSplit.getLength
          val files = inputSplit.getAllSplits.asScala.map { s =>
            s.getSegmentId + "/" + s.getPath.getName
          }.toArray[String]
          Profiler.send(
            QueryTaskEnd(
              executionId.toLong,
              queryId,
              statistics.getValues,
              size,
              files
            )
          )
        }
      }
      recorder.logStatisticsForTask(statistics)
    }
  }
} 
Example 76
Source File: CarbonMergeBloomIndexFilesRDD.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.index

import scala.collection.JavaConverters._

import org.apache.spark.Partition
import org.apache.spark.rdd.CarbonMergeFilePartition
import org.apache.spark.sql.SparkSession
import org.apache.spark.TaskContext

import org.apache.carbondata.core.metadata.schema.table.CarbonTable
import org.apache.carbondata.core.util.path.CarbonTablePath
import org.apache.carbondata.index.bloom.BloomIndexFileStore
import org.apache.carbondata.spark.rdd.CarbonRDD



class CarbonMergeBloomIndexFilesRDD(
  @transient private val ss: SparkSession,
  carbonTable: CarbonTable,
  segmentIds: Seq[String],
  bloomIndexNames: Seq[String],
  bloomIndexColumns: Seq[Seq[String]])
  extends CarbonRDD[String](ss, Nil) {

  override def internalGetPartitions: Array[Partition] = {
    segmentIds.zipWithIndex.map {s =>
      CarbonMergeFilePartition(id, s._2, s._1)
    }.toArray
  }

  override def internalCompute(theSplit: Partition, context: TaskContext): Iterator[String] = {
    val tablePath = carbonTable.getTablePath
    val split = theSplit.asInstanceOf[CarbonMergeFilePartition]
    logInfo("Merging bloom index files of " +
      s"segment ${split.segmentId} for ${carbonTable.getTableName}")

    bloomIndexNames.zipWithIndex.map( dm => {
      val dmSegmentPath = CarbonTablePath.getIndexesStorePath(
        tablePath, split.segmentId, dm._1)
      BloomIndexFileStore.mergeBloomIndexFile(dmSegmentPath, bloomIndexColumns(dm._2).asJava)
    })

    val iter = new Iterator[String] {
      var havePair = false
      var finished = false

      override def hasNext: Boolean = {
        if (!finished && !havePair) {
          finished = true
          havePair = !finished
        }
        !finished
      }

      override def next(): String = {
        if (!hasNext) {
          throw new java.util.NoSuchElementException("End of stream")
        }
        havePair = false
        ""
      }
    }
    iter
  }

} 
Example 77
Source File: SegmentPruneRDD.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.indexserver

import scala.collection.JavaConverters._

import org.apache.spark.{Partition, SparkEnv, TaskContext}
import org.apache.spark.sql.SparkSession

import org.apache.carbondata.core.cache.CacheProvider
import org.apache.carbondata.core.index.{IndexInputFormat, IndexStoreManager}
import org.apache.carbondata.core.index.dev.expr.IndexInputSplitWrapper
import org.apache.carbondata.core.indexstore.SegmentWrapper
import org.apache.carbondata.spark.rdd.CarbonRDD

class SegmentPruneRDD(@transient private val ss: SparkSession,
    indexInputFormat: IndexInputFormat)
  extends CarbonRDD[(String, SegmentWrapper)](ss, Nil) {

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    val locations = split.asInstanceOf[IndexRDDPartition].getLocations
    if (locations != null) {
      locations.toSeq
    } else {
      Seq()
    }
  }

  override protected def internalGetPartitions: Array[Partition] = {
    new DistributedPruneRDD(ss, indexInputFormat).partitions
  }

  override def internalCompute(split: Partition,
      context: TaskContext): Iterator[(String, SegmentWrapper)] = {
    val inputSplits = split.asInstanceOf[IndexRDDPartition].inputSplit
    val segments = inputSplits.map(_
      .asInstanceOf[IndexInputSplitWrapper].getDistributable.getSegment)
    segments.foreach(_.setReadCommittedScope(indexInputFormat.getReadCommittedScope))
    if (indexInputFormat.getInvalidSegments.size > 0) {
      // clear the segmentMap and from cache in executor when there are invalid segments
      IndexStoreManager.getInstance().clearInvalidSegments(indexInputFormat.getCarbonTable,
        indexInputFormat.getInvalidSegments)
    }
    val blockletMap = IndexStoreManager.getInstance
      .getDefaultIndex(indexInputFormat.getCarbonTable)
    val prunedSegments = blockletMap
      .pruneSegments(segments.toList.asJava, indexInputFormat.getFilterResolverIntf)
    val executorIP = s"${ SparkEnv.get.blockManager.blockManagerId.host }_${
      SparkEnv.get.blockManager.blockManagerId.executorId
    }"
    val cacheSize = if (CacheProvider.getInstance().getCarbonCache != null) {
      CacheProvider.getInstance().getCarbonCache.getCurrentSize
    } else {
      0L
    }
    val value = (executorIP + "_" + cacheSize.toString, new SegmentWrapper(prunedSegments))
    Iterator(value)
  }
} 
Example 78
Source File: DistributedCountRDD.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.indexserver

import java.util.concurrent.Executors

import scala.collection.JavaConverters._
import scala.concurrent.{Await, ExecutionContext, ExecutionContextExecutor, Future}
import scala.concurrent.duration.Duration

import org.apache.hadoop.mapred.TaskAttemptID
import org.apache.hadoop.mapreduce.{InputSplit, TaskType}
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
import org.apache.spark.{Partition, SparkEnv, TaskContext}
import org.apache.spark.sql.SparkSession

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.cache.CacheProvider
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.index.{IndexInputFormat, IndexStoreManager}
import org.apache.carbondata.core.index.dev.expr.IndexInputSplitWrapper
import org.apache.carbondata.core.util.{CarbonProperties, CarbonThreadFactory}
import org.apache.carbondata.spark.rdd.CarbonRDD


class DistributedCountRDD(@transient ss: SparkSession, indexInputFormat: IndexInputFormat)
  extends CarbonRDD[(String, String)](ss, Nil) {

  @transient private val LOGGER = LogServiceFactory.getLogService(classOf[DistributedPruneRDD]
    .getName)

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    if (split.asInstanceOf[IndexRDDPartition].getLocations != null) {
      split.asInstanceOf[IndexRDDPartition].getLocations.toSeq
    } else {
      Seq()
    }
  }

  override def internalCompute(split: Partition,
      context: TaskContext): Iterator[(String, String)] = {
    val attemptId = new TaskAttemptID(DistributedRDDUtils.generateTrackerId,
      id, TaskType.MAP, split.index, 0)
    val attemptContext = new TaskAttemptContextImpl(FileFactory.getConfiguration, attemptId)
    val inputSplits = split.asInstanceOf[IndexRDDPartition].inputSplit
    val numOfThreads = CarbonProperties.getInstance().getNumOfThreadsForExecutorPruning
    val service = Executors
      .newFixedThreadPool(numOfThreads, new CarbonThreadFactory("IndexPruningPool", true))
    implicit val ec: ExecutionContextExecutor = ExecutionContext
      .fromExecutor(service)
    if (indexInputFormat.ifAsyncCall()) {
      // to clear cache of invalid segments during pre-priming in index server
      IndexStoreManager.getInstance().clearInvalidSegments(indexInputFormat.getCarbonTable,
        indexInputFormat.getInvalidSegments)
    }
    val futures = if (inputSplits.length <= numOfThreads) {
      inputSplits.map {
        split => generateFuture(Seq(split))
      }
    } else {
      DistributedRDDUtils.groupSplits(inputSplits, numOfThreads).map {
        splits => generateFuture(splits)
      }
    }
    // scalastyle:off awaitresult
    val results = Await.result(Future.sequence(futures), Duration.Inf).flatten
    // scalastyle:on awaitresult
    val executorIP = s"${ SparkEnv.get.blockManager.blockManagerId.host }_${
      SparkEnv.get.blockManager.blockManagerId.executorId
    }"
    val cacheSize = if (CacheProvider.getInstance().getCarbonCache != null) {
      CacheProvider.getInstance().getCarbonCache.getCurrentSize
    } else {
      0L
    }
    Iterator((executorIP + "_" + cacheSize.toString, results.map(_._2.toLong).sum.toString))
  }

  override protected def internalGetPartitions: Array[Partition] = {
    new DistributedPruneRDD(ss, indexInputFormat).partitions
  }

  private def generateFuture(split: Seq[InputSplit])
    (implicit executionContext: ExecutionContext) = {
    Future {
      val segments = split.map { inputSplit =>
        val distributable = inputSplit.asInstanceOf[IndexInputSplitWrapper]
        distributable.getDistributable.getSegment
          .setReadCommittedScope(indexInputFormat.getReadCommittedScope)
        distributable.getDistributable.getSegment
      }
      val defaultIndex = IndexStoreManager.getInstance
        .getIndex(indexInputFormat.getCarbonTable, split.head
          .asInstanceOf[IndexInputSplitWrapper].getDistributable.getIndexSchema)
      defaultIndex.getBlockRowCount(defaultIndex, segments.toList.asJava, indexInputFormat
        .getPartitions).asScala
    }
  }

} 
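A hypothetical caller (wiring assumed): every partition emits a single (executorId_cacheSize, rowCount) pair, so the driver obtains the total row count by summing the second field.

val total = new DistributedCountRDD(ss, indexInputFormat)
  .collect()
  .map(_._2.toLong)
  .sum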
Example 79
Source File: DistributedShowCacheRDD.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.indexserver

import scala.collection.JavaConverters._

import org.apache.spark.{Partition, SparkEnv, TaskContext}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.hive.DistributionUtil

import org.apache.carbondata.core.index.IndexStoreManager
import org.apache.carbondata.core.indexstore.blockletindex.BlockletIndexFactory
import org.apache.carbondata.hadoop.CarbonInputSplit
import org.apache.carbondata.spark.rdd.CarbonRDD

class DistributedShowCacheRDD(@transient private val ss: SparkSession,
    tableUniqueId: String,
    executorCache: Boolean)
  extends CarbonRDD[String](ss, Nil) {

  val executorsList: Array[String] = DistributionUtil
    .getExecutors(ss.sparkContext).flatMap {
      case (host, executors) =>
        executors.map { executor =>
          s"executor_${ host }_$executor"
        }
    }.toArray

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    if (split.asInstanceOf[IndexRDDPartition].getLocations != null) {
      split.asInstanceOf[IndexRDDPartition].getLocations.toSeq
    } else {
      Seq()
    }
  }

  override protected def internalGetPartitions: Array[Partition] = {
    executorsList.zipWithIndex.map {
      case (executor, idx) =>
        // create a dummy split for each executor to accumulate the cache size.
        val dummySplit = new CarbonInputSplit()
        dummySplit.setLocation(Array(executor))
        new IndexRDDPartition(id, idx, List(dummySplit), Array(executor))
    }
  }

  override def internalCompute(split: Partition, context: TaskContext): Iterator[String] = {
    val indexes = IndexStoreManager.getInstance().getTableIndexForAllTables.asScala
    val tableList = tableUniqueId.split(",")
    val iterator = indexes.collect {
      case (tableId, tableIndexes) if tableUniqueId.isEmpty || tableList.contains(tableId) =>
        val sizeAndIndexLengths = tableIndexes.asScala
          .map { index =>
            val indexName = if (index.getIndexFactory.isInstanceOf[BlockletIndexFactory]) {
              index
                .getIndexFactory
                .asInstanceOf[BlockletIndexFactory]
                .getCarbonTable
                .getTableUniqueName
            } else {
              index.getIndexSchema.getRelationIdentifier.getDatabaseName + "_" + index
                .getIndexSchema.getIndexName
            }
            if (executorCache) {
              val executorIP = s"${ SparkEnv.get.blockManager.blockManagerId.host }_${
                SparkEnv.get.blockManager.blockManagerId.executorId
              }"
              s"${ executorIP }:${ index.getIndexFactory.getCacheSize }:${
                index.getIndexSchema.getProviderName
              }"
            } else {
              s"${indexName}:${index.getIndexFactory.getCacheSize}:${
                index.getIndexSchema.getProviderName
              }"
            }
          }
        sizeAndIndexLengths
    }.flatten.toIterator
    iterator
  }
} 
Example 80
Source File: InvalidateSegmentCacheRDD.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.indexserver

import scala.collection.JavaConverters._

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.hive.DistributionUtil

import org.apache.carbondata.core.index.IndexStoreManager
import org.apache.carbondata.core.metadata.schema.table.CarbonTable
import org.apache.carbondata.hadoop.CarbonInputSplit
import org.apache.carbondata.spark.rdd.CarbonRDD

class InvalidateSegmentCacheRDD(@transient private val ss: SparkSession, carbonTable: CarbonTable,
    invalidSegmentIds: List[String]) extends CarbonRDD[String](ss, Nil) {

  val executorsList: Array[String] = DistributionUtil.getExecutors(ss.sparkContext).flatMap {
    case (host, executors) =>
      executors.map {
        executor => s"executor_${host}_$executor"
      }
  }.toArray

  override def internalCompute(split: Partition,
      context: TaskContext): Iterator[String] = {
    IndexStoreManager.getInstance().clearInvalidSegments(carbonTable, invalidSegmentIds.asJava)
    Iterator.empty
  }

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    if (split.asInstanceOf[IndexRDDPartition].getLocations != null) {
      split.asInstanceOf[IndexRDDPartition].getLocations.toSeq
    } else {
      Seq()
    }
  }

  override protected def internalGetPartitions: Array[Partition] = {
    if (invalidSegmentIds.isEmpty) {
      Array()
    } else {
      executorsList.zipWithIndex.map {
        case (executor, idx) =>
          // create a dummy split for each executor to accumulate the cache size.
          val dummySplit = new CarbonInputSplit()
          dummySplit.setLocation(Array(executor))
          new IndexRDDPartition(id, idx, List(dummySplit), Array(executor))
      }
    }
  }
} 
Example 81
Source File: MemsqlRDD.scala    From memsql-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark

import java.sql.{Connection, PreparedStatement, ResultSet}

import com.memsql.spark.SQLGen.VariableList
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils}
import org.apache.spark.sql.types._
import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}

case class MemsqlRDD(query: String,
                     variables: VariableList,
                     options: MemsqlOptions,
                     schema: StructType,
                     expectedOutput: Seq[Attribute],
                     @transient val sc: SparkContext)
    extends RDD[Row](sc, Nil) {

  override protected def getPartitions: Array[Partition] =
    MemsqlQueryHelpers.GetPartitions(options, query, variables)

  override def compute(rawPartition: Partition, context: TaskContext): Iterator[Row] = {
    var closed                     = false
    var rs: ResultSet              = null
    var stmt: PreparedStatement    = null
    var conn: Connection           = null
    var partition: MemsqlPartition = rawPartition.asInstanceOf[MemsqlPartition]

    def tryClose(name: String, what: AutoCloseable): Unit = {
      try {
        if (what != null) { what.close() }
      } catch {
        case e: Exception => logWarning(s"Exception closing $name", e)
      }
    }

    def close(): Unit = {
      if (closed) { return }
      tryClose("resultset", rs)
      tryClose("statement", stmt)
      tryClose("connection", conn)
      closed = true
    }

    context.addTaskCompletionListener { context =>
      close()
    }

    conn = JdbcUtils.createConnectionFactory(partition.connectionInfo)()
    stmt = conn.prepareStatement(partition.query)
    JdbcHelpers.fillStatement(stmt, partition.variables)
    rs = stmt.executeQuery()

    var rowsIter = JdbcUtils.resultSetToRows(rs, schema)

    if (expectedOutput.nonEmpty) {
      val schemaDatatypes   = schema.map(_.dataType)
      val expectedDatatypes = expectedOutput.map(_.dataType)

      if (schemaDatatypes != expectedDatatypes) {
        val columnEncoders = schemaDatatypes.zip(expectedDatatypes).zipWithIndex.map {
          case ((_: StringType, _: NullType), _)     => ((_: Row) => null)
          case ((_: ShortType, _: BooleanType), i)   => ((r: Row) => r.getShort(i) != 0)
          case ((_: IntegerType, _: BooleanType), i) => ((r: Row) => r.getInt(i) != 0)
          case ((_: LongType, _: BooleanType), i)    => ((r: Row) => r.getLong(i) != 0)

          case ((l, r), i) => {
            options.assert(l == r, s"MemsqlRDD: unable to encode ${l} into ${r}")
            ((r: Row) => r.get(i))
          }
        }

        rowsIter = rowsIter
          .map(row => Row.fromSeq(columnEncoders.map(_(row))))
      }
    }

    CompletionIterator[Row, Iterator[Row]](new InterruptibleIterator[Row](context, rowsIter), close)
  }

} 
Example 82
Source File: DatasourceRDD.scala    From datasource-receiver   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.datasource.receiver

import org.apache.spark.partial.{BoundedDouble, CountEvaluator, PartialResult}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.streaming.datasource.config.ParametersUtils
import org.apache.spark.streaming.datasource.models.{InputSentences, OffsetOperator}
import org.apache.spark.{Logging, Partition, TaskContext}

private[datasource]
class DatasourceRDD(
                     @transient sqlContext: SQLContext,
                     inputSentences: InputSentences,
                     datasourceParams: Map[String, String]
                   ) extends RDD[Row](sqlContext.sparkContext, Nil) with Logging with ParametersUtils {

  private var totalCalculated: Option[Long] = None

  private val InitTableName = "initTable"
  private val LimitedTableName = "limitedTable"
  private val TempInitQuery = s"select * from $InitTableName"

  val dataFrame = inputSentences.offsetConditions.fold(sqlContext.sql(inputSentences.query)) { case offset =>
    val parsedQuery = parseInitialQuery
    val conditionsSentence = offset.fromOffset.extractConditionSentence(parsedQuery)
    val orderSentence = offset.fromOffset.extractOrderSentence(parsedQuery, inverse = offset.limitRecords.isEmpty)
    val limitSentence = inputSentences.extractLimitSentence

    sqlContext.sql(parsedQuery + conditionsSentence + orderSentence + limitSentence)
  }

  private def parseInitialQuery: String = {
    if (inputSentences.query.toUpperCase.contains("WHERE") ||
      inputSentences.query.toUpperCase.contains("ORDER") ||
      inputSentences.query.toUpperCase.contains("LIMIT")
    ) {
      sqlContext.sql(inputSentences.query).registerTempTable(InitTableName)
      TempInitQuery
    } else inputSentences.query
  }

  def progressInputSentences: InputSentences = {
    if (!dataFrame.rdd.isEmpty()) {
      inputSentences.offsetConditions.fold(inputSentences) { case offset =>

        val offsetValue = if (offset.limitRecords.isEmpty)
          dataFrame.rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name))
        else {
          dataFrame.registerTempTable(LimitedTableName)
          val limitedQuery = s"select * from $LimitedTableName order by ${offset.fromOffset.name} " +
            s"${OffsetOperator.toInverseOrderOperator(offset.fromOffset.operator)} limit 1"

          sqlContext.sql(limitedQuery).rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name))
        }

        inputSentences.copy(offsetConditions = Option(offset.copy(fromOffset = offset.fromOffset.copy(
          value = Option(offsetValue),
          operator = OffsetOperator.toProgressOperator(offset.fromOffset.operator)))))
      }
    } else inputSentences
  }

  
  override def isEmpty(): Boolean = {
    totalCalculated.fold {
      withScope {
        partitions.length == 0 || take(1).length == 0
      }
    } { total => total == 0L }
  }

  override def getPartitions: Array[Partition] = dataFrame.rdd.partitions

  override def compute(thePart: Partition, context: TaskContext): Iterator[Row] = dataFrame.rdd.compute(thePart, context)

  override def getPreferredLocations(thePart: Partition): Seq[String] = dataFrame.rdd.preferredLocations(thePart)
} 
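DatasourceRDD mostly forwards to the RDD behind its DataFrame: partitions, row computation, and preferred locations are all delegated, with the offset bookkeeping layered on top. The delegation skeleton in isolation, as a minimal sketch (the class name is illustrative):

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, TaskContext}

class DelegatingRDD[T: ClassTag](underlying: RDD[T])
  extends RDD[T](underlying.sparkContext, Nil) {

  // Expose the wrapped RDD's partitions, rows, and locality unchanged.
  override def getPartitions: Array[Partition] = underlying.partitions

  override def compute(split: Partition, context: TaskContext): Iterator[T] =
    underlying.compute(split, context)

  override def getPreferredLocations(split: Partition): Seq[String] =
    underlying.preferredLocations(split)
}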
Example 83
Source File: NetezzaRDD.scala    From spark-netezza   with Apache License 2.0 5 votes vote down vote up
package com.ibm.spark.netezza

import java.sql.Connection
import java.util.Properties

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types._
import org.apache.spark.{Partition, SparkContext, TaskContext}

// Note: the class header was dropped from this listing; the following is a
// plausible reconstruction from the members used below (parameter names assumed).
private[netezza] class NetezzaRDD(
    sc: SparkContext,
    getConnection: () => Connection,
    schema: StructType,
    table: String,
    columns: Array[String],
    filters: Array[Filter],
    partitions: Array[Partition],
    properties: Properties)
  extends RDD[Row](sc, Nil) {

  override def getPartitions: Array[Partition] = partitions

  override def compute(thePart: Partition, context: TaskContext): Iterator[Row] =
    new Iterator[Row] {
      var closed = false
      var finished = false
      var gotNext = false
      var nextValue: Row = null

      context.addTaskCompletionListener { context => close() }
      val part = thePart.asInstanceOf[NetezzaPartition]
      val conn = getConnection()
      val reader = new NetezzaDataReader(conn, table, columns, filters, part, schema)
      reader.startExternalTableDataUnload()

      def getNext(): Row = {
        if (reader.hasNext) {
          reader.next()
        } else {
          finished = true
          null.asInstanceOf[Row]
        }
      }

      def close() {
        if (closed) return
        try {
          if (null != reader) {
            reader.close()
          }
        } catch {
          case e: Exception => logWarning("Exception closing Netezza record reader", e)
        }
        try {
          if (null != conn) {
            conn.close()
          }
          logInfo("closed connection")
        } catch {
          case e: Exception => logWarning("Exception closing connection", e)
        }
      }

      override def hasNext: Boolean = {
        if (!finished) {
          if (!gotNext) {
            nextValue = getNext()
            if (finished) {
              close()
            }
            gotNext = true
          }
        }
        !finished
      }

      override def next(): Row = {
        if (!hasNext) {
          throw new NoSuchElementException("End of stream")
        }
        gotNext = false
        nextValue
      }
    }
} 
Example 84
Source File: ReorderedPartitionsRDD.scala    From hail   with MIT License 5 votes vote down vote up
package is.hail.sparkextras

import is.hail.utils.FastSeq
import org.apache.spark.rdd.RDD
import org.apache.spark.{Dependency, NarrowDependency, Partition, TaskContext}

import scala.reflect.ClassTag

case class ReorderedPartitionsRDDPartition(index: Int, oldPartition: Partition) extends Partition

class ReorderedPartitionsRDD[T](@transient var prev: RDD[T], @transient val oldIndices: Array[Int])(implicit tct: ClassTag[T])
  extends RDD[T](prev.sparkContext, Nil) {

  override def getPartitions: Array[Partition] = {
    val parentPartitions = dependencies.head.rdd.asInstanceOf[RDD[T]].partitions
    Array.tabulate(oldIndices.length) { i =>
      val oldIndex = oldIndices(i)
      val oldPartition = parentPartitions(oldIndex)
      ReorderedPartitionsRDDPartition(i, oldPartition)
    }
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    val parent = dependencies.head.rdd.asInstanceOf[RDD[T]]
    parent.compute(split.asInstanceOf[ReorderedPartitionsRDDPartition].oldPartition, context)
  }

  override def getDependencies: Seq[Dependency[_]] = FastSeq(new NarrowDependency[T](prev) {
    override def getParents(partitionId: Int): Seq[Int] = FastSeq(oldIndices(partitionId))
  })

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }

  override def getPreferredLocations(partition: Partition): Seq[String] =
    prev.preferredLocations(partition.asInstanceOf[ReorderedPartitionsRDDPartition].oldPartition)
} 
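A hypothetical usage sketch, assuming an active SparkContext sc: permute the partitions of a four-partition RDD while leaving each partition's contents and locality untouched.

val rdd      = sc.parallelize(1 to 8, numSlices = 4)
val reversed = new ReorderedPartitionsRDD(rdd, Array(3, 2, 1, 0))
// glom reveals the new partition order: [7,8] [5,6] [3,4] [1,2]
reversed.glom().collect().foreach(part => println(part.mkString(",")))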
Example 85
Source File: MapPartitionsWithValueRDD.scala    From hail   with MIT License 5 votes vote down vote up
package is.hail.sparkextras

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, TaskContext}

import scala.annotation.meta.param
import scala.reflect.ClassTag

case class MapPartitionsWithValueRDDPartition[V](
  parentPartition: Partition,
  value: V) extends Partition {
  def index: Int = parentPartition.index
}

class MapPartitionsWithValueRDD[T: ClassTag, U: ClassTag, V](
  var prev: RDD[T],
  @(transient @param) values: Array[V],
  f: (Int, V, Iterator[T]) => Iterator[U],
  preservesPartitioning: Boolean) extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) firstParent[T].partitioner else None

  override def getPartitions: Array[Partition] = {
    firstParent[T].partitions.map(p => MapPartitionsWithValueRDDPartition(p, values(p.index)))
  }

  override def compute(split: Partition, context: TaskContext): Iterator[U] = {
    val p = split.asInstanceOf[MapPartitionsWithValueRDDPartition[V]]
    f(split.index, p.value, firstParent[T].iterator(p.parentPartition, context))
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
} 
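A hypothetical usage sketch, assuming an active SparkContext sc: tag every element with a per-partition value supplied up front (values must hold exactly one entry per parent partition, since getPartitions indexes it by partition).

val rdd    = sc.parallelize(1 to 6, numSlices = 3)
val labels = Array("a", "b", "c") // one label per partition
val tagged = new MapPartitionsWithValueRDD[Int, (String, Int), String](
  rdd, labels, (idx, label, iter) => iter.map(x => (label, x)),
  preservesPartitioning = false)
tagged.collect() // => (a,1) (a,2) (b,3) (b,4) (c,5) (c,6)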
Example 86
Source File: BlockedRDD.scala    From hail   with MIT License 5 votes vote down vote up
package is.hail.sparkextras

import is.hail.utils._
import org.apache.spark.rdd.RDD
import org.apache.spark.{Dependency, NarrowDependency, Partition, TaskContext}

import scala.language.existentials
import scala.reflect.ClassTag

case class BlockedRDDPartition(@transient rdd: RDD[_],
  index: Int,
  first: Int,
  last: Int) extends Partition {
  require(first <= last)

  val parentPartitions: Array[Partition] = range.map(rdd.partitions).toArray

  def range: Range = first to last
}

class BlockedRDD[T](@transient var prev: RDD[T],
  @transient val partFirst: Array[Int],
  @transient val partLast: Array[Int]
)(implicit tct: ClassTag[T]) extends RDD[T](prev.sparkContext, Nil) {
  assert(partFirst.length == partLast.length)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](partFirst.length)(i =>
      BlockedRDDPartition(prev, i, partFirst(i), partLast(i)))
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    val parent = dependencies.head.rdd.asInstanceOf[RDD[T]]
    split.asInstanceOf[BlockedRDDPartition].parentPartitions.iterator.flatMap(p =>
      parent.iterator(p, context))
  }

  override def getDependencies: Seq[Dependency[_]] = {
    FastSeq(new NarrowDependency(prev) {
      def getParents(id: Int): Seq[Int] =
        partitions(id).asInstanceOf[BlockedRDDPartition].range
    })
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val prevPartitions = prev.partitions
    val range = partition.asInstanceOf[BlockedRDDPartition].range

    val locationAvail = range.flatMap(i =>
      prev.preferredLocations(prevPartitions(i)))
      .groupBy(identity)
      .mapValues(_.length)

    if (locationAvail.isEmpty)
      return FastSeq.empty[String]

    val m = locationAvail.values.max
    locationAvail.filter(_._2 == m)
      .keys
      .toFastSeq
  }
} 
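A hypothetical usage sketch, assuming an active SparkContext sc: coalesce six parent partitions into three contiguous blocks without a shuffle, where partFirst/partLast give each block's inclusive range of parent indices.

val rdd     = sc.parallelize(1 to 12, numSlices = 6)
val blocked = new BlockedRDD(rdd, partFirst = Array(0, 2, 4), partLast = Array(1, 3, 5))
// Three output partitions, each streaming through two parents in order:
// [1,2,3,4] [5,6,7,8] [9,10,11,12]
println(blocked.glom().collect().map(_.mkString(",")).mkString(" | "))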
Example 87
Source File: IndexReadRDD.scala    From hail   with MIT License 5 votes vote down vote up
package is.hail.sparkextras

import is.hail.backend.spark.SparkBackend
import is.hail.utils.Interval
import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

case class IndexedFilePartition(index: Int, file: String, bounds: Option[Interval]) extends Partition

class IndexReadRDD[T: ClassTag](
  @transient val partFiles: Array[String],
  @transient val intervalBounds: Option[Array[Interval]],
  f: (IndexedFilePartition, TaskContext) => T
) extends RDD[T](SparkBackend.sparkContext("IndexReadRDD"), Nil) {
  def getPartitions: Array[Partition] =
    Array.tabulate(partFiles.length) { i =>
      IndexedFilePartition(i, partFiles(i), intervalBounds.map(_(i)))
    }

  override def compute(
    split: Partition, context: TaskContext
  ): Iterator[T] = {
    Iterator.single(f(split.asInstanceOf[IndexedFilePartition], context))
  }
} 
Example 88
Source File: MultiWayZipPartitionsRDD.scala    From hail   with MIT License 5 votes vote down vote up
package is.hail.sparkextras

import org.apache.spark.rdd.RDD
import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}

import scala.reflect.ClassTag

object MultiWayZipPartitionsRDD {
  def apply[T: ClassTag, V: ClassTag](
    rdds: IndexedSeq[RDD[T]]
  )(f: (Array[Iterator[T]]) => Iterator[V]): MultiWayZipPartitionsRDD[T, V] = {
    new MultiWayZipPartitionsRDD(rdds.head.sparkContext, rdds, f)
  }
}

private case class MultiWayZipPartition(val index: Int, val partitions: IndexedSeq[Partition])
  extends Partition

class MultiWayZipPartitionsRDD[T: ClassTag, V: ClassTag](
  sc: SparkContext,
  var rdds: IndexedSeq[RDD[T]],
  var f: (Array[Iterator[T]]) => Iterator[V]
) extends RDD[V](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.length > 0)
  private val numParts = rdds(0).partitions.length
  require(rdds.forall(rdd => rdd.partitions.length == numParts))

  override val partitioner = None

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](numParts) { i =>
      MultiWayZipPartition(i, rdds.map(rdd => rdd.partitions(i)))
    }
  }

  override def compute(s: Partition, tc: TaskContext) = {
    val partitions = s.asInstanceOf[MultiWayZipPartition].partitions
    val arr = Array.tabulate(rdds.length)(i => rdds(i).iterator(partitions(i), tc))
    f(arr)
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
    f = null
  }
} 
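A hypothetical usage sketch, assuming an active SparkContext sc: zip three RDDs with identical partition counts and sum them position-wise inside each partition.

val a = sc.parallelize(1 to 6, numSlices = 3)
val b = sc.parallelize(11 to 16, numSlices = 3)
val c = sc.parallelize(21 to 26, numSlices = 3)

val summed = MultiWayZipPartitionsRDD(IndexedSeq(a, b, c)) { iters =>
  new Iterator[Int] {
    def hasNext: Boolean = iters.forall(_.hasNext)
    def next(): Int      = iters.map(_.next()).sum
  }
}
summed.collect() // => Array(33, 36, 39, 42, 45, 48)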
Example 89
Source File: OriginUnionRDD.scala    From hail   with MIT License 5 votes vote down vote up
package is.hail.sparkextras

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

private[hail] class OriginUnionPartition(
  val index: Int,
  val originIdx: Int,
  val originPart: Partition
) extends Partition

class OriginUnionRDD[T: ClassTag, S: ClassTag](
  sc: SparkContext,
  var rdds: IndexedSeq[RDD[T]],
  f: (Int, Int, Iterator[T]) => Iterator[S]
) extends RDD[S](sc, Nil) {
  override def getPartitions: Array[Partition] = {
    val arr = new Array[Partition](rdds.map(_.partitions.length).sum)
    var i = 0
    for ((rdd, rddIdx) <- rdds.zipWithIndex; part <- rdd.partitions) {
      arr(i) = new OriginUnionPartition(i, rddIdx, part)
      i += 1
    }
    arr
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var i = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, i, rdd.partitions.length)
      i += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, tc: TaskContext): Iterator[S] = {
    val p = s.asInstanceOf[OriginUnionPartition]
    f(p.originIdx, p.originPart.index, parent[T](p.originIdx).iterator(p.originPart, tc))
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 90
Source File: ProtoParquetRDD.scala    From sparksql-protobuf   with Apache License 2.0 5 votes vote down vote up
package com.github.saurfang.parquet.proto.spark

import com.github.saurfang.parquet.proto.ProtoMessageParquetInputFormat
import com.google.protobuf.AbstractMessage
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
import org.apache.parquet.proto.ProtoReadSupport
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.{NewHadoopRDD, RDD}
import org.apache.spark.{Partition, SparkContext, TaskContext}

import scala.reflect.ClassTag

class ProtoParquetRDD[T <: AbstractMessage : ClassTag](
                                                        sc: SparkContext,
                                                        input: String,
                                                        protoClass: Class[T],
                                                        @transient conf: Configuration
                                                        ) extends RDD[T](sc, Nil) {

  def this(sc: SparkContext, input: String, protoClass: Class[T]) = {
    this(sc, input, protoClass, sc.hadoopConfiguration)
  }

  lazy private[this] val rdd = {
    val jconf = new JobConf(conf)
    FileInputFormat.setInputPaths(jconf, input)
    ProtoReadSupport.setProtobufClass(jconf, protoClass.getName)

    new NewHadoopRDD(sc, classOf[ProtoMessageParquetInputFormat[T]], classOf[Void], protoClass, jconf)
  }

  @DeveloperApi
  override def compute(split: Partition, context: TaskContext): Iterator[T] = rdd.compute(split, context).map(_._2)

  override protected def getPartitions: Array[Partition] = rdd.getPartitions
} 
Example 91
Source File: Neo4jRDD.scala    From morpheus   with Apache License 2.0 5 votes vote down vote up
package org.opencypher.morpheus.impl.io.neo4j.external

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.opencypher.okapi.neo4j.io.Neo4jConfig

private class Neo4jRDD(
    sc: SparkContext,
    val query: String,
    val neo4jConfig: Neo4jConfig,
    val parameters: Map[String, Any] = Map.empty,
    partitions: Partitions = Partitions())
    extends RDD[Row](sc, Nil) {

  override def compute(partition: Partition, context: TaskContext): Iterator[Row] = {

    val neo4jPartition: Neo4jPartition = partition.asInstanceOf[Neo4jPartition]

    Executor.execute(neo4jConfig, query, parameters ++ neo4jPartition.window).sparkRows
  }

  override protected def getPartitions: Array[Partition] = {
    val p = partitions.effective()
    Range(0, p.partitions.toInt).map(idx => new Neo4jPartition(idx, p.skip(idx), p.limit(idx))).toArray
  }

  override def toString(): String = s"Neo4jRDD partitions $partitions $query using $parameters"
} 
Example 92
Source File: SlidingRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.rdd

import scala.collection.mutable
import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD

private[mllib]
class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T], val offset: Int)
  extends Partition with Serializable {
  override val index: Int = idx
}


private[mllib]
class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int, val step: Int)
  extends RDD[Array[T]](parent) {

  require(windowSize > 0 && step > 0 && !(windowSize == 1 && step == 1),
    "Window size and step must be greater than 0, " +
      s"and they cannot be both 1, but got windowSize = $windowSize and step = $step.")

  override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = {
    val part = split.asInstanceOf[SlidingRDDPartition[T]]
    (firstParent[T].iterator(part.prev, context) ++ part.tail)
      .drop(part.offset)
      .sliding(windowSize, step)
      .withPartial(false)
      .map(_.toArray)
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev)

  override def getPartitions: Array[Partition] = {
    val parentPartitions = parent.partitions
    val n = parentPartitions.length
    if (n == 0) {
      Array.empty
    } else if (n == 1) {
      Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty, 0))
    } else {
      val w1 = windowSize - 1
      // Get partition sizes and first w1 elements.
      val (sizes, heads) = parent.mapPartitions { iter =>
        val w1Array = iter.take(w1).toArray
        Iterator.single((w1Array.length + iter.length, w1Array))
      }.collect().unzip
      val partitions = mutable.ArrayBuffer.empty[SlidingRDDPartition[T]]
      var i = 0
      var cumSize = 0
      var partitionIndex = 0
      while (i < n) {
        val mod = cumSize % step
        val offset = if (mod == 0) 0 else step - mod
        val size = sizes(i)
        if (offset < size) {
          val tail = mutable.ListBuffer.empty[T]
          // Keep appending to the current tail until it has w1 elements.
          var j = i + 1
          while (j < n && tail.length < w1) {
            tail ++= heads(j).take(w1 - tail.length)
            j += 1
          }
          if (sizes(i) + tail.length >= offset + windowSize) {
            partitions +=
              new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail, offset)
            partitionIndex += 1
          }
        }
        cumSize += size
        i += 1
      }
      partitions.toArray
    }
  }

  // TODO: Override methods such as aggregate, which only requires one Spark job.
} 
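SlidingRDD is private[mllib]; the public entry point is RDDFunctions.sliding, which constructs it (or a plain map when windowSize and step are both 1, the case the require above excludes). A brief sketch, assuming an active SparkContext sc:

import org.apache.spark.mllib.rdd.RDDFunctions._

val windows = sc.parallelize(1 to 5, numSlices = 2).sliding(3, 1)
windows.collect().foreach(w => println(w.mkString("[", ",", "]")))
// => [1,2,3] [2,3,4] [3,4,5]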
Example 93
Source File: StateStoreRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.streaming.state

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.internal.SessionState
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration


class StateStoreRDD[T: ClassTag, U: ClassTag](
    dataRDD: RDD[T],
    storeUpdateFunction: (StateStore, Iterator[T]) => Iterator[U],
    checkpointLocation: String,
    operatorId: Long,
    storeVersion: Long,
    keySchema: StructType,
    valueSchema: StructType,
    sessionState: SessionState,
    @transient private val storeCoordinator: Option[StateStoreCoordinatorRef])
  extends RDD[U](dataRDD) {

  private val storeConf = new StateStoreConf(sessionState.conf)

  // A Hadoop Configuration can be about 10 KB, which is pretty big, so broadcast it
  private val confBroadcast = dataRDD.context.broadcast(
    new SerializableConfiguration(sessionState.newHadoopConf()))

  override protected def getPartitions: Array[Partition] = dataRDD.partitions

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val storeId = StateStoreId(checkpointLocation, operatorId, partition.index)
    storeCoordinator.flatMap(_.getLocation(storeId)).toSeq
  }

  override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = {
    var store: StateStore = null
    val storeId = StateStoreId(checkpointLocation, operatorId, partition.index)
    store = StateStore.get(
      storeId, keySchema, valueSchema, storeVersion, storeConf, confBroadcast.value.value)
    val inputIter = dataRDD.iterator(partition, ctxt)
    storeUpdateFunction(store, inputIter)
  }
} 
Example 94
Source File: WholeTextFileRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat


private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
} 
Example 95
Source File: ZippedWithIndexRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.Utils

private[spark]
class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

private[spark]
class ZippedWithIndexRDD[T: ClassTag](prev: RDD[T]) extends RDD[(T, Long)](prev) {
  @transient private val startIndices: Array[Long] = {
    val n = prev.partitions.length
    if (n == 0) {
      Array.empty
    } else if (n == 1) {
      Array(0L)
    } else {
      prev.context.runJob(
        prev,
        Utils.getIteratorSize _,
        0 until n - 1 // do not need to count the last partition
      ).scanLeft(0L)(_ + _)
    }
  }

  override def getPartitions: Array[Partition] = {
    firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index)))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = {
    val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition]
    val parentIter = firstParent[T].iterator(split.prev, context)
    Utils.getIteratorZipWithIndex(parentIter, split.startIndex)
  }
} 
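ZippedWithIndexRDD is the machinery behind the public RDD.zipWithIndex; note from startIndices above that reaching it costs an extra Spark job to count all but the last partition whenever there is more than one. A brief sketch, assuming an active SparkContext sc:

val indexed = sc.parallelize(Seq("a", "b", "c", "d"), numSlices = 2).zipWithIndex()
indexed.collect() // => Array((a,0), (b,1), (c,2), (d,3))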
Example 96
Source File: UnionRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
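Partition listing above only goes parallel once more than spark.rdd.parallelListingThreshold RDDs (default 10) are unioned. A brief configuration sketch raising that threshold; the value is illustrative:

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setAppName("union-demo")
  // List partitions serially unless more than 50 RDDs are unioned.
  .set("spark.rdd.parallelListingThreshold", "50")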
Example 97
Source File: PartitionwiseSampledRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
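PartitionwiseSampledRDD is private[spark]; the public route to it is RDD.sample, which picks a Bernoulli sampler without replacement and a Poisson sampler with replacement. A brief sketch, assuming an active SparkContext sc:

val data    = sc.parallelize(1 to 1000, numSlices = 4)
val sampled = data.sample(withReplacement = false, fraction = 0.1, seed = 42L)
println(sampled.count()) // roughly 100, varying with the seed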
Example 98
Source File: PartitionerAwareUnionRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils

private[spark]
class PartitionerAwareUnionRDDPartition(
    @transient val rdds: Seq[RDD[_]],
    override val index: Int
  ) extends Partition {
  var parents = rdds.map(_.partitions(index)).toArray

  override def hashCode(): Int = index

  override def equals(other: Any): Boolean = super.equals(other)

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent partition at the time of task serialization
    parents = rdds.map(_.partitions(index)).toArray
    oos.defaultWriteObject()
  }
}

private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.nonEmpty)
  require(rdds.forall(_.partitioner.isDefined))
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  override val partitioner = rdds.head.partitioner

  override def getPartitions: Array[Partition] = {
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map { index =>
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }.toArray
  }

  // Get the location where most of the partitions of parent RDDs are located
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    val locations = rdds.zip(parentPartitions).flatMap {
      case (rdd, part) =>
        val parentLocations = currPrefLocs(rdd, part)
        logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations)
        parentLocations
    }
    val location = if (locations.isEmpty) {
      None
    } else {
      // Find the location that maximum number of parent partitions prefer
      Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
    }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host)
  }
} 
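PartitionerAwareUnionRDD is likewise private[spark]; SparkContext.union chooses it automatically when every input shares the same defined partitioner, so the result keeps that partitioner and its partition count instead of falling back to a plain UnionRDD. A brief sketch, assuming an active SparkContext sc:

import org.apache.spark.HashPartitioner

val p  = new HashPartitioner(4)
val r1 = sc.parallelize(Seq(1 -> "a", 2 -> "b")).partitionBy(p)
val r2 = sc.parallelize(Seq(3 -> "c", 4 -> "d")).partitionBy(p)

val unioned = sc.union(Seq(r1, r2))
assert(unioned.partitioner.contains(p)) // partitioner preserved
assert(unioned.partitions.length == 4)  // not 8, as a plain UnionRDD would give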
Example 99
Source File: BinaryFileRDD.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.StreamFileInputFormat

private[spark] class BinaryFileRDD[T](
    @transient private val sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(sc, jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
} 
Example 100
Source File: PartitionPruningRDDSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
} 
Example 101
Source File: SlidingRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.rdd

import scala.collection.mutable
import scala.reflect.ClassTag

import org.apache.spark.{TaskContext, Partition}
import org.apache.spark.rdd.RDD

private[mllib]
class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T])
  extends Partition with Serializable {
  override val index: Int = idx
}


private[mllib]
class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int)
  extends RDD[Array[T]](parent) {

  require(windowSize > 1, s"Window size must be greater than 1, but got $windowSize.")

  override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = {
    val part = split.asInstanceOf[SlidingRDDPartition[T]]
    (firstParent[T].iterator(part.prev, context) ++ part.tail)
      .sliding(windowSize)
      .withPartial(false)
      .map(_.toArray)
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev)

  override def getPartitions: Array[Partition] = {
    val parentPartitions = parent.partitions
    val n = parentPartitions.size
    if (n == 0) {
      Array.empty
    } else if (n == 1) {
      Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty))
    } else {
      val n1 = n - 1
      val w1 = windowSize - 1
      // Get the first w1 items of each partition, starting from the second partition.
      val nextHeads =
        parent.context.runJob(parent, (iter: Iterator[T]) => iter.take(w1).toArray, 1 until n, true)
      val partitions = mutable.ArrayBuffer[SlidingRDDPartition[T]]()
      var i = 0
      var partitionIndex = 0
      while (i < n1) {
        var j = i
        val tail = mutable.ListBuffer[T]()
        // Keep appending to the current tail until appended a head of size w1.
        while (j < n1 && nextHeads(j).size < w1) {
          tail ++= nextHeads(j)
          j += 1
        }
        if (j < n1) {
          tail ++= nextHeads(j)
          j += 1
        }
        partitions += new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail)
        partitionIndex += 1
        // Skip appended heads.
        i = j
      }
      // If the head of last partition has size w1, we also need to add this partition.
      if (nextHeads.last.size == w1) {
        partitions += new SlidingRDDPartition[T](partitionIndex, parentPartitions(n1), Seq.empty)
      }
      partitions.toArray
    }
  }

  // TODO: Override methods such as aggregate, which only requires one Spark job.
} 
Example 102
Source File: KafkaRDDPartition.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.kafka

import org.apache.spark.Partition


private[kafka]
class KafkaRDDPartition(
  val index: Int,
  val topic: String,
  val partition: Int,
  val fromOffset: Long,
  val untilOffset: Long,
  val host: String,
  val port: Int
) extends Partition 
Example 103
Source File: SampledRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.PoissonDistribution

import org.apache.spark.{Partition, TaskContext}

@deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0")
private[spark]
class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable {
  override val index: Int = prev.index
}

@deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0")
private[spark] class SampledRDD[T: ClassTag](
    prev: RDD[T],
    withReplacement: Boolean,
    frac: Double,
    seed: Int)
  extends RDD[T](prev) {

  override def getPartitions: Array[Partition] = {
    val rg = new Random(seed)
    firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = {
    val split = splitIn.asInstanceOf[SampledRDDPartition]
    if (withReplacement) {
      // For large datasets, the expected number of occurrences of each element in a sample with
      // replacement is Poisson(frac). We use that to get a count for each element.
      val poisson = new PoissonDistribution(frac)
      poisson.reseedRandomGenerator(split.seed)

      firstParent[T].iterator(split.prev, context).flatMap { element =>
        val count = poisson.sample()
        if (count == 0) {
          Iterator.empty  // Avoid object allocation when we return 0 items, which is quite often
        } else {
          Iterator.fill(count)(element)
        }
      }
    } else { // Sampling without replacement
      val rand = new Random(split.seed)
      firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac))
    }
  }
} 
Example 104
Source File: ZippedWithIndexRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.Utils

private[spark]
class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

private[spark]
class ZippedWithIndexRDD[T: ClassTag](@transient prev: RDD[T]) extends RDD[(T, Long)](prev) {
  @transient private val startIndices: Array[Long] = {
    val n = prev.partitions.length
    if (n == 0) {
      Array[Long]()
    } else if (n == 1) {
      Array(0L)
    } else {
      prev.context.runJob(
        prev,
        Utils.getIteratorSize _,
        0 until n - 1, // do not need to count the last partition
        allowLocal = false
      ).scanLeft(0L)(_ + _)
    }
  }

  override def getPartitions: Array[Partition] = {
    firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index)))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = {
    val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition]
    firstParent[T].iterator(split.prev, context).zipWithIndex.map { x =>
      (x._1, split.startIndex + x._2)
    }
  }
} 
Example 105
Source File: UnionRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient rdd: RDD[T],
    val parentRddIndex: Int,
    @transient parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](rdds.map(_.partitions.length).sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
Example 106
Source File: PartitionwiseSampledRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    @transient preservesPartitioning: Boolean,
    @transient seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
Example 107
Source File: PartitionerAwareUnionRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils

private[spark]
class PartitionerAwareUnionRDDPartition(
    @transient val rdds: Seq[RDD[_]],
    override val index: Int
  ) extends Partition {
  var parents = rdds.map(_.partitions(index)).toArray

  override def hashCode(): Int = index

  override def equals(other: Any): Boolean = super.equals(other)

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent partition at the time of task serialization
    parents = rdds.map(_.partitions(index)).toArray
    oos.defaultWriteObject()
  }
}

private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.length > 0)
  require(rdds.forall(_.partitioner.isDefined))
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  override val partitioner = rdds.head.partitioner

  override def getPartitions: Array[Partition] = {
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map(index => {
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }).toArray
  }

  // Get the location where most of the partitions of parent RDDs are located
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    val locations = rdds.zip(parentPartitions).flatMap {
      case (rdd, part) => {
        val parentLocations = currPrefLocs(rdd, part)
        logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations)
        parentLocations
      }
    }
    val location = if (locations.isEmpty) {
      None
    } else {
      // Find the location that maximum number of parent partitions prefer
      Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
    }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host)
  }
} 
Example 108
Source File: BinaryFileRDD.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.apache.hadoop.conf.{ Configurable, Configuration }
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._
import org.apache.spark.input.StreamFileInputFormat
import org.apache.spark.{ Partition, SparkContext }

private[spark] class BinaryFileRDD[T](
    sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    @transient conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = newJobContext(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
} 
Example 109
Source File: PartitionPruningRDDSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
} 
Example 110
Source File: HBaseSimpleRDD.scala    From spark-hbase-connector   with Apache License 2.0 5 votes vote down vote up
package it.nerdammer.spark.hbase

import it.nerdammer.spark.hbase.conversion.FieldReader
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.{NewHadoopRDD, RDD}
import org.apache.spark.{Partition, TaskContext}

import scala.reflect.ClassTag

class HBaseSimpleRDD[R: ClassTag](hadoopHBase: NewHadoopRDD[ImmutableBytesWritable, Result], builder: HBaseReaderBuilder[R], saltingLength: Int = 0)
                       (implicit mapper: FieldReader[R], saltingProvider: SaltingProviderFactory[String]) extends RDD[R](hadoopHBase) {

  override def getPartitions: Array[Partition] = firstParent[(ImmutableBytesWritable, Result)].partitions

  override def compute(split: Partition, context: TaskContext) = {
    // val cleanConversion = sc.clean ---> next version
    firstParent[(ImmutableBytesWritable, Result)].iterator(split, context)
      .map(e => conversion(e._1, e._2))
  }

  def conversion(key: ImmutableBytesWritable, row: Result) = {

    val columnNames = HBaseUtils.chosenColumns(builder.columns, mapper.columns)

    val columnNamesFC = HBaseUtils.columnsWithFamily(builder.columnFamily, columnNames)

    val columns = columnNamesFC
      .map(t => (Bytes.toBytes(t._1), Bytes.toBytes(t._2)))
      .map(t => if(row.containsColumn(t._1, t._2)) Some(CellUtil.cloneValue(row.getColumnLatestCell(t._1, t._2)).array) else None)
      .toList

    mapper.map(Some(key.get.drop(saltingLength)) :: columns)
  }
} 
Example 111
Source File: SlidingRDD.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.rdd

import scala.collection.mutable
import scala.reflect.ClassTag

import org.apache.spark.{TaskContext, Partition}
import org.apache.spark.rdd.RDD

private[mllib]
class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T])
  extends Partition with Serializable {
  override val index: Int = idx
}


private[mllib]
class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int)
  extends RDD[Array[T]](parent) {

  require(windowSize > 1, s"Window size must be greater than 1, but got $windowSize.")

  override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = {
    val part = split.asInstanceOf[SlidingRDDPartition[T]]
    (firstParent[T].iterator(part.prev, context) ++ part.tail)
      .sliding(windowSize)
      .withPartial(false)
      .map(_.toArray)
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev)

  override def getPartitions: Array[Partition] = {
    val parentPartitions = parent.partitions
    val n = parentPartitions.size
    if (n == 0) {
      Array.empty
    } else if (n == 1) {
      Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty))
    } else {
      val n1 = n - 1
      val w1 = windowSize - 1
      // Get the first w1 items of each partition, starting from the second partition.
      val nextHeads =
        parent.context.runJob(parent, (iter: Iterator[T]) => iter.take(w1).toArray, 1 until n)
      val partitions = mutable.ArrayBuffer[SlidingRDDPartition[T]]()
      var i = 0
      var partitionIndex = 0
      while (i < n1) {
        var j = i
        val tail = mutable.ListBuffer[T]()
        // Keep appending to the current tail until appended a head of size w1.
        while (j < n1 && nextHeads(j).size < w1) {
          tail ++= nextHeads(j)
          j += 1
        }
        if (j < n1) {
          tail ++= nextHeads(j)
          j += 1
        }
        partitions += new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail)
        partitionIndex += 1
        // Skip appended heads.
        i = j
      }
      // If the head of last partition has size w1, we also need to add this partition.
      if (nextHeads.last.size == w1) {
        partitions += new SlidingRDDPartition[T](partitionIndex, parentPartitions(n1), Seq.empty)
      }
      partitions.toArray
    }
  }

  // TODO: Override methods such as aggregate, which only requires one Spark job.
} 
Example 112
Source File: JDBCRelation.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.jdbc

import java.util.Properties

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Partition
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}

private[sql] case class JDBCPartitioningInfo(
    column: String,
    lowerBound: Long,
    upperBound: Long,
    numPartitions: Int)

private[sql] object JDBCRelation {
  def columnPartition(partitioning: JDBCPartitioningInfo): Array[Partition] = {
    if (partitioning == null) return Array[Partition](JDBCPartition(null, 0))

    val numPartitions = partitioning.numPartitions
    val column = partitioning.column
    if (numPartitions == 1) return Array[Partition](JDBCPartition(null, 0))
    // Overflow and silliness can happen if you subtract then divide.
    // Here we get a little roundoff, but that's (hopefully) OK.
    val stride: Long = (partitioning.upperBound / numPartitions
                      - partitioning.lowerBound / numPartitions)
    var i: Int = 0
    var currentValue: Long = partitioning.lowerBound
    var ans = new ArrayBuffer[Partition]()
    while (i < numPartitions) {
      val lowerBound = if (i != 0) s"$column >= $currentValue" else null
      currentValue += stride
      val upperBound = if (i != numPartitions - 1) s"$column < $currentValue" else null
      val whereClause =
        if (upperBound == null) {
          lowerBound
        } else if (lowerBound == null) {
          upperBound
        } else {
          s"$lowerBound AND $upperBound"
        }
      ans += JDBCPartition(whereClause, i)
      i = i + 1
    }
    ans.toArray
  }
}

private[sql] case class JDBCRelation(
    url: String,
    table: String,
    parts: Array[Partition],
    properties: Properties = new Properties())(@transient val sqlContext: SQLContext)
  extends BaseRelation
  with PrunedFilteredScan
  with InsertableRelation {

  override val needConversion: Boolean = false

  override val schema: StructType = JDBCRDD.resolveTable(url, table, properties)

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    val driver: String = DriverRegistry.getDriverClassName(url)
    // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row]
    JDBCRDD.scanTable(
      sqlContext.sparkContext,
      schema,
      driver,
      url,
      properties,
      table,
      requiredColumns,
      filters,
      parts).asInstanceOf[RDD[Row]]
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    data.write
      .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append)
      .jdbc(url, table, properties)
  }
} 
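columnPartition turns a (column, lowerBound, upperBound, numPartitions) spec into non-overlapping WHERE clauses, one per JDBCPartition. A worked sketch of the arithmetic, plus the public entry point that feeds it (the URL and table name are hypothetical, and a SQLContext named sqlContext is assumed):

// For column = "id", lowerBound = 0, upperBound = 100, numPartitions = 4:
//   stride = 100/4 - 0/4 = 25
//   partition 0: "id < 25"               (first slice has no lower bound)
//   partition 1: "id >= 25 AND id < 50"
//   partition 2: "id >= 50 AND id < 75"
//   partition 3: "id >= 75"              (last slice has no upper bound)
val orders = sqlContext.read.jdbc(
  "jdbc:postgresql://dbhost:5432/shop", "orders", "id",
  0L, 100L, 4, new java.util.Properties())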
Example 113
Source File: SampledRDD.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.PoissonDistribution

import org.apache.spark.{Partition, TaskContext}

@deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0")
private[spark]
class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable {
  override val index: Int = prev.index
}

@deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0")
private[spark] class SampledRDD[T: ClassTag](
    prev: RDD[T],
    withReplacement: Boolean,
    frac: Double,
    seed: Int)
  extends RDD[T](prev) {

  override def getPartitions: Array[Partition] = {
    val rg = new Random(seed)
    firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = {
    val split = splitIn.asInstanceOf[SampledRDDPartition]
    if (withReplacement) {
      // For large datasets, the expected number of occurrences of each element in a sample with
      // replacement is Poisson(frac). We use that to get a count for each element.
      val poisson = new PoissonDistribution(frac)
      poisson.reseedRandomGenerator(split.seed)

      firstParent[T].iterator(split.prev, context).flatMap { element =>
        val count = poisson.sample()
        if (count == 0) {
          Iterator.empty  // Avoid object allocation when we return 0 items, which is quite often
        } else {
          Iterator.fill(count)(element)
        }
      }
    } else { // Sampling without replacement
      val rand = new Random(split.seed)
      firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac))
    }
  }
} 
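SampledRDD is deprecated, but the public entry point is unchanged: RDD.sample, which now routes to PartitionwiseSampledRDD (Example 118 below). A quick sketch:

val data = sc.parallelize(1 to 1000)
// Poisson(fraction) occurrence counts with replacement,
// Bernoulli(fraction) keep/drop without replacement.
val bootstrap = data.sample(withReplacement = true, fraction = 0.1, seed = 42L)
val subset    = data.sample(withReplacement = false, fraction = 0.1, seed = 42L)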
Example 114
Source File: ZippedWithIndexRDD.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.Utils

private[spark]
class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

private[spark]
class ZippedWithIndexRDD[T: ClassTag](prev: RDD[T]) extends RDD[(T, Long)](prev) {
  @transient private val startIndices: Array[Long] = {
    val n = prev.partitions.length
    if (n == 0) {
      Array[Long]()
    } else if (n == 1) {
      Array(0L)
    } else {
      prev.context.runJob(
        prev,
        Utils.getIteratorSize _,
        0 until n - 1 // do not need to count the last partition
      ).scanLeft(0L)(_ + _)
    }
  }

  override def getPartitions: Array[Partition] = {
    firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index)))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = {
    val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition]
    firstParent[T].iterator(split.prev, context).zipWithIndex.map { x =>
      (x._1, split.startIndex + x._2)
    }
  }
} 
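The startIndices computation above is why RDD.zipWithIndex launches a Spark job before any action is called: it must count every partition except the last to know each partition's starting offset. Sketch:

val indexed = sc.parallelize(Seq("a", "b", "c"), 2).zipWithIndex()
// indexed.collect() == Array(("a", 0L), ("b", 1L), ("c", 2L))
// zipWithUniqueId() avoids the extra counting job, at the cost of ids
// that are unique but not consecutive.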
Example 115
Source File: MapPartitionsWithPreparationRDD.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.rdd

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Partition, Partitioner, TaskContext}


  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val prepared =
      if (preparedArguments.isEmpty) {
        preparePartition()
      } else {
        preparedArguments.remove(0)
      }
    val parentIterator = firstParent[T].iterator(partition, context)
    executePartition(context, partition.index, prepared, parentIterator)
  }
} 
Example 116
Source File: MapPartitionsRDD.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}


private[spark] class MapPartitionsRDD[U: ClassTag, T: ClassTag](
    prev: RDD[T],
    f: (TaskContext, Int, Iterator[T]) => Iterator[U],  // (TaskContext, partition index, iterator)
    preservesPartitioning: Boolean = false)
    // Here `this` is the previously created RDD (e.g. a HadoopRDD); MapPartitionsRDD's
    // constructor calls the superclass constructor RDD[U](prev), so that RDD is bound
    // to prev and execution proceeds through RDD.scala.
  extends RDD[U](prev) {

  override val partitioner = if (preservesPartitioning) firstParent[T].partitioner else None
  // firstParent returns the first parent RDD in the dependency list.
  override def getPartitions: Array[Partition] = firstParent[T].partitions
  // Find the parent RDD via firstParent, then feed its iterator to f.
  override def compute(split: Partition, context: TaskContext): Iterator[U] =
    f(context, split.index, firstParent[T].iterator(split, context))
} 
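MapPartitionsRDD is the workhorse behind map, filter, flatMap and the mapPartitions* family; the closure f receives the TaskContext, the partition index, and the parent iterator. The index is directly observable through mapPartitionsWithIndex:

val tagged = sc.parallelize(1 to 4, 2).mapPartitionsWithIndex { (idx, iter) =>
  // idx is the same split.index that compute passes to f above
  iter.map(x => s"partition $idx: $x")
}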
Example 117
Source File: UnionRDD.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient rdd: RDD[T],
    val parentRddIndex: Int,
    @transient parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](rdds.map(_.partitions.length).sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
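Each RangeDependency maps a contiguous block of the union's partition indices back onto one parent, so a union never shuffles; it simply concatenates partitions:

val a = sc.parallelize(1 to 3, 2)  // becomes partitions 0-1 of the union
val b = sc.parallelize(4 to 6, 3)  // becomes partitions 2-4 of the union
val u = a union b
assert(u.partitions.length == 5)   // 2 + 3, nothing is merged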
Example 118
Source File: PartitionwiseSampledRDD.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    @transient preservesPartitioning: Boolean,
    @transient seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
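Besides RDD.sample, this class also underlies RDD.randomSplit, which builds one PartitionwiseSampledRDD per weight, with range samplers sharing a single seed so the splits are disjoint and exhaustive:

val Array(train, test) =
  sc.parallelize(1 to 100).randomSplit(Array(0.8, 0.2), seed = 7L)
// Every element lands in exactly one of the two splits.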
Example 119
Source File: CheckpointRDD.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partition, SparkContext, TaskContext}


private[spark] abstract class CheckpointRDD[T: ClassTag](@transient sc: SparkContext)
  extends RDD[T](sc, Nil) {

  // CheckpointRDD should not be checkpointed again
  override def doCheckpoint(): Unit = { }
  override def checkpoint(): Unit = { }
  // this.type is the type of the current object; `this` refers to the current instance
  override def localCheckpoint(): this.type = this

  // Note: There is a bug in MiMa that complains about `AbstractMethodProblem`s in the
  // base [[org.apache.spark.rdd.RDD]] class if we do not override the following methods.
  // scalastyle:off
  protected override def getPartitions: Array[Partition] = ???
  override def compute(p: Partition, tc: TaskContext): Iterator[T] = ???
  // scalastyle:on

} 
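Concrete subclasses such as ReliableCheckpointRDD and LocalCheckpointRDD supply real getPartitions/compute implementations; user code only sees the flow below (the checkpoint directory is hypothetical):

sc.setCheckpointDir("/tmp/spark-checkpoints")
val rdd = sc.parallelize(1 to 100).map(_ * 2)
rdd.checkpoint()   // only marks the RDD; nothing is written yet
rdd.count()        // the first action materializes the checkpoint
assert(rdd.isCheckpointed)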
Example 120
Source File: BinaryFileRDD.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.hadoop.conf.{ Configurable, Configuration }
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._
import org.apache.spark.input.StreamFileInputFormat
import org.apache.spark.{ Partition, SparkContext }

private[spark] class BinaryFileRDD[T](
    sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    @transient conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = newJobContext(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
} 
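BinaryFileRDD is constructed by SparkContext.binaryFiles, which pairs each file path with a lazily opened PortableDataStream (the input path is hypothetical):

val files = sc.binaryFiles("hdfs:///data/blobs", minPartitions = 8)
val sizes = files.mapValues(_.toArray().length)  // bytes per file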
Example 121
Source File: PartitionPruningRDDSuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {//修剪的分区设置的正确性

    val rdd = new RDD[Int](sc, Nil) { // Nil: the dependency list is empty
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {//修剪分区可以联合
  //列表结尾为Nil
    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
} 
Example 122
Source File: HBasePartition.scala    From Heracles   with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.regionserver.RegionScanner
import org.apache.spark.Partition
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.expressions.PartialPredicateOperations._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.hbase.types.HBaseBytesType
import org.apache.spark.sql.types.Range

import scala.annotation.meta.param

private[hbase] class HBasePartition(
                                     val idx: Int, val mappedIndex: Int,
                                     start: Option[HBaseRawType] = None,
                                     end: Option[HBaseRawType] = None,
                                     val server: Option[String] = None,
                                     val filterPredicates: Option[Expression] = None,
                                     @(transient @param) relation: HBaseRelation = null,
                                     @(transient @param) val newScanner:RegionScanner = null)
  extends Range[HBaseRawType](start, true, end, false, HBaseBytesType)
  with Partition with IndexMappable with Logging {

  override def index: Int = idx

  override def hashCode(): Int = idx

  @transient lazy val startNative: Seq[Any] = relation.nativeKeyConvert(start)

  @transient lazy val endNative: Seq[Any] = relation.nativeKeyConvert(end)

  
  def computePredicate(relation: HBaseRelation): Option[Expression] = {
    val predicate = if (filterPredicates.isDefined &&
      filterPredicates.get.references.exists(_.exprId == relation.partitionKeys.head.exprId)) {
      val oriPredicate = filterPredicates.get
      val predicateReferences = oriPredicate.references.toSeq
      val boundReference = BindReferences.bindReference(oriPredicate, predicateReferences)
      val row = new GenericInternalRow(predicateReferences.size)
      var rowIndex = 0
      var i = 0
      var range: Range[_] = null
      while (i < relation.keyColumns.size) {
        range = relation.generateRange(this, oriPredicate, i)
        if (range != null) {
          rowIndex = relation.rowIndex(predicateReferences, i)
          if (rowIndex >= 0) row.update(rowIndex, range)
          // if the non-last dimension range is not point, do not proceed to the next dims
          if (i < relation.keyColumns.size - 1 && !range.isPoint) i = relation.keyColumns.size
          else i = i + 1
        } else i = relation.keyColumns.size
      }
      val pr = boundReference.partialReduce(row, predicateReferences)
      pr match {
        case (null, e: Expression) => Some(e)
        case (true, _) => None
        case (false, _) => Some(Literal(false))
      }
    } else filterPredicates
    logInfo(predicate.toString)
    predicate
  }

  override def toString = {
    s"HBasePartition: $idx, $mappedIndex, [$start, $end), $filterPredicates"
  }
} 
Example 123
Source File: HBasePartition.scala    From Spark-SQL-on-HBase   with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.hadoop.hbase.regionserver.RegionScanner
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.hbase.catalyst.expressions.PartialPredicateOperations._
import org.apache.spark.sql.hbase.types.{HBaseBytesType, Range}
import org.apache.spark.{Logging, Partition}


private[hbase] class HBasePartition(
                                     val idx: Int, val mappedIndex: Int,
                                     start: Option[HBaseRawType] = None,
                                     end: Option[HBaseRawType] = None,
                                     val server: Option[String] = None,
                                     val filterPredicates: Option[Expression] = None,
                                     @transient relation: HBaseRelation = null,
                                     @transient val newScanner:RegionScanner = null)
  extends Range[HBaseRawType](start, true, end, false, HBaseBytesType)
  with Partition with IndexMappable with Logging {

  override def index: Int = idx

  override def hashCode(): Int = idx

  @transient lazy val startNative: Seq[Any] = relation.nativeKeyConvert(start)

  @transient lazy val endNative: Seq[Any] = relation.nativeKeyConvert(end)

  
  def computePredicate(relation: HBaseRelation): Option[Expression] = {
    val predicate = if (filterPredicates.isDefined &&
      filterPredicates.get.references.exists(_.exprId == relation.partitionKeys.head.exprId)) {
      val oriPredicate = filterPredicates.get
      val predicateReferences = oriPredicate.references.toSeq
      val boundReference = BindReferences.bindReference(oriPredicate, predicateReferences)
      val row = new GenericMutableRow(predicateReferences.size)
      var rowIndex = 0
      var i = 0
      var range: Range[_] = null
      while (i < relation.keyColumns.size) {
        range = relation.generateRange(this, oriPredicate, i)
        if (range != null) {
          rowIndex = relation.rowIndex(predicateReferences, i)
          if (rowIndex >= 0) row.update(rowIndex, range)
          // if the non-last dimension range is not point, do not proceed to the next dims
          if (i < relation.keyColumns.size - 1 && !range.isPoint) i = relation.keyColumns.size
          else i = i + 1
        } else i = relation.keyColumns.size
      }
      val pr = boundReference.partialReduce(row, predicateReferences)
      pr match {
        case (null, e: Expression) => Some(e)
        case (true, _) => None
        case (false, _) => Some(Literal(false))
      }
    } else filterPredicates
    logInfo(predicate.toString)
    predicate
  }

  override def toString = {
    s"HBasePartition: $idx, $mappedIndex, [$start, $end), $filterPredicates"
  }
} 
Example 124
Source File: ClassRDDPartitioner.scala    From spark-orientdb-connector   with Apache License 2.0
package com.metreta.spark.orientdb.connector.rdd.partitioner

import scala.collection.JavaConversions.iterableAsScalaIterable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Logging
import org.apache.spark.Partition

import com.metreta.spark.orientdb.connector.api.OrientDBConnector
import com.orientechnologies.orient.core.metadata.schema.OClass
import com.orientechnologies.orient.core.metadata.schema.OSchema
import com.orientechnologies.orient.core.storage.OStorage
import com.metreta.spark.orientdb.connector.SystemTables


  def getPartitions(): Array[Partition] = {
    val db = connector.databaseDocumentTx()

    val partitions = new ArrayBuffer[OrientPartition]
    val schema: OSchema = connector.getSchema(db)
    val klass: OClass = schema.getClass(mClass)
    val storage: OStorage = connector.getStorage(db)
    klass.getClusterIds.zipWithIndex foreach {
      case (clusterId, index) => partitions += OrientPartition(
        index,
        null, // TODO: host address of the cluster is not resolved here
        PartitionName(klass.getName, storage.getClusterById(clusterId).getName))
    }
    partitions.toArray
  }

} 
Example 125
Source File: SlidingRDD.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.mllib.rdd

import scala.collection.mutable
import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD

private[mllib]
class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T], val offset: Int)
  extends Partition with Serializable {
  override val index: Int = idx
}


private[mllib]
class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int, val step: Int)
  extends RDD[Array[T]](parent) {

  require(windowSize > 0 && step > 0 && !(windowSize == 1 && step == 1),
    "Window size and step must be greater than 0, " +
      s"and they cannot be both 1, but got windowSize = $windowSize and step = $step.")

  override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = {
    val part = split.asInstanceOf[SlidingRDDPartition[T]]
    (firstParent[T].iterator(part.prev, context) ++ part.tail)
      .drop(part.offset)
      .sliding(windowSize, step)
      .withPartial(false)
      .map(_.toArray)
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev)

  override def getPartitions: Array[Partition] = {
    val parentPartitions = parent.partitions
    val n = parentPartitions.length
    if (n == 0) {
      Array.empty
    } else if (n == 1) {
      Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty, 0))
    } else {
      val w1 = windowSize - 1
      // Get partition sizes and first w1 elements.
      val (sizes, heads) = parent.mapPartitions { iter =>
        val w1Array = iter.take(w1).toArray
        Iterator.single((w1Array.length + iter.length, w1Array))
      }.collect().unzip
      val partitions = mutable.ArrayBuffer.empty[SlidingRDDPartition[T]]
      var i = 0
      var cumSize = 0
      var partitionIndex = 0
      while (i < n) {
        val mod = cumSize % step
        val offset = if (mod == 0) 0 else step - mod
        val size = sizes(i)
        if (offset < size) {
          val tail = mutable.ListBuffer.empty[T]
          // Keep appending to the current tail until it has w1 elements.
          var j = i + 1
          while (j < n && tail.length < w1) {
            tail ++= heads(j).take(w1 - tail.length)
            j += 1
          }
          if (sizes(i) + tail.length >= offset + windowSize) {
            partitions +=
              new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail, offset)
            partitionIndex += 1
          }
        }
        cumSize += size
        i += 1
      }
      partitions.toArray
    }
  }

  // TODO: Override methods such as aggregate, which only requires one Spark job.
} 
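Compared with the older SlidingRDD above (Example 111), this version adds a step parameter, hence the offset bookkeeping: cumSize counts the elements preceding each partition so that every partition knows where the global window stride resumes inside it. A worked sketch:

import org.apache.spark.mllib.rdd.RDDFunctions._

// 1 to 7 in 2 partitions (sizes 3 and 4), windowSize = 3, step = 2:
//   partition 0: cumSize = 0, offset = 0, windows start at elements 1 and 3
//   partition 1: cumSize = 3, mod = 3 % 2 = 1, offset = 2 - 1 = 1,
//                so it skips one element and resumes the global stride
val windows = sc.parallelize(1 to 7, 2).sliding(3, 2).collect()
// windows: Array(Array(1, 2, 3), Array(3, 4, 5), Array(5, 6, 7))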
Example 126
Source File: DataSourceRDD.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.v2

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.v2.reader.DataReaderFactory

class DataSourceRDDPartition[T : ClassTag](val index: Int, val readerFactory: DataReaderFactory[T])
  extends Partition with Serializable

class DataSourceRDD[T: ClassTag](
    sc: SparkContext,
    @transient private val readerFactories: java.util.List[DataReaderFactory[T]])
  extends RDD[T](sc, Nil) {

  override protected def getPartitions: Array[Partition] = {
    readerFactories.asScala.zipWithIndex.map {
      case (readerFactory, index) => new DataSourceRDDPartition(index, readerFactory)
    }.toArray
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    val reader = split.asInstanceOf[DataSourceRDDPartition[T]].readerFactory.createDataReader()
    context.addTaskCompletionListener(_ => reader.close())
    val iter = new Iterator[T] {
      private[this] var valuePrepared = false

      override def hasNext: Boolean = {
        if (!valuePrepared) {
          valuePrepared = reader.next()
        }
        valuePrepared
      }

      override def next(): T = {
        if (!hasNext) {
          throw new java.util.NoSuchElementException("End of stream")
        }
        valuePrepared = false
        reader.get()
      }
    }
    new InterruptibleIterator(context, iter)
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    split.asInstanceOf[DataSourceRDDPartition[T]].readerFactory.preferredLocations()
  }
} 
Example 127
Source File: StateStoreRDD.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.streaming.state

import java.util.UUID

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.internal.SessionState
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration


  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val stateStoreProviderId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index),
      queryRunId)
    storeCoordinator.flatMap(_.getLocation(stateStoreProviderId)).toSeq
  }

  override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = {
    var store: StateStore = null
    val storeProviderId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index),
      queryRunId)

    store = StateStore.get(
      storeProviderId, keySchema, valueSchema, indexOrdinal, storeVersion,
      storeConf, hadoopConfBroadcast.value.value)
    val inputIter = dataRDD.iterator(partition, ctxt)
    storeUpdateFunction(store, inputIter)
  }
} 
Example 128
Source File: WholeTextFileRDD.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat


private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val conf = getConf
    // setMinPartitions below will call FileInputFormat.listStatus(), which can be quite slow when
    // traversing a large number of directories and files. Parallelize it.
    conf.setIfUnset(FileInputFormat.LIST_STATUS_NUM_THREADS,
      Runtime.getRuntime.availableProcessors().toString)
    val inputFormat = inputFormatClass.newInstance
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
} 
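WholeTextFileRDD backs SparkContext.wholeTextFiles, which reads each file as a single (path, contents) record rather than line by line (the input path is hypothetical):

val docs = sc.wholeTextFiles("hdfs:///data/docs", minPartitions = 4)
val lineCounts = docs.mapValues(_.split("\n").length)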
Example 129
Source File: ZippedWithIndexRDD.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.Utils

private[spark]
class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

private[spark]
class ZippedWithIndexRDD[T: ClassTag](prev: RDD[T]) extends RDD[(T, Long)](prev) {
  @transient private val startIndices: Array[Long] = {
    val n = prev.partitions.length
    if (n == 0) {
      Array.empty
    } else if (n == 1) {
      Array(0L)
    } else {
      prev.context.runJob(
        prev,
        Utils.getIteratorSize _,
        0 until n - 1 // do not need to count the last partition
      ).scanLeft(0L)(_ + _)
    }
  }

  override def getPartitions: Array[Partition] = {
    firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index)))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = {
    val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition]
    val parentIter = firstParent[T].iterator(split.prev, context)
    Utils.getIteratorZipWithIndex(parentIter, split.startIndex)
  }
} 
Example 130
Source File: UnionRDD.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.ForkJoinTaskSupport
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
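The difference from Example 117 is the parallel partition listing: once more than spark.rdd.parallelListingThreshold (default 10) RDDs are unioned, their partition arrays are evaluated on a shared 8-thread ForkJoinPool rather than serially. Sketch:

val many = (1 to 20).map(i => sc.parallelize(Seq(i), 1))
val all = sc.union(many)             // 20 parents > threshold: listed in parallel
assert(all.partitions.length == 20)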
Example 131
Source File: PartitionwiseSampledRDD.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.RandomSampler

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
Example 132
Source File: PartitionerAwareUnionRDD.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils


private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.nonEmpty)
  require(rdds.forall(_.partitioner.isDefined))
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  override val partitioner = rdds.head.partitioner

  override def getPartitions: Array[Partition] = {
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map { index =>
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }.toArray
  }

  // Get the location where most of the partitions of parent RDDs are located
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    val locations = rdds.zip(parentPartitions).flatMap {
      case (rdd, part) =>
        val parentLocations = currPrefLocs(rdd, part)
        logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations)
        parentLocations
    }
    val location = if (locations.isEmpty) {
      None
    } else {
      // Find the location that maximum number of parent partitions prefer
      Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
    }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host)
  }
} 
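SparkContext.union picks this RDD over the plain UnionRDD when every input shares the same partitioner (the require checks above enforce exactly that), so the result keeps the partitioner and its partition count instead of concatenating. A sketch:

import org.apache.spark.HashPartitioner

val p = new HashPartitioner(4)
val x = sc.parallelize(Seq(1 -> "a", 2 -> "b")).partitionBy(p)
val y = sc.parallelize(Seq(3 -> "c", 4 -> "d")).partitionBy(p)
val u = x union y
assert(u.partitions.length == 4)     // not 8: inputs are co-partitioned
assert(u.partitioner == Some(p))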
Example 133
Source File: BinaryFileRDD.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.StreamFileInputFormat

private[spark] class BinaryFileRDD[T](
    @transient private val sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val conf = getConf
    // setMinPartitions below will call FileInputFormat.listStatus(), which can be quite slow when
    // traversing a large number of directories and files. Parallelize it.
    conf.setIfUnset(FileInputFormat.LIST_STATUS_NUM_THREADS,
      Runtime.getRuntime.availableProcessors().toString)
    val inputFormat = inputFormatClass.newInstance
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(sc, jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
} 
Example 134
Source File: FakeTask.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.scheduler

import java.util.Properties

import org.apache.spark.{Partition, SparkEnv, TaskContext}
import org.apache.spark.executor.TaskMetrics

class FakeTask(
    stageId: Int,
    partitionId: Int,
    prefLocs: Seq[TaskLocation] = Nil,
    serializedTaskMetrics: Array[Byte] =
      SparkEnv.get.closureSerializer.newInstance().serialize(TaskMetrics.registered).array())
  extends Task[Int](stageId, 0, partitionId, new Properties, serializedTaskMetrics) {

  override def runTask(context: TaskContext): Int = 0
  override def preferredLocations: Seq[TaskLocation] = prefLocs
}

object FakeTask {
  
  def createTaskSet(numTasks: Int, prefLocs: Seq[TaskLocation]*): TaskSet = {
    createTaskSet(numTasks, stageAttemptId = 0, prefLocs: _*)
  }

  def createTaskSet(numTasks: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*): TaskSet = {
    createTaskSet(numTasks, stageId = 0, stageAttemptId, prefLocs: _*)
  }

  def createTaskSet(numTasks: Int, stageId: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*):
  TaskSet = {
    if (prefLocs.size != 0 && prefLocs.size != numTasks) {
      throw new IllegalArgumentException("Wrong number of task locations")
    }
    val tasks = Array.tabulate[Task[_]](numTasks) { i =>
      new FakeTask(stageId, i, if (prefLocs.size != 0) prefLocs(i) else Nil)
    }
    new TaskSet(tasks, stageId, stageAttemptId, priority = 0, null)
  }

  def createShuffleMapTaskSet(
      numTasks: Int,
      stageId: Int,
      stageAttemptId: Int,
      prefLocs: Seq[TaskLocation]*): TaskSet = {
    if (prefLocs.size != 0 && prefLocs.size != numTasks) {
      throw new IllegalArgumentException("Wrong number of task locations")
    }
    val tasks = Array.tabulate[Task[_]](numTasks) { i =>
      new ShuffleMapTask(stageId, stageAttemptId, null, new Partition {
        override def index: Int = i
      }, prefLocs(i), new Properties,
        SparkEnv.get.closureSerializer.newInstance().serialize(TaskMetrics.registered).array())
    }
    new TaskSet(tasks, stageId, stageAttemptId, priority = 0, null)
  }
} 
Example 135
Source File: PartitionPruningRDDSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
} 
Example 136
Source File: StarryRDD.scala    From starry   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SparkContext, TaskContext}

import scala.reflect.ClassTag


class StarryRDD[T: ClassTag](sc: SparkContext,
                             rddName: String,
                             @transient private var data: Seq[T]
                            ) extends RDD[T](sc, Nil) {
  def this (sc: SparkContext, data: Seq[T]) = {
    this (sc, getClass.getSimpleName, data)
  }

  setName(rddName)

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    split.asInstanceOf[ParallelCollectionPartition[T]].iterator
  }

  def updateData(data: Seq[T]): Unit = {
    this.data = data
    this.markCheckpointed()
  }

  override protected def getPartitions: Array[Partition] = {
    Array(new ParallelCollectionPartition(id, 0, data))
  }
} 
Example 137
Source File: SparkPlanExecutor.scala    From starry   with Apache License 2.0
package com.github.passionke.starry

import org.apache.spark.{Partition, StarryTaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.{ReuseSubquery, SparkPlan}


object SparkPlanExecutor {

  def exec(plan: SparkPlan, sparkSession: SparkSession) = {
    val newPlan = Seq(
      ReuseSubquery(sparkSession.sessionState.conf))
      .foldLeft(plan) { case (sp, rule) => rule.apply(sp) }
    doExec(newPlan)
  }

  def firstPartition(rdd: RDD[InternalRow]): Partition = {
    rdd.partitions.head
  }

  def doExec(sparkPlan: SparkPlan): List[InternalRow] = {
    val rdd = sparkPlan.execute().map(ite => ite.copy())
    val partition = firstPartition(rdd)
    rdd.compute(partition, new StarryTaskContext).toList
  }

  def rddCompute(rdd: RDD[InternalRow]): List[InternalRow] = {
    val partition = firstPartition(rdd)
    rdd.compute(partition, new StarryTaskContext).toList
  }

} 
Example 138
Source File: SlidingRDD.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.rdd

import scala.collection.mutable
import scala.reflect.ClassTag

import org.apache.spark.{TaskContext, Partition}
import org.apache.spark.rdd.RDD

private[mllib]
class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T], val offset: Int)
  extends Partition with Serializable {
  override val index: Int = idx
}


private[mllib]
class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int, val step: Int)
  extends RDD[Array[T]](parent) {

  require(windowSize > 0 && step > 0 && !(windowSize == 1 && step == 1),
    "Window size and step must be greater than 0, " +
      s"and they cannot be both 1, but got windowSize = $windowSize and step = $step.")

  override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = {
    val part = split.asInstanceOf[SlidingRDDPartition[T]]
    (firstParent[T].iterator(part.prev, context) ++ part.tail)
      .drop(part.offset)
      .sliding(windowSize, step)
      .withPartial(false)
      .map(_.toArray)
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev)

  override def getPartitions: Array[Partition] = {
    val parentPartitions = parent.partitions
    val n = parentPartitions.length
    if (n == 0) {
      Array.empty
    } else if (n == 1) {
      Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty, 0))
    } else {
      val w1 = windowSize - 1
      // Get partition sizes and first w1 elements.
      val (sizes, heads) = parent.mapPartitions { iter =>
        val w1Array = iter.take(w1).toArray
        Iterator.single((w1Array.length + iter.length, w1Array))
      }.collect().unzip
      val partitions = mutable.ArrayBuffer.empty[SlidingRDDPartition[T]]
      var i = 0
      var cumSize = 0
      var partitionIndex = 0
      while (i < n) {
        val mod = cumSize % step
        val offset = if (mod == 0) 0 else step - mod
        val size = sizes(i)
        if (offset < size) {
          val tail = mutable.ListBuffer.empty[T]
          // Keep appending to the current tail until it has w1 elements.
          var j = i + 1
          while (j < n && tail.length < w1) {
            tail ++= heads(j).take(w1 - tail.length)
            j += 1
          }
          if (sizes(i) + tail.length >= offset + windowSize) {
            partitions +=
              new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail, offset)
            partitionIndex += 1
          }
        }
        cumSize += size
        i += 1
      }
      partitions.toArray
    }
  }

  // TODO: Override methods such as aggregate, which only requires one Spark job.
} 
Example 139
Source File: JDBCRelation.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.jdbc

import java.util.Properties

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Partition
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}

private[sql] case class JDBCPartitioningInfo(
    column: String,
    lowerBound: Long,
    upperBound: Long,
    numPartitions: Int)

private[sql] object JDBCRelation {
  def columnPartition(partitioning: JDBCPartitioningInfo): Array[Partition] = {
    if (partitioning == null) return Array[Partition](JDBCPartition(null, 0))

    val numPartitions = partitioning.numPartitions
    val column = partitioning.column
    if (numPartitions == 1) return Array[Partition](JDBCPartition(null, 0))
    // Overflow and silliness can happen if you subtract then divide.
    // Here we get a little roundoff, but that's (hopefully) OK.
    val stride: Long = (partitioning.upperBound / numPartitions
                      - partitioning.lowerBound / numPartitions)
    var i: Int = 0
    var currentValue: Long = partitioning.lowerBound
    var ans = new ArrayBuffer[Partition]()
    while (i < numPartitions) {
      val lowerBound = if (i != 0) s"$column >= $currentValue" else null
      currentValue += stride
      val upperBound = if (i != numPartitions - 1) s"$column < $currentValue" else null
      val whereClause =
        if (upperBound == null) {
          lowerBound
        } else if (lowerBound == null) {
          upperBound
        } else {
          s"$lowerBound AND $upperBound"
        }
      ans += JDBCPartition(whereClause, i)
      i = i + 1
    }
    ans.toArray
  }
}

private[sql] case class JDBCRelation(
    url: String,
    table: String,
    parts: Array[Partition],
    properties: Properties = new Properties())(@transient val sqlContext: SQLContext)
  extends BaseRelation
  with PrunedFilteredScan
  with InsertableRelation {

  override val needConversion: Boolean = false

  override val schema: StructType = JDBCRDD.resolveTable(url, table, properties)

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row]
    JDBCRDD.scanTable(
      sqlContext.sparkContext,
      schema,
      url,
      properties,
      table,
      requiredColumns,
      filters,
      parts).asInstanceOf[RDD[Row]]
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    data.write
      .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append)
      .jdbc(url, table, properties)
  }
} 
Example 140
Source File: SampledRDD.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.PoissonDistribution

import org.apache.spark.{Partition, TaskContext}

@deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0")
private[spark]
class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable {
  override val index: Int = prev.index
}

@deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0")
private[spark] class SampledRDD[T: ClassTag](
    prev: RDD[T],
    withReplacement: Boolean,
    frac: Double,
    seed: Int)
  extends RDD[T](prev) {

  override def getPartitions: Array[Partition] = {
    val rg = new Random(seed)
    firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = {
    val split = splitIn.asInstanceOf[SampledRDDPartition]
    if (withReplacement) {
      // For large datasets, the expected number of occurrences of each element in a sample with
      // replacement is Poisson(frac). We use that to get a count for each element.
      val poisson = new PoissonDistribution(frac)
      poisson.reseedRandomGenerator(split.seed)

      firstParent[T].iterator(split.prev, context).flatMap { element =>
        val count = poisson.sample()
        if (count == 0) {
          Iterator.empty  // Avoid object allocation when we return 0 items, which is quite often
        } else {
          Iterator.fill(count)(element)
        }
      }
    } else { // Sampling without replacement
      val rand = new Random(split.seed)
      firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac))
    }
  }
} 
Example 141
Source File: WholeTextFileRDD.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat


private[spark] class WholeTextFileRDD(
    sc : SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = newJobContext(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
} 
Example 142
Source File: ZippedWithIndexRDD.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.Utils

private[spark]
class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

private[spark]
class ZippedWithIndexRDD[T: ClassTag](prev: RDD[T]) extends RDD[(T, Long)](prev) {
  @transient private val startIndices: Array[Long] = {
    val n = prev.partitions.length
    if (n == 0) {
      Array[Long]()
    } else if (n == 1) {
      Array(0L)
    } else {
      prev.context.runJob(
        prev,
        Utils.getIteratorSize _,
        0 until n - 1 // do not need to count the last partition
      ).scanLeft(0L)(_ + _)
    }
  }

  override def getPartitions: Array[Partition] = {
    firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index)))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = {
    val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition]
    firstParent[T].iterator(split.prev, context).zipWithIndex.map { x =>
      (x._1, split.startIndex + x._2)
    }
  }
} 
Example 143
Source File: MemoryCheckpointRDD.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.storage.RDDBlockId
import org.apache.spark.{Partition, SparkContext, SparkException, TaskContext}

import scala.reflect.ClassTag

// We use a different class than LocalCheckpointRDD, but with the same functionality,
// so that we can easily identify (e.g., pattern match on) this class in the DAGScheduler.
class MemoryCheckpointRDD[T: ClassTag](sc: SparkContext, rddId: Int, numPartitions: Int)
  extends LocalCheckpointRDD[T](sc, rddId, numPartitions) {

  def this(rdd: RDD[T]) {
    this(rdd.context, rdd.id, rdd.partitions.size)
  }

  
  override def compute(partition: Partition, context: TaskContext): Iterator[T] = {
    throw new SparkException(
      s"Checkpoint block ${RDDBlockId(rddId, partition.index)} not found! Either the executor " +
        s"that originally checkpointed this partition is no longer alive, or the original RDD is " +
        s"unpersisted. If this problem persists, you may consider using `rdd.checkpoint()` " +
        s"or `rdd.localcheckpoint()` instead, which are slower than memory checkpointing but more fault-tolerant.")
  }
} 
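The compute override only fires when a checkpointed block has been lost from executor memory, mirroring LocalCheckpointRDD. The user-facing counterpart in stock Spark is localCheckpoint:

val rdd = sc.parallelize(1 to 100).map(_ + 1)
rdd.localCheckpoint()  // truncate the lineage; blocks live only on executors
rdd.count()            // materializes the local checkpoint
// If an executor holding a block later dies, recomputation fails with an
// error like the one thrown above.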
Example 144
Source File: UnionRDD.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils


private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](rdds.map(_.partitions.length).sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
} 
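A minimal usage sketch: union is lineage-only, so the UnionRDD above simply concatenates its parents' partitions through RangeDependency objects, with no shuffle.

import org.apache.spark.{SparkConf, SparkContext}

object UnionExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("union").setMaster("local[2]"))
    val a = sc.parallelize(1 to 3, 2)
    val b = sc.parallelize(4 to 6, 3)
    val u = a.union(b)           // 2 + 3 = 5 partitions, one UnionPartition each
    println(u.getNumPartitions)  // 5
    u.collect().foreach(println)
    sc.stop()
  }
}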
Example 145
Source File: PartitionwiseSampledRDD.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark]
class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}


private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
} 
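RDD.sample is the usual entry point for this RDD: it wraps the parent in a PartitionwiseSampledRDD, giving each partition its own seed derived from the global one. A minimal sketch:

import org.apache.spark.{SparkConf, SparkContext}

object SampleExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("sample").setMaster("local[2]"))
    val rdd = sc.parallelize(1 to 1000, 4)
    // Sampling without replacement keeps ~10% of rows; the fixed global seed makes
    // the per-partition seeds (and therefore the sample) reproducible across runs.
    val sampled = rdd.sample(withReplacement = false, fraction = 0.1, seed = 42L)
    println(sampled.count())
    sc.stop()
  }
}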
Example 146
Source File: PartitionerAwareUnionRDD.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils


private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.length > 0)
  require(rdds.forall(_.partitioner.isDefined))
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  override val partitioner = rdds.head.partitioner

  override def getPartitions: Array[Partition] = {
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map(index => {
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }).toArray
  }

  // Get the location where most of the partitions of parent RDDs are located
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    val locations = rdds.zip(parentPartitions).flatMap {
      case (rdd, part) => {
        val parentLocations = currPrefLocs(rdd, part)
        logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations)
        parentLocations
      }
    }
    val location = if (locations.isEmpty) {
      None
    } else {
      // Find the location that maximum number of parent partitions prefer
      Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
    }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host)
  }
} 
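SparkContext.union selects this RDD whenever every input shares the same partitioner, so the result keeps that partitioner and unions partition i of each parent without a shuffle. A minimal sketch:

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object PartitionerAwareUnionExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("pa-union").setMaster("local[2]"))
    val p = new HashPartitioner(4)
    val a = sc.parallelize(Seq(1 -> "a", 2 -> "b")).partitionBy(p)
    val b = sc.parallelize(Seq(1 -> "x", 3 -> "y")).partitionBy(p)
    // Both parents use the same HashPartitioner, so sc.union builds a
    // PartitionerAwareUnionRDD rather than a plain UnionRDD.
    val u = sc.union(Seq(a, b))
    println(u.partitioner)        // Some(HashPartitioner): partitioner preserved
    println(u.getNumPartitions)   // 4, not 8
    sc.stop()
  }
}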
Example 147
Source File: HazelcastRDD.scala    From hazelcast-spark   with Apache License 2.0 4 votes vote down vote up
package com.hazelcast.spark.connector.rdd

import com.hazelcast.client.HazelcastClientNotActiveException
import com.hazelcast.client.cache.impl.ClientCacheProxy
import com.hazelcast.client.proxy.ClientMapProxy
import com.hazelcast.core.{HazelcastInstance, Partition => HazelcastPartition}
import com.hazelcast.spark.connector.conf.SerializableConf
import com.hazelcast.spark.connector.iterator.{CacheIterator, MapIterator}
import com.hazelcast.spark.connector.util.ConnectionUtil.{closeHazelcastConnection, getHazelcastConnection}
import com.hazelcast.spark.connector.util.HazelcastUtil._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, SparkContext, TaskContext}

import scala.collection.JavaConversions._
import scala.util.Try


class HazelcastRDD[K, V](@transient val sc: SparkContext, val hzName: String,
                         val isCache: Boolean, val config: SerializableConf) extends RDD[(K, V)](sc, Seq.empty) {

  @transient lazy val hazelcastPartitions: scala.collection.mutable.Map[Int, String] = {
    val client: HazelcastInstance = getHazelcastConnection(config.serverAddresses, id, config)
    val partitions: scala.collection.mutable.Map[Int, String] = scala.collection.mutable.Map[Int, String]()
    client.getPartitionService.getPartitions.foreach { p =>
      partitions.put(p.getPartitionId, p.getOwner.getAddress.getHost + ":" + p.getOwner.getAddress.getPort)
    }
    closeHazelcastConnection(config.serverAddresses, id)
    partitions
  }

  @DeveloperApi
  override def compute(split: Partition, context: TaskContext): Iterator[(K, V)] = {
    Try(computeInternal(split)).recover[Iterator[(K, V)]]({
      case e: HazelcastClientNotActiveException ⇒ computeInternal(split)
    }).get
  }

  def computeInternal(split: Partition): Iterator[(K, V)] = {
    val partitionLocationInfo = split.asInstanceOf[PartitionLocationInfo]
    val client: HazelcastInstance = getHazelcastConnection(partitionLocationInfo.location, id, config)
    if (isCache) {
      val cache: ClientCacheProxy[K, V] = getClientCacheProxy(hzName, client)
      new CacheIterator[K, V](cache.iterator(config.readBatchSize, split.index, config.valueBatchingEnabled))
    } else {
      val map: ClientMapProxy[K, V] = getClientMapProxy(hzName, client)
      new MapIterator[K, V](map.iterator(config.readBatchSize, split.index, config.valueBatchingEnabled))
    }
  }

  override protected def getPartitions: Array[Partition] = {
    // One Spark partition per Hazelcast partition, each carrying its owner's address.
    (0 until hazelcastPartitions.size)
      .map(i => new PartitionLocationInfo(i, hazelcastPartitions(i)): Partition)
      .toArray
  }

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    Seq(hazelcastPartitions(split.index))
  }

}
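A hedged usage sketch, following the connector's README: the implicit toSparkContextFunctions adds fromHazelcastMap/fromHazelcastCache to SparkContext, which construct a HazelcastRDD like the one above. The address, map name, and config key here are illustrative assumptions.

import com.hazelcast.spark.connector.toSparkContextFunctions
import org.apache.spark.{SparkConf, SparkContext}

object HazelcastReadExample {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("hazelcast-read")
      .setMaster("local[2]")
      .set("hazelcast.server.addresses", "127.0.0.1:5701") // assumed config key
    val sc = new SparkContext(conf)
    // One Spark partition per Hazelcast partition, preferring the owning member
    // as the task location (see getPreferredLocations above).
    val rdd = sc.fromHazelcastMap[Int, String]("my-map")
    println(rdd.count())
    sc.stop()
  }
}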