org.apache.spark.NarrowDependency Scala Examples

The following examples show how to use org.apache.spark.NarrowDependency. The source file, project, and license are noted above each example.
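Every example below follows the same pattern: subclass org.apache.spark.NarrowDependency (whose constructor takes the parent RDD) and override getParents so that it maps a child partition id to the parent partition ids it reads. A minimal, hypothetical sketch of that contract (PairedDependency is not from any of the projects below; it assumes the child RDD declares half as many partitions as the parent):

import org.apache.spark.NarrowDependency
import org.apache.spark.rdd.RDD

// Each child partition i depends on parent partitions 2*i and 2*i + 1.
class PairedDependency[T](parent: RDD[T]) extends NarrowDependency[T](parent) {
  override def getParents(partitionId: Int): Seq[Int] =
    Seq(2 * partitionId, 2 * partitionId + 1)
}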
Example 1
Source File: LeftJoin.scala    From flint    with Apache License 2.0
package com.twosigma.flint.rdd.function.join

import com.twosigma.flint.rdd.{ PartitionsIterator, PeekableIterator }
import org.apache.spark.{ NarrowDependency, OneToOneDependency }
import com.twosigma.flint.rdd.OrderedRDD

import scala.collection.immutable.TreeMap
import scala.reflect.ClassTag
import java.util.{ HashMap => JHashMap }

protected[flint] object LeftJoin {

  val skMapInitialSize = 1024

  def apply[K: ClassTag, SK, V, V2](
    leftRdd: OrderedRDD[K, V],
    rightRdd: OrderedRDD[K, V2],
    toleranceFn: K => K,
    leftSk: V => SK,
    rightSk: V2 => SK
  )(implicit ord: Ordering[K]): OrderedRDD[K, (V, Option[(K, V2)])] = {
    // A map from left partition index to left range split and right partitions.
    val leftIndexToJoinSplits = TreeMap(RangeMergeJoin.leftJoinSplits(
      toleranceFn, leftRdd.rangeSplits, rightRdd.rangeSplits
    ).map { case (split, parts) => (split.partition.index, (split, parts)) }: _*)

    val leftDep = new OneToOneDependency(leftRdd)
    val rightDep = new NarrowDependency(rightRdd) {
      override def getParents(partitionId: Int) =
        leftIndexToJoinSplits(partitionId)._2.map(_.index)
    }

    // A map from left partition index to right partitions
    val rightPartitions = leftRdd.sc.broadcast(leftIndexToJoinSplits.map {
      case (idx, joinSplit) => (idx, joinSplit._2)
    })

    val joinedSplits = leftIndexToJoinSplits.map { case (_, (split, _)) => split }.toArray

    // We don't need the left dependency as we will just load it on demand here
    new OrderedRDD[K, (V, Option[(K, V2)])](leftRdd.sc, joinedSplits, Seq(leftDep, rightDep))(
      (part, context) => {
        val parts = rightPartitions.value(part.index)
        val rightIter = PeekableIterator(PartitionsIterator(rightRdd, parts, context))
        val lastSeen = new JHashMap[SK, (K, V2)](skMapInitialSize)
        leftRdd.iterator(part, context).map {
          case (k, v) =>
            // Catch up the iterator for the right table to match the left key. In the
            // process, we'll have the last-seen row for each SK in the right table.
            val sk = leftSk(v)
            catchUp(k, rightSk, rightIter, lastSeen)
            val lastSeenRight = lastSeen.get(sk)
            if (lastSeenRight != null && ord.gteq(lastSeenRight._1, toleranceFn(k))) {
              (k, (v, Some(lastSeenRight)))
            } else {
              (k, (v, None))
            }
        }
      }
    )
  }

  // Advances the right-table iterator until its next key would exceed `cur`, recording the
  // most recently seen (K, V) pair for each secondary key SK in `lastSeen`.
  @annotation.tailrec
  private[rdd] def catchUp[K, SK, V](
    cur: K,
    skFn: V => SK,
    iter: PeekableIterator[(K, V)],
    lastSeen: JHashMap[SK, (K, V)]
  )(implicit ord: Ordering[K]) {
    val peek = iter.peek
    if (peek.nonEmpty && ord.lteq(peek.get._1, cur)) {
      val (k, v) = iter.next
      val sk = skFn(v)
      lastSeen.put(sk, (k, v))
      catchUp(cur, skFn, iter, lastSeen)
    }
  }

} 
Example 2
Source File: Merge.scala    From flint    with Apache License 2.0
package com.twosigma.flint.rdd.function.join

import com.twosigma.flint.rdd.{ PeekableIterator, PartitionsIterator, MergeIterator, RangeSplit }
import org.apache.spark.NarrowDependency
import com.twosigma.flint.rdd._

import scala.reflect.ClassTag

protected[flint] object Merge {

  def apply[K: Ordering: ClassTag, V: ClassTag](
    left: OrderedRDD[K, V],
    right: OrderedRDD[K, V]
  ): OrderedRDD[K, V] = ++(left, right).mapValues {
    case (_, Left(v)) => v
    case (_, Right(v)) => v
  }

  def ++[K: ClassTag, V: ClassTag, V2: ClassTag](
    left: OrderedRDD[K, V],
    right: OrderedRDD[K, V2]
  )(
    implicit
    ord: Ordering[K]
  ): OrderedRDD[K, Either[V, V2]] = {
    // A map from new partition to a RangeMergeJoin.
    val partToMergeJoin = RangeMergeJoin.mergeSplits(left.rangeSplits, right.rangeSplits).zipWithIndex.map {
      case (mergeJoin, idx) => (OrderedRDDPartition(idx), mergeJoin)
    }.toMap

    // A map from partition index to a RangeMergeJoin.
    val partitionIndexToMergeJoin = partToMergeJoin.map { case (p, m) => (p.index, m) }
    val leftDep = new NarrowDependency(left) {
      override def getParents(partitionId: Int) =
        partitionIndexToMergeJoin(partitionId).left.map(_.partition.index)
    }
    val rightDep = new NarrowDependency(right) {
      override def getParents(partitionId: Int) =
        partitionIndexToMergeJoin(partitionId).right.map(_.partition.index)
    }
    val mergedSplits = partToMergeJoin.map {
      case (p, mergeJoin) => RangeSplit(p, mergeJoin.range)
    }.toArray

    new OrderedRDD[K, Either[V, V2]](left.sc, mergedSplits, Seq(leftDep, rightDep))(
      (part, context) => {
        val mergedJoin = partitionIndexToMergeJoin(part.index)
        // Select rows from both RDDs whose key belongs to this RangeMergeJoin's range
        val leftParts = mergedJoin.left.map(_.partition)
        val leftIter = PeekableIterator(PartitionsIterator(left, leftParts, context).filter {
          case (k, _) => mergedJoin.range.contains(k)
        })
        val rightParts = mergedJoin.right.map(_.partition)
        val rightIter = PeekableIterator(PartitionsIterator(right, rightParts, context).filter {
          case (k, _) => mergedJoin.range.contains(k)
        })
        // Perform an ordered merge of the selected rows.
        MergeIterator(leftIter, rightIter)
      }
    )
  }

} 
Example 3
Source File: ReorderedPartitionsRDD.scala    From hail    with MIT License
package is.hail.sparkextras

import is.hail.utils.FastSeq
import org.apache.spark.rdd.RDD
import org.apache.spark.{Dependency, NarrowDependency, Partition, TaskContext}

import scala.reflect.ClassTag

case class ReorderedPartitionsRDDPartition(index: Int, oldPartition: Partition) extends Partition

class ReorderedPartitionsRDD[T](@transient var prev: RDD[T], @transient val oldIndices: Array[Int])(implicit tct: ClassTag[T])
  extends RDD[T](prev.sparkContext, Nil) {

  override def getPartitions: Array[Partition] = {
    val parentPartitions = dependencies.head.rdd.asInstanceOf[RDD[T]].partitions
    Array.tabulate(oldIndices.length) { i =>
      val oldIndex = oldIndices(i)
      val oldPartition = parentPartitions(oldIndex)
      ReorderedPartitionsRDDPartition(i, oldPartition)
    }
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    val parent = dependencies.head.rdd.asInstanceOf[RDD[T]]
    parent.compute(split.asInstanceOf[ReorderedPartitionsRDDPartition].oldPartition, context)
  }

  override def getDependencies: Seq[Dependency[_]] = FastSeq(new NarrowDependency[T](prev) {
    override def getParents(partitionId: Int): Seq[Int] = FastSeq(oldIndices(partitionId))
  })

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }

  override def getPreferredLocations(partition: Partition): Seq[String] =
    prev.preferredLocations(partition.asInstanceOf[ReorderedPartitionsRDDPartition].oldPartition)
} 
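Because every output partition of ReorderedPartitionsRDD reads exactly one parent partition (FastSeq(oldIndices(partitionId))), the dependency stays narrow and no shuffle is scheduled. A hypothetical driver that exercises the class, assuming the Hail sparkextras module is on the classpath:

import org.apache.spark.{SparkConf, SparkContext}
import is.hail.sparkextras.ReorderedPartitionsRDD

object ReorderedPartitionsExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("reordered"))
    val base = sc.parallelize(1 to 30, numSlices = 3)
    // New partition 0 reads old partition 2, partition 1 reads old 0, partition 2 reads old 1.
    val reordered = new ReorderedPartitionsRDD(base, Array(2, 0, 1))
    println(reordered.glom().collect().map(_.mkString(",")).mkString(" | "))
    sc.stop()
  }
}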
Example 4
Source File: BlockedRDD.scala    From hail    with MIT License
package is.hail.sparkextras

import is.hail.utils._
import org.apache.spark.rdd.RDD
import org.apache.spark.{Dependency, NarrowDependency, Partition, TaskContext}

import scala.language.existentials
import scala.reflect.ClassTag

case class BlockedRDDPartition(@transient rdd: RDD[_],
  index: Int,
  first: Int,
  last: Int) extends Partition {
  require(first <= last)

  val parentPartitions: Array[Partition] = range.map(rdd.partitions).toArray

  def range: Range = first to last
}

class BlockedRDD[T](@transient var prev: RDD[T],
  @transient val partFirst: Array[Int],
  @transient val partLast: Array[Int]
)(implicit tct: ClassTag[T]) extends RDD[T](prev.sparkContext, Nil) {
  assert(partFirst.length == partLast.length)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](partFirst.length)(i =>
      BlockedRDDPartition(prev, i, partFirst(i), partLast(i)))
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    val parent = dependencies.head.rdd.asInstanceOf[RDD[T]]
    split.asInstanceOf[BlockedRDDPartition].parentPartitions.iterator.flatMap(p =>
      parent.iterator(p, context))
  }

  override def getDependencies: Seq[Dependency[_]] = {
    FastSeq(new NarrowDependency(prev) {
      def getParents(id: Int): Seq[Int] =
        partitions(id).asInstanceOf[BlockedRDDPartition].range
    })
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val prevPartitions = prev.partitions
    val range = partition.asInstanceOf[BlockedRDDPartition].range

    val locationAvail = range.flatMap(i =>
      prev.preferredLocations(prevPartitions(i)))
      .groupBy(identity)
      .mapValues(_.length)

    if (locationAvail.isEmpty)
      return FastSeq.empty[String]

    val m = locationAvail.values.max
    locationAvail.filter(_._2 == m)
      .keys
      .toFastSeq
  }
} 
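Here each BlockedRDDPartition covers the inclusive parent range partFirst(i) to partLast(i), and the NarrowDependency reports that whole range as the partition's parents, so contiguous partitions are coalesced without a shuffle. A hypothetical driver, again assuming the Hail sparkextras module is available:

import org.apache.spark.{SparkConf, SparkContext}
import is.hail.sparkextras.BlockedRDD

object BlockedRDDExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("blocked"))
    val base = sc.parallelize(1 to 60, numSlices = 6)
    // Collapse six parent partitions into two blocks: indices 0..2 and 3..5.
    val blocked = new BlockedRDD(base, partFirst = Array(0, 3), partLast = Array(2, 5))
    println(blocked.getNumPartitions) // prints 2
    sc.stop()
  }
}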
Example 5
Source File: InsertRDD.scala    From spark-vector    with Apache License 2.0
package com.actian.spark_vector.datastream.writer

import scala.annotation.tailrec
import scala.reflect.ClassTag

import org.apache.spark.{ OneToOneDependency, NarrowDependency, Partition, TaskContext }
import org.apache.spark.rdd.RDD

import com.actian.spark_vector.datastream.{ DataStreamPartition, DataStreamPartitionAssignment, VectorEndpointConf }
// NOTE: the class declaration and two private helpers were dropped when this snippet was
// extracted. The skeleton below is reconstructed from how the members are used further down;
// it is an approximation, not a copy of the original InsertRDD source.
class InsertRDD[R: ClassTag](@transient val rdd: RDD[R], writeConf: VectorEndpointConf)
  extends RDD[R](rdd.context, Nil) {

  // Cap on how many parent partition indices to include in the debug log below (assumed value).
  private val partitionsPerDataStreamToPrint = 10

  // Resolves preferred locations by walking narrow dependencies (e.g. OneToOneDependency) up the
  // lineage; the original file implements this recursively, which is why tailrec is imported above.
  private def getPreferredLocationsRec(rdd: RDD[_], partition: Partition): Seq[String] =
    rdd.preferredLocations(partition)

  private val endPointsToParentPartitionsMap = {
    val affinities = rdd.partitions.map(getPreferredLocationsRec(rdd, _))

    val ret = DataStreamPartitionAssignment(affinities, writeConf.vectorEndpoints)
    logDebug(s"Computed endPointsToParentPartitionsMap and got: ${
      (0 until ret.length).map {
        case idx =>
          val vals = ret(idx)
          s"Datastream $idx -> RDD partitions ${vals.length}: ${vals.take(partitionsPerDataStreamToPrint).mkString(",")} ${if (vals.length > partitionsPerDataStreamToPrint) "..." else ""}"
      }
    }")
    ret.map(_.map(rdd.partitions(_).index))
  }

  override protected def getPartitions = (0 until writeConf.size).map(idx =>
    DataStreamPartition(idx, rdd, endPointsToParentPartitionsMap(idx))).toArray

  override protected def getPreferredLocations(split: Partition) = {
    logDebug(s"getPreferredLocations is called for partition ${split.index} and we are returning ${writeConf.vectorEndpoints(split.index).host}")
    Seq(writeConf.vectorEndpoints(split.index).host)
  }

  override def compute(split: Partition, taskContext: TaskContext): Iterator[R] =
    split.asInstanceOf[DataStreamPartition].parents.toIterator.flatMap(firstParent[R].iterator(_, taskContext))

  override def getDependencies: Seq[NarrowDependency[_]] = Seq(new NarrowDependency(rdd) {
    def getParents(partitionId: Int) = endPointsToParentPartitionsMap(partitionId)
  })
}