org.apache.spark.util.collection.CompactBuffer Scala Examples

The following examples show how to use org.apache.spark.util.collection.CompactBuffer, an append-only, ArrayBuffer-like collection that Spark optimizes for the common case of per-key buffers holding only one or two elements. The originating project and source file are noted above each example.
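Before the examples, here is a minimal sketch of the CompactBuffer operations they rely on. CompactBuffer is private[spark], so the sketch assumes it is compiled under the org.apache.spark package; the package and object names below are hypothetical.

package org.apache.spark.demo

import org.apache.spark.util.collection.CompactBuffer

object CompactBufferSketch {
  def main(args: Array[String]): Unit = {
    // Seed the buffer with one element, then append more.
    val buf = CompactBuffer[Int](1)
    buf += 2                      // append a single element
    buf ++= Seq(3, 4)             // append a whole collection
    // CompactBuffer extends Seq, so the usual collection API is available.
    println(buf.mkString(", "))   // prints: 1, 2, 3, 4
  }
}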
Example 1
Source File: VOrderedRDDFunctions.scala    From spark-vlbfgs   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.Partitioner
import org.apache.spark.internal.Logging
import org.apache.spark.util.collection.CompactBuffer

import scala.reflect.ClassTag

class VOrderedRDDFunctions[K, V](self: RDD[(K, V)])
    (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K])
  extends Logging with Serializable {

  def groupByKeyUsingSort(partitioner: Partitioner): RDD[(K, Iterable[V])] = {
    self.repartitionAndSortWithinPartitions(partitioner)
      .mapPartitions { (iter: Iterator[(K, V)]) =>
        // Keys are sorted within each partition after
        // repartitionAndSortWithinPartitions, so equal keys are adjacent and
        // each group can be assembled in a single sequential pass.
        new Iterator[(K, CompactBuffer[V])] {
          // First element of the next group, buffered once the current
          // group's key changes; null when nothing is buffered.
          private var firstElemInNextGroup: (K, V) = null

          override def hasNext: Boolean = firstElemInNextGroup != null || iter.hasNext

          override def next(): (K, CompactBuffer[V]) = {
            if (firstElemInNextGroup == null) {
              firstElemInNextGroup = iter.next()
            }
            val key = firstElemInNextGroup._1
            val group = CompactBuffer[V](firstElemInNextGroup._2)
            firstElemInNextGroup = null
            var reachNewGroup = false
            while (iter.hasNext && !reachNewGroup) {
              val currElem = iter.next()
              if (currElem._1 == key) {
                group += currElem._2
              } else {
                firstElemInNextGroup = currElem
                reachNewGroup = true
              }
            }
            (key, group)
          }
        }
      }
  }
}

private[spark] object VOrderedRDDFunctions {

  implicit def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)])(implicit ord: Ordering[K]):
      VOrderedRDDFunctions[K, V] = {
    new VOrderedRDDFunctions(rdd)
  }
} 
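A usage sketch for groupByKeyUsingSort: because keys arrive sorted within each partition, groups are assembled one at a time instead of through a hash map holding every key, which bounds memory to the largest single group. The snippet is illustrative only; VOrderedRDDFunctions is private[spark], so it assumes the calling code also lives under the org.apache.spark package and that a SparkContext named sc already exists.

import org.apache.spark.HashPartitioner
import org.apache.spark.rdd.VOrderedRDDFunctions._

// Hypothetical driver code; `sc` is an existing SparkContext.
val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))
val grouped = pairs.groupByKeyUsingSort(new HashPartitioner(2))
grouped.collect().foreach { case (k, vs) =>
  println(s"$k -> ${vs.mkString(", ")}")  // e.g. a -> 1, 3
}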
Example 2
Source File: HashJoin.scala    From iolap   with Apache License 2.0
package org.apache.spark.sql.execution.joins

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.util.collection.CompactBuffer


trait HashJoin {
  self: SparkPlan =>

  val leftKeys: Seq[Expression]
  val rightKeys: Seq[Expression]
  val buildSide: BuildSide
  val left: SparkPlan
  val right: SparkPlan

  protected lazy val (buildPlan, streamedPlan) = buildSide match {
    case BuildLeft => (left, right)
    case BuildRight => (right, left)
  }

  protected lazy val (buildKeys, streamedKeys) = buildSide match {
    case BuildLeft => (leftKeys, rightKeys)
    case BuildRight => (rightKeys, leftKeys)
  }

  override def output: Seq[Attribute] = left.output ++ right.output

  @transient protected lazy val buildSideKeyGenerator: Projection =
    newProjection(buildKeys, buildPlan.output)

  @transient protected lazy val streamSideKeyGenerator: () => MutableProjection =
    newMutableProjection(streamedKeys, streamedPlan.output)

  protected def hashJoin(streamIter: Iterator[Row], hashedRelation: HashedRelation): Iterator[Row] =
  {
    new Iterator[Row] {
      private[this] var currentStreamedRow: Row = _
      private[this] var currentHashMatches: CompactBuffer[Row] = _
      private[this] var currentMatchPosition: Int = -1

      // Mutable per row objects.
      private[this] val joinRow = new JoinedRow2

      private[this] val joinKeys = streamSideKeyGenerator()

      // Note: hasNext is not side-effect free. When the current match buffer is
      // exhausted, it advances the streamed side by calling fetchNext().
      override final def hasNext: Boolean =
        (currentMatchPosition != -1 && currentMatchPosition < currentHashMatches.size) ||
          (streamIter.hasNext && fetchNext())

      override final def next(): Row = {
        val ret = buildSide match {
          case BuildRight => joinRow(currentStreamedRow, currentHashMatches(currentMatchPosition))
          case BuildLeft => joinRow(currentHashMatches(currentMatchPosition), currentStreamedRow)
        }
        currentMatchPosition += 1
        ret
      }

      // Searches the streamed iterator for the next row with at least one match
      // in the hashed relation; returns false once the stream is exhausted.
      private final def fetchNext(): Boolean = {
        currentHashMatches = null
        currentMatchPosition = -1

        while (currentHashMatches == null && streamIter.hasNext) {
          currentStreamedRow = streamIter.next()
          if (!joinKeys(currentStreamedRow).anyNull) {
            currentHashMatches = hashedRelation.get(joinKeys.currentValue)
          }
        }

        if (currentHashMatches == null) {
          false
        } else {
          currentMatchPosition = 0
          true
        }
      }
    }
  }
} 
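The heart of hashJoin is the fetchNext pattern: hasNext pulls from the streamed side until it finds a row whose join key has matches in the build-side table, and next() then walks the match buffer one position per call. Below is a stripped-down, self-contained sketch of the same pattern over plain Scala collections; every name in it is illustrative rather than a Spark API.

// Illustrative hash join using the same fetchNext/hasNext pattern as above.
def hashJoin[K, L, R](stream: Iterator[(K, L)], built: Map[K, Seq[R]]): Iterator[(L, R)] =
  new Iterator[(L, R)] {
    private var current: (K, L) = _
    private var matches: Seq[R] = null
    private var pos = -1

    override def hasNext: Boolean =
      (pos != -1 && pos < matches.size) || (stream.hasNext && fetchNext())

    override def next(): (L, R) = {
      val out = (current._2, matches(pos))
      pos += 1
      out
    }

    // Advance the streamed side until a key with build-side matches is found.
    private def fetchNext(): Boolean = {
      matches = null
      pos = -1
      while (matches == null && stream.hasNext) {
        current = stream.next()
        matches = built.getOrElse(current._1, null)
      }
      if (matches == null) false else { pos = 0; true }
    }
  }

Keeping the match buffer and position in mutable fields lets the join emit rows lazily, one per next() call, without ever materializing the full result.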
Example 3
Source File: HashedRelationSuite.scala    From iolap   with Apache License 2.0
package org.apache.spark.sql.execution.joins

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.{Projection, Row}
import org.apache.spark.util.collection.CompactBuffer


class HashedRelationSuite extends SparkFunSuite {

  // Key is simply the record itself
  private val keyProjection = new Projection {
    override def apply(row: Row): Row = row
  }

  test("GeneralHashedRelation") {
    // Row(2) appears twice, so at least one key has multiple matching rows and
    // the general, CompactBuffer-backed relation must be used.
    val data = Array(Row(0), Row(1), Row(2), Row(2))
    val hashed = HashedRelation(data.iterator, keyProjection)
    assert(hashed.isInstanceOf[GeneralHashedRelation])

    assert(hashed.get(data(0)) == CompactBuffer[Row](data(0)))
    assert(hashed.get(data(1)) == CompactBuffer[Row](data(1)))
    assert(hashed.get(Row(10)) === null)

    val data2 = CompactBuffer[Row](data(2))
    data2 += data(2)
    assert(hashed.get(data(2)) == data2)
  }

  test("UniqueKeyHashedRelation") {
    // All keys are distinct, so the optimized unique-key representation is chosen.
    val data = Array(Row(0), Row(1), Row(2))
    val hashed = HashedRelation(data.iterator, keyProjection)
    assert(hashed.isInstanceOf[UniqueKeyHashedRelation])

    assert(hashed.get(data(0)) == CompactBuffer[Row](data(0)))
    assert(hashed.get(data(1)) == CompactBuffer[Row](data(1)))
    assert(hashed.get(data(2)) == CompactBuffer[Row](data(2)))
    assert(hashed.get(Row(10)) === null)

    val uniqHashed = hashed.asInstanceOf[UniqueKeyHashedRelation]
    assert(uniqHashed.getValue(data(0)) == data(0))
    assert(uniqHashed.getValue(data(1)) == data(1))
    assert(uniqHashed.getValue(data(2)) == data(2))
    assert(uniqHashed.getValue(Row(10)) == null)
  }
}
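As the two tests above illustrate, HashedRelation chooses its representation from the data it is given: when every key maps to exactly one row it builds a UniqueKeyHashedRelation, which stores each row directly and avoids allocating a per-key CompactBuffer, and it falls back to the general CompactBuffer-backed form only when duplicate keys (like the repeated Row(2)) are present.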