org.apache.spark.Partitioner.defaultPartitioner Scala Examples

The following examples show how to use org.apache.spark.Partitioner.defaultPartitioner. They are drawn from open-source projects; the source file, project, and license are noted above each example.
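Partitioner.defaultPartitioner picks a partitioner for cogroup-style operations such as join and cogroup: if any of the input RDDs already has a partitioner, that partitioner is generally reused; otherwise Spark creates a HashPartitioner, sized by spark.default.parallelism when it is set and by the largest input partition count otherwise. A minimal sketch of a typical call, assuming two pair RDDs named left and right (the names are illustrative):

import org.apache.spark.Partitioner.defaultPartitioner

val partitioner = defaultPartitioner(left, right)
// reusing an existing partitioner avoids an extra shuffle for the already-partitioned side
val cogrouped = left.cogroup(right, partitioner)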
Example 1
Source File: GroupSorted.scala    From spark-sorted   with Apache License 2.0
package com.tresata.spark.sorted.api.java

import java.util.{ Comparator, Iterator => JIterator }
import scala.reflect.ClassTag
import scala.collection.JavaConverters._

import org.apache.spark.{ Partitioner, HashPartitioner }
import org.apache.spark.Partitioner.defaultPartitioner
import org.apache.spark.api.java.JavaPairRDD
import org.apache.spark.api.java.function.{ Function => JFunction, Function2 => JFunction2, FlatMapFunction => JFlatMapFunction }

import com.tresata.spark.sorted.{ GroupSorted => SGroupSorted }

object GroupSorted {
  private case class ComparatorOrdering[T](comparator: Comparator[T]) extends Ordering[T] {
    def compare(x: T, y: T) = comparator.compare(x, y)
  }

  private def comparatorToOrdering[T](comparator: Comparator[T]): Ordering[T] = new ComparatorOrdering(comparator)

  // mirrors Spark's own JavaSparkContext.fakeClassTag: the Java API works with
  // erased types, so a casted AnyRef ClassTag suffices here
  private def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]]

  private implicit def ordering[K]: Ordering[K] = comparatorToOrdering(NaturalComparator.get[K])

  private def groupSort[K, V](javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]): SGroupSorted[K, V] = {
    implicit def kClassTag: ClassTag[K] = javaPairRDD.kClassTag
    implicit def vClassTag: ClassTag[V] = javaPairRDD.vClassTag
    val valueOrdering = Option(valueComparator).map(comparatorToOrdering)
    SGroupSorted(javaPairRDD.rdd, partitioner, valueOrdering)
  }
}

class GroupSorted[K, V] private (sGroupSorted: SGroupSorted[K, V]) extends JavaPairRDD[K, V](sGroupSorted)(GroupSorted.fakeClassTag[K], GroupSorted.fakeClassTag[V]) {
  def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]) =
    this(GroupSorted.groupSort(javaPairRDD, partitioner, valueComparator))

  def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner) =
    this(GroupSorted.groupSort(javaPairRDD, partitioner, null))

  def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int, valueComparator: Comparator[V]) =
    // a non-positive numPartitions defers to Spark's defaultPartitioner
    this(javaPairRDD, if (numPartitions > 0) new HashPartitioner(numPartitions) else defaultPartitioner(javaPairRDD.rdd), valueComparator)

  def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int) =
    this(javaPairRDD, numPartitions, null)

  def this(javaPairRDD: JavaPairRDD[K, V], valueComparator: Comparator[V]) =
    this(javaPairRDD, -1, valueComparator)

  def this(javaPairRDD: JavaPairRDD[K, V]) = this(javaPairRDD, -1, null)

  import GroupSorted._

  override def flatMapValues[W](f: JFlatMapFunction[V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.flatMapValues(v => f.call(v).asScala))
  }

  override def mapValues[W](f: JFunction[V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.mapValues(v => f.call(v)))
  }

  def mapKeyValuesToValues[W](f: JFunction[Tuple2[K, V], W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.mapKeyValuesToValues(kv => f.call(kv)))
  }

  def mapStreamByKey[W](f: JFunction[JIterator[V], JIterator[W]]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.mapStreamByKey(it => f.call(it.asJava).asScala))
  }

  def foldLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.foldLeftByKey(w)((w, v) => f.call(w, v)))
  }

  def reduceLeftByKey[W >: V](f: JFunction2[W, V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.reduceLeftByKey(f.call))
  }

  def scanLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.scanLeftByKey(w)((w, v) => f.call(w, v)))
  }
} 
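The constructors above show the usual fallback pattern: a positive numPartitions yields a HashPartitioner, while a non-positive one defers to defaultPartitioner on the underlying RDD. A minimal usage sketch, assuming an existing JavaPairRDD[String, Integer] named pairs (illustrative name):

import org.apache.spark.api.java.function.{ Function2 => JFunction2 }

// numPartitions defaults to -1, so defaultPartitioner(pairs.rdd) chooses the partitioner
val grouped: GroupSorted[String, Integer] = new GroupSorted(pairs)

// fold the sorted values of each key; the Java functional interface is implemented explicitly
val sums = grouped.foldLeftByKey(Integer.valueOf(0), new JFunction2[Integer, Integer, Integer] {
  def call(acc: Integer, v: Integer): Integer = acc + v
})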
Example 2
Source File: BlockJoinOperations.scala    From spark-skewjoin   with Apache License 2.0
package com.tresata.spark.skewjoin

import java.util.{ Random => JRandom }
import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.Partitioner
import org.apache.spark.Partitioner.defaultPartitioner

class BlockJoinOperations[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) extends Serializable {
  // based on blockJoinWithSmaller in scalding. See com.twitter.scalding.JoinAlgorithms
  private def blockCogroup[W](other: RDD[(K, W)], leftReplication: Int, rightReplication: Int, partitioner: Partitioner): RDD[((K, (Int, Int)), (Iterable[V], Iterable[W]))] = {
    assert(leftReplication >= 1, "must specify a positive number for left replication")
    assert(rightReplication >= 1, "must specify a positive number for right replication")
    // tag each record with (randomIndexIntoOtherSide, ownCopyIndex): a record's own
    // copies cover every index, while the other side's index is chosen at random
    def getReplication(random: JRandom, replication: Int, otherReplication: Int): Seq[(Int, Int)] = {
      val rand = random.nextInt(otherReplication)
      (0 until replication).map{ rep => (rand, rep) }
    }
    val rddBlocked = rdd.mapPartitions{ it =>
      val random = new JRandom
      it.flatMap{ kv =>
        getReplication(random, leftReplication, rightReplication).map{ rl => ((kv._1, rl.swap), kv._2)}
      }
    }
    val otherBlocked = other.mapPartitions{ it =>
      val random = new JRandom
      it.flatMap{ kv =>
        getReplication(random, rightReplication, leftReplication).map{ lr => ((kv._1, lr), kv._2)}
      }
    }
    rddBlocked.cogroup(otherBlocked, partitioner)
  }
  
  
  // NOTE: the original example is truncated here; the three-argument overload
  // below is reconstructed so that the snippet compiles. A right outer join
  // only replicates the left side, so rightReplication stays at 1.
  def blockRightOuterJoin[W](other: RDD[(K, W)], leftReplication: Int, partitioner: Partitioner): RDD[(K, (Option[V], W))] =
    blockCogroup(other, leftReplication, 1, partitioner).flatMap{ case ((k, _), (itv, itw)) =>
      if (itv.isEmpty) itw.iterator.map(w => (k, (None, w)))
      else for (v <- itv.iterator; w <- itw.iterator) yield (k, (Some(v), w))
    }

  def blockRightOuterJoin[W](other: RDD[(K, W)], leftReplication: Int): RDD[(K, (Option[V], W))] =
    blockRightOuterJoin(other, leftReplication, defaultPartitioner(rdd, other))
}
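With the Dsl object defined in the next example imported, the block join reads as a method on the pair RDD itself. A minimal usage sketch, assuming left and right are RDD[(String, Int)] (illustrative names):

import com.tresata.spark.skewjoin.Dsl._

// replicate each left record 3 ways; right records scatter randomly across the
// copies, so a hot key no longer lands in a single partition. The two-argument
// overload falls back to defaultPartitioner(left, right).
val joined = left.blockRightOuterJoin(right, 3)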
Example 3
Source File: SkewJoinOperations.scala    From spark-skewjoin   with Apache License 2.0
package com.tresata.spark.skewjoin

import java.util.{ Random => JRandom }
import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.Partitioner
import org.apache.spark.Partitioner.defaultPartitioner

import com.twitter.algebird.{ CMS, CMSHasher, CMSMonoid }

// eps and delta are the usual count-min sketch accuracy knobs: a frequency
// estimate is within eps * totalCount of the true count with probability 1 - delta
case class CMSParams(eps: Double = 0.005, delta: Double = 1e-8, seed: Int = 1) {
  def getCMSMonoid[K: Ordering: CMSHasher]: CMSMonoid[K] = CMS.monoid[K](eps, delta, seed)
}

class SkewJoinOperations[K: ClassTag: Ordering: CMSHasher, V: ClassTag](rdd: RDD[(K, V)]) extends Serializable {
  private def getReplicationFactors(random: JRandom, replication: Int, otherReplication: Int): Seq[(Int, Int)] = {
    require(replication > 0 && otherReplication > 0, "replication must be positive")
    val rand = random.nextInt(otherReplication)
    (0 until replication).map(rep => (rand, rep))
  }
  
  private def createRddCMS[K](rdd: RDD[K], cmsMonoid: CMSMonoid[K]): CMS[K] =
    rdd.map(k => cmsMonoid.create(k)).reduce(cmsMonoid.plus(_, _))

  // estimates per-key frequencies on both sides with count-min sketches, then
  // replicates records of hot keys so that no single partition receives them all
  def skewCogroup[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (Iterable[V], Iterable[W]))] = {
    val numPartitions = partitioner.numPartitions
    val broadcastedLeftCMS = rdd.sparkContext.broadcast(createRddCMS[K](rdd.keys, cmsParams.getCMSMonoid[K]))
    val broadcastedRightCMS = rdd.sparkContext.broadcast(createRddCMS[K](other.keys, cmsParams.getCMSMonoid[K]))
    
    val rddSkewed = rdd.mapPartitions{ it =>
      val random = new JRandom
      it.flatMap{ kv => 
        val (leftReplication, rightReplication) = skewReplication.getReplications(
          broadcastedLeftCMS.value.frequency(kv._1).estimate,
          broadcastedRightCMS.value.frequency(kv._1).estimate,
          numPartitions)
        getReplicationFactors(random, leftReplication, rightReplication).map(rl => ((kv._1, rl.swap), kv._2))
      }
    }
    
    val otherSkewed = other.mapPartitions{ it =>
      val random = new JRandom
      it.flatMap{ kv => 
        val (leftReplication, rightReplication) = skewReplication.getReplications(
          broadcastedLeftCMS.value.frequency(kv._1).estimate,
          broadcastedRightCMS.value.frequency(kv._1).estimate,
          numPartitions)
        getReplicationFactors(random, rightReplication, leftReplication).map(lr => ((kv._1, lr), kv._2))
      }
    }

    rddSkewed.cogroup(otherSkewed, partitioner).map(kv => (kv._1._1, kv._2))
  }

  def skewCogroup[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (Iterable[V], Iterable[W]))] =
    skewCogroup(other, defaultPartitioner(rdd, other))

  def skewJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (V, W))] =
    skewCogroup(other, partitioner, skewReplication, cmsParams).flatMap{ blockPair =>
      for (v <- blockPair._2._1.iterator; w <- blockPair._2._2.iterator) yield
        (blockPair._1, (v, w))
    }

  def skewJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (V, W))] =
    skewJoin(other, defaultPartitioner(rdd, other))

  def skewLeftOuterJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (V, Option[W]))] =
    skewCogroup(other, partitioner, RightReplication(skewReplication), cmsParams).flatMap{
      case (k, (itv, Seq())) => itv.iterator.map(v => (k, (v, None)))
      case (k, (itv, itw)) => for (v <- itv; w <- itw) yield (k, (v, Some(w)))
    }

  def skewLeftOuterJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (V, Option[W]))] =
    skewLeftOuterJoin(other, defaultPartitioner(rdd, other))

  def skewRightOuterJoin[W: ClassTag](other: RDD[(K, W)], partitioner: Partitioner,
    skewReplication: SkewReplication = DefaultSkewReplication(), cmsParams: CMSParams = CMSParams()): RDD[(K, (Option[V], W))] =
    skewCogroup(other, partitioner, LeftReplication(skewReplication), cmsParams).flatMap{
      case (k, (Seq(), itw)) => itw.iterator.map(w => (k, (None, w)))
      case (k, (itv, itw)) => for (v <- itv; w <- itw) yield (k, (Some(v), w))
    }
  
  def skewRightOuterJoin[W: ClassTag](other: RDD[(K, W)]): RDD[(K, (Option[V], W))] =
    skewRightOuterJoin(other, defaultPartitioner(rdd, other))
}

trait Dsl {
  implicit def rddToSkewJoinOperations_e94qoy3tnt[K: ClassTag: Ordering: CMSHasher, V: ClassTag](rdd: RDD[(K, V)]): SkewJoinOperations[K, V] = new SkewJoinOperations(rdd)
  implicit def rddToBlockJoinOperations_7IaIe6dkih[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]): BlockJoinOperations[K, V] = new BlockJoinOperations(rdd)
}

object Dsl extends Dsl 
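A minimal usage sketch of the skew join, assuming left and right are RDD[(Int, Int)] (illustrative names); the CMSHasher for the key type comes from Algebird's implicits, as in the spec below:

import com.tresata.spark.skewjoin.Dsl._
import com.twitter.algebird.CMSHasherImplicits._

// per-key counts are estimated with count-min sketches and hot keys are
// replicated adaptively; the one-argument overload uses defaultPartitioner
val joined = left.skewJoin(right)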
Example 4
Source File: SkewJoinOperationsSpec.scala    From spark-skewjoin   with Apache License 2.0
package com.tresata.spark.skewjoin

import org.scalatest.FunSpec

import com.tresata.spark.skewjoin.Dsl._

import com.twitter.algebird.CMSHasherImplicits._

import org.apache.spark.Partitioner.defaultPartitioner

// a fixed-replication strategy; not exercised by the tests shown in this excerpt
case object DummySkewReplication extends SkewReplication {
  override def getReplications(leftCount: Long, rightCount: Long, numPartitions: Int) = (2, 2)
}

class SkewJoinOperationsSpec extends FunSpec {
  lazy val sc = SparkSuite.sc
  lazy val rdd1 = sc.parallelize(Array(1, 1, 2, 3, 4)).map(s => (s, 1)).repartition(2)
  lazy val rdd2 = sc.parallelize(Array(1, 1, 6, 4, 5)).map(s => (s, 2)).repartition(2)

  describe("SkewJoin") {
    it("should inner join two datasets using skewJoin correctly") {
      assert(rdd1.skewJoin(rdd2, defaultPartitioner(rdd1, rdd2), DefaultSkewReplication(1)).sortByKey(true).collect.toList === 
        Seq((1, (1, 2)), (1, (1, 2)), (1, (1, 2)), (1, (1, 2)), (4, (1, 2))))
    }

    it("should left join two datasets using skewLeftOuterJoin correctly") {
      assert(rdd1.skewLeftOuterJoin(rdd2, defaultPartitioner(rdd1, rdd2), DefaultSkewReplication(1)).sortByKey(true).collect.toList ===
        Seq((1, (1, Some(2))), (1, (1, Some(2))), (1, (1, Some(2))), (1, (1, Some(2))), (2, (1, None)), (3, (1, None)), (4, (1, Some(2)))))
    }

    it("should right join two datasets using skewRightOuterJoin correctly") {
      assert(rdd1.skewRightOuterJoin(rdd2, defaultPartitioner(rdd1, rdd2), DefaultSkewReplication(1)).sortByKey(true).collect.toList ===
        Seq((1, (Some(1), 2)), (1, (Some(1), 2)), (1, (Some(1), 2)), (1, (Some(1), 2)), (4, (Some(1), 2)), (5, (None, 2)), (6, (None, 2))))
    }
  }
}