org.apache.spark.RangePartitioner Scala Examples

The following examples show how to use org.apache.spark.RangePartitioner. Each example is taken from an open-source project; the header above it names the source file, the project, and its license.
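Before the project examples, here is a minimal, self-contained sketch (written for this page, not taken from any of the projects below) of the typical pattern: build a RangePartitioner from an existing pair RDD so it can sample the keys and pick split points, then hand it to partitionBy.

import org.apache.spark.{RangePartitioner, SparkConf, SparkContext}

object RangePartitionerSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("range-partitioner-sketch"))

    // A small pair RDD; RangePartitioner samples its keys to choose partition boundaries.
    val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("c", 3), ("d", 4)), numSlices = 2)

    val partitioner = new RangePartitioner(2, pairs)
    val ranged = pairs.partitionBy(partitioner)

    println(ranged.partitioner)      // e.g. Some(org.apache.spark.RangePartitioner@...)
    println(ranged.getNumPartitions) // 2

    sc.stop()
  }
}
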
Example 1
Source File: OrderedRDDFunctions.scala    From drizzle-spark    with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging


// Extra functions available on RDDs of (key, value) pairs where the key is sortable
// (obtained through an implicit conversion on such RDDs).
class OrderedRDDFunctions[K : Ordering : ClassTag,
                          V: ClassTag,
                          P <: Product2[K, V] : ClassTag] @DeveloperApi() (
    self: RDD[P])
  extends Logging with Serializable {

  private val ordering = implicitly[Ordering[K]]

  // Returns an RDD containing only the elements whose keys lie in the inclusive range
  // [lower, upper]. If this RDD is range-partitioned, only the partitions that could
  // contain matching keys are scanned; otherwise every element is filtered.
  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {

    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) =>
        val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndicies.contains)
      case _ =>
        self
    }
    rddToFilter.filter { case (k, v) => inRange(k) }
  }

} 
Example 2
Source File: CCDriver.scala    From connected-component    with MIT License
package com.kwartile.lib.cc

import org.apache.spark.{RangePartitioner, SparkConf, SparkContext}


      // cc2 is assumed here (it is built in the elided part of this driver) to be a pair RDD
      // whose values are Lists of component members.
      val rangePartitioner = new RangePartitioner(cc2.getNumPartitions, cc2)
      // Range-partition by key and concatenate the member lists for each key.
      val connectedComponents = cc2.reduceByKey(rangePartitioner, (a, b) => b ::: a)

      //connectedComponents.mapPartitionsWithIndex((index, iter) => {
      //  iter.toList.map(x => (index, x._1, x._2.size)).iterator
      //  }).collect.foreach(println)

      println("connected components")
      connectedComponents.map(x => (x._2.length).toString + " " + x._1 + " " + x._2.sorted.mkString(" ")).saveAsTextFile(cliqueFile + "_cc_out")
    }
    else {
      println("Max iteration reached.  Could not converge")
    }
  }
} 
Example 3
Source File: UsePartitioner.scala    From Hands-On-Big-Data-Analytics-with-PySpark    with MIT License
package com.tomekl007.chapter_5

import com.tomekl007.UserTransaction
import org.apache.spark.{HashPartitioner, RangePartitioner, SparkContext}
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite
import org.scalatest.Matchers._

class UsePartitioner extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("should use different partitioners") {
    //given
    val keysWithValuesList =
      Array(
        UserTransaction("A", 100),
        UserTransaction("B", 4),
        UserTransaction("A", 100001),
        UserTransaction("B", 10),
        UserTransaction("C", 10)
      )
    val data = spark.parallelize(keysWithValuesList)
    val keyed = data.keyBy(_.userId)

    //when, then
    val partitioner = keyed.partitioner
    assert(partitioner.isEmpty)

    val hashPartitioner = keyed.partitionBy(new HashPartitioner(100))
    println(hashPartitioner)
    assert(hashPartitioner.partitioner.isDefined)

    val rangePartitioner = keyed.partitionBy(new RangePartitioner(100, keyed))
    println(rangePartitioner)
    assert(rangePartitioner.partitioner.isDefined)

  }
} 
Example 4
Source File: CustomRangePartitionerTest.scala    From Hands-On-Big-Data-Analytics-with-PySpark    with MIT License
package com.tomekl007.chapter_5

import com.tomekl007.UserTransaction
import org.apache.spark.sql.SparkSession
import org.apache.spark.{HashPartitioner, Partitioner, RangePartitioner, SparkContext}
import org.scalatest.FunSuite

class CustomRangePartitionerTest extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("should use custom range partitioner") {
    //given
    val keysWithValuesList =
      Array(
        UserTransaction("A", 100),
        UserTransaction("B", 4),
        UserTransaction("A", 100001),
        UserTransaction("B", 10),
        UserTransaction("C", 10)
      )
    val data = spark.parallelize(keysWithValuesList)
    val keyed = data.keyBy(_.amount)

    //when, then
    val partitioned = keyed.partitionBy(new CustomRangePartitioner(List((0,100), (100, 10000), (10000, 1000000))))

    //then
    partitioned.collect().toList
  }
}

// A hand-rolled alternative to RangePartitioner with explicitly supplied key ranges.
class CustomRangePartitioner(ranges: List[(Int, Int)]) extends Partitioner {
  override def numPartitions: Int = ranges.size

  override def getPartition(key: Any): Int = {
    if (!key.isInstanceOf[Int]) {
      throw new IllegalArgumentException("partitioner works only for Int type")
    }
    val keyInt = key.asInstanceOf[Int]
    // Both bounds of each range are inclusive; a key outside every range yields -1,
    // so the supplied ranges must cover all expected keys.
    val index = ranges.lastIndexWhere(v => keyInt >= v._1 && keyInt <= v._2)
    println(s"for key: $key return $index")
    index
  }
} 
Example 5
Source File: OrderedRDDFunctions.scala    From sparkoscope    with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging


// Sortable-key pair-RDD functions (obtained through an implicit conversion on such RDDs).
class OrderedRDDFunctions[K : Ordering : ClassTag,
                          V: ClassTag,
                          P <: Product2[K, V] : ClassTag] @DeveloperApi() (
    self: RDD[P])
  extends Logging with Serializable {

  private val ordering = implicitly[Ordering[K]]

  // Keep only elements whose keys fall in the inclusive range [lower, upper].
  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {

    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) =>
        val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndicies.contains)
      case _ =>
        self
    }
    rddToFilter.filter { case (k, v) => inRange(k) }
  }

} 
Example 6
Source File: StratifiedRepartition.scala    From mmlspark    with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.contracts.{HasLabelCol, Wrappable}
import org.apache.spark.RangePartitioner
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.HasSeed
import org.apache.spark.ml.util._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}


  // transform() from the StratifiedRepartition Transformer (the class declaration and its
  // params are omitted in this excerpt). Rows are re-sampled per label value and then
  // range-partitioned by a generated row index so the labels are spread across partitions.
  override def transform(dataset: Dataset[_]): DataFrame = {
    // Count unique values in label column
    val distinctLabelCounts = dataset.select(getLabelCol).groupBy(getLabelCol).count().collect()
    val labelToCount = distinctLabelCounts.map(row => (row.getInt(0), row.getLong(1)))
    val labelToFraction =
      getMode match {
        case SPConstants.Equal => getEqualLabelCount(labelToCount, dataset)
        case SPConstants.Mixed =>
          val equalLabelToCount = getEqualLabelCount(labelToCount, dataset)
          val normalizedRatio = equalLabelToCount.map { case (label, count) => count }.sum / labelToCount.length
          labelToCount.map { case (label, count) => (label, count / normalizedRatio)}.toMap
        case SPConstants.Original => labelToCount.map { case (label, count) => (label, 1.0) }.toMap
        case _ => throw new Exception(s"Unknown mode specified to StratifiedRepartition: $getMode")
      }
    val labelColIndex = dataset.schema.fieldIndex(getLabelCol)
    val spdata = dataset.toDF().rdd.keyBy(row => row.getInt(labelColIndex))
      .sampleByKeyExact(true, labelToFraction, getSeed)
      .mapPartitions(keyToRow => keyToRow.zipWithIndex.map { case ((key, row), index) => (index, row) })
    val rangePartitioner = new RangePartitioner(dataset.rdd.getNumPartitions, spdata)
    val rspdata = spdata.partitionBy(rangePartitioner).mapPartitions(keyToRow =>
      keyToRow.map{case (key, row) => row}).persist()
    dataset.sqlContext.createDataFrame(rspdata, dataset.schema)
  }

  private def getEqualLabelCount(labelToCount: Array[(Int, Long)], dataset: Dataset[_]): Map[Int, Double] = {
    val maxLabelCount = Math.max(labelToCount.map { case (label, count) => count }.max, dataset.rdd.getNumPartitions)
    labelToCount.map { case (label, count) => (label, maxLabelCount.toDouble / count) }.toMap
  }

  def transformSchema(schema: StructType): StructType = schema

  def copy(extra: ParamMap): StratifiedRepartition = defaultCopy(extra)
} 
Example 7
Source File: OrderedRDDFunctions.scala    From multi-tenancy-spark    with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging


// Sortable-key pair-RDD functions (obtained through an implicit conversion on such RDDs).
class OrderedRDDFunctions[K : Ordering : ClassTag,
                          V: ClassTag,
                          P <: Product2[K, V] : ClassTag] @DeveloperApi() (
    self: RDD[P])
  extends Logging with Serializable {

  private val ordering = implicitly[Ordering[K]]

  // Keep only elements whose keys fall in the inclusive range [lower, upper].
  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {

    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) =>
        val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndicies.contains)
      case _ =>
        self
    }
    rddToFilter.filter { case (k, v) => inRange(k) }
  }

} 
Example 8
Source File: OrderedRDDFunctions.scala    From iolap    with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Logging, Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi


// Sortable-key pair-RDD functions (obtained through an implicit conversion on such RDDs).
class OrderedRDDFunctions[K : Ordering : ClassTag,
                          V: ClassTag,
                          P <: Product2[K, V] : ClassTag] @DeveloperApi() (
    self: RDD[P])
  extends Logging with Serializable {

  private val ordering = implicitly[Ordering[K]]

  // Keep only elements whose keys fall in the inclusive range [lower, upper].
  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {

    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) => {
        val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndicies.contains)
      }
      case _ =>
        self
    }
    rddToFilter.filter { case (k, v) => inRange(k) }
  }

} 
Example 9
Source File: OrderedRDDFunctions.scala    From Spark-2.3.1    with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging


// Sortable-key pair-RDD functions (obtained through an implicit conversion on such RDDs).
class OrderedRDDFunctions[K : Ordering : ClassTag,
                          V: ClassTag,
                          P <: Product2[K, V] : ClassTag] @DeveloperApi() (
    self: RDD[P])
  extends Logging with Serializable {

  private val ordering = implicitly[Ordering[K]]

  // Keep only elements whose keys fall in the inclusive range [lower, upper].
  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {

    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) =>
        val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndicies.contains)
      case _ =>
        self
    }
    rddToFilter.filter { case (k, v) => inRange(k) }
  }

} 
Example 10
Source File: OrderedRDDFunctions.scala    From BigDatalog    with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Logging, Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi


// Sortable-key pair-RDD functions (obtained through an implicit conversion on such RDDs).
class OrderedRDDFunctions[K : Ordering : ClassTag,
                          V: ClassTag,
                          P <: Product2[K, V] : ClassTag] @DeveloperApi() (
    self: RDD[P])
  extends Logging with Serializable {

  private val ordering = implicitly[Ordering[K]]

  // Keep only elements whose keys fall in the inclusive range [lower, upper].
  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {

    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) => {
        val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndicies.contains)
      }
      case _ =>
        self
    }
    rddToFilter.filter { case (k, v) => inRange(k) }
  }

} 
Example 11
Source File: PartitionBy.scala    From learning-spark    with Apache License 2.0
package com.javachen.spark.examples.rdd

import org.apache.spark.{RangePartitioner, HashPartitioner, SparkContext}

object PartitionBy {
  def main(args: Array[String]) {

    val sc = new SparkContext("local", "ReduceByKeyToDriver Test")
    val data1 = Array[(String, Int)](("K", 1), ("T", 2),
      ("T", 3), ("W", 4),
      ("W", 5), ("W", 6)
    )
    val pairs = sc.parallelize(data1, 3)
    //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
    //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
    // Range-partition into 2 partitions using sampled key ranges (ascending = true)...
    var result = pairs.partitionBy(new RangePartitioner(2, pairs, true))
    // ...then repartition the same input with a HashPartitioner, overwriting `result`.
    result = pairs.partitionBy(new HashPartitioner(2))
    result.foreach(println)
  }
}