org.apache.spark.util.SizeEstimator Scala Examples

The following examples show how to use org.apache.spark.util.SizeEstimator, collected from several open-source Spark projects. Each example notes the source file and the project it was taken from.
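Before the examples, a minimal sketch of the API itself may help (the object name SizeEstimatorDemo and the sample data are ours, for illustration): SizeEstimator.estimate walks an object graph reflectively and returns an approximate heap footprint in bytes, which is not the same as a serialized size.

import org.apache.spark.util.SizeEstimator

object SizeEstimatorDemo {
  def main(args: Array[String]): Unit = {
    // Estimate the in-memory footprint of a 1000-element Long array.
    // Expect roughly 8000 bytes of data plus array/object header overhead.
    val data: Array[Long] = Array.tabulate(1000)(_.toLong)
    println(s"Estimated size: ${SizeEstimator.estimate(data)} bytes")
  }
}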
Example 1
Source File: PrimitiveVectorSuite.scala    From drizzle-spark    with Apache License 2.0
package org.apache.spark.util.collection

import org.apache.spark.SparkFunSuite
import org.apache.spark.util.SizeEstimator

class PrimitiveVectorSuite extends SparkFunSuite {

  test("primitive value") {
    val vector = new PrimitiveVector[Int]

    for (i <- 0 until 1000) {
      vector += i
      assert(vector(i) === i)
    }

    assert(vector.size === 1000)
    assert(vector.size == vector.length)
    intercept[IllegalArgumentException] {
      vector(1000)
    }

    for (i <- 0 until 1000) {
      assert(vector(i) == i)
    }
  }

  test("non-primitive value") {
    val vector = new PrimitiveVector[String]

    for (i <- 0 until 1000) {
      vector += i.toString
      assert(vector(i) === i.toString)
    }

    assert(vector.size === 1000)
    assert(vector.size == vector.length)
    intercept[IllegalArgumentException] {
      vector(1000)
    }

    for (i <- 0 until 1000) {
      assert(vector(i) == i.toString)
    }
  }

  test("ideal growth") {
    val vector = new PrimitiveVector[Long](initialSize = 1)
    vector += 1
    for (i <- 1 until 1024) {
      vector += i
      assert(vector.size === i + 1)
      assert(vector.capacity === Integer.highestOneBit(i) * 2)
    }
    assert(vector.capacity === 1024)
    vector += 1024
    assert(vector.capacity === 2048)
  }

  test("ideal size") {
    val vector = new PrimitiveVector[Long](8192)
    for (i <- 0 until 8192) {
      vector += i
    }
    assert(vector.size === 8192)
    assert(vector.capacity === 8192)
    val actualSize = SizeEstimator.estimate(vector)
    val expectedSize = 8192 * 8
    // Make sure we are not allocating a significant amount of memory beyond our expected.
    // Due to specialization wonkiness, we need to ensure we don't have 2 copies of the array.
    assert(actualSize < expectedSize * 1.1)
  }

  test("resizing") {
    val vector = new PrimitiveVector[Long]
    for (i <- 0 until 4097) {
      vector += i
    }
    assert(vector.size === 4097)
    assert(vector.capacity === 8192)
    vector.trim()
    assert(vector.size === 4097)
    assert(vector.capacity === 4097)
    vector.resize(5000)
    assert(vector.size === 4097)
    assert(vector.capacity === 5000)
    vector.resize(4000)
    assert(vector.size === 4000)
    assert(vector.capacity === 4000)
    vector.resize(5000)
    assert(vector.size === 4000)
    assert(vector.capacity === 5000)
    for (i <- 0 until 4000) {
      assert(vector(i) == i)
    }
    intercept[IllegalArgumentException] {
      vector(4000)
    }
  }
} 
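Two assertions in this suite deserve a note. The "ideal growth" test encodes a doubling policy: after the (i + 1)-th append, capacity is the smallest power of two greater than i, computed as Integer.highestOneBit(i) * 2. The "ideal size" test expects 8192 longs to occupy about 8192 * 8 = 65536 bytes and allows SizeEstimator at most 10% overhead on top. A standalone sketch of the capacity formula (GrowthDemo is ours, not part of the suite):

object GrowthDemo {
  def main(args: Array[String]): Unit = {
    // Integer.highestOneBit(i) isolates the top set bit of i, so doubling it
    // rounds i up to the next power of two, which is the capacity the test expects.
    for (i <- Seq(1, 5, 512, 1023)) {
      println(s"after ${i + 1} appends: capacity = ${Integer.highestOneBit(i) * 2}")
    }
    // Prints 2, 8, 1024, 1024.
  }
}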
The same PrimitiveVectorSuite.scala also appears verbatim in the sparkoscope, multi-tenancy-spark, iolap, Spark-2.3.1, and BigDatalog projects (all Apache License 2.0), and in spark1.52 with Chinese translations of the comments; those duplicate copies are omitted here.
Example 2
Source File: PrimitiveVectorSuite.scala    From SparkCore    with Apache License 2.0
This variant is identical to Example 1 apart from its header: it extends ScalaTest's FunSuite directly instead of Spark's SparkFunSuite wrapper.
package org.apache.spark.util.collection

import org.scalatest.FunSuite

import org.apache.spark.util.SizeEstimator

class PrimitiveVectorSuite extends FunSuite {

  test("primitive value") {
    val vector = new PrimitiveVector[Int]

    for (i <- 0 until 1000) {
      vector += i
      assert(vector(i) === i)
    }

    assert(vector.size === 1000)
    assert(vector.size == vector.length)
    intercept[IllegalArgumentException] {
      vector(1000)
    }

    for (i <- 0 until 1000) {
      assert(vector(i) == i)
    }
  }

  test("non-primitive value") {
    val vector = new PrimitiveVector[String]

    for (i <- 0 until 1000) {
      vector += i.toString
      assert(vector(i) === i.toString)
    }

    assert(vector.size === 1000)
    assert(vector.size == vector.length)
    intercept[IllegalArgumentException] {
      vector(1000)
    }

    for (i <- 0 until 1000) {
      assert(vector(i) == i.toString)
    }
  }

  test("ideal growth") {
    val vector = new PrimitiveVector[Long](initialSize = 1)
    vector += 1
    for (i <- 1 until 1024) {
      vector += i
      assert(vector.size === i + 1)
      assert(vector.capacity === Integer.highestOneBit(i) * 2)
    }
    assert(vector.capacity === 1024)
    vector += 1024
    assert(vector.capacity === 2048)
  }

  test("ideal size") {
    val vector = new PrimitiveVector[Long](8192)
    for (i <- 0 until 8192) {
      vector += i
    }
    assert(vector.size === 8192)
    assert(vector.capacity === 8192)
    val actualSize = SizeEstimator.estimate(vector)
    val expectedSize = 8192 * 8
    // Make sure we are not allocating a significant amount of memory beyond our expected.
    // Due to specialization wonkiness, we need to ensure we don't have 2 copies of the array.
    assert(actualSize < expectedSize * 1.1)
  }

  test("resizing") {
    val vector = new PrimitiveVector[Long]
    for (i <- 0 until 4097) {
      vector += i
    }
    assert(vector.size === 4097)
    assert(vector.capacity === 8192)
    vector.trim()
    assert(vector.size === 4097)
    assert(vector.capacity === 4097)
    vector.resize(5000)
    assert(vector.size === 4097)
    assert(vector.capacity === 5000)
    vector.resize(4000)
    assert(vector.size === 4000)
    assert(vector.capacity === 4000)
    vector.resize(5000)
    assert(vector.size === 4000)
    assert(vector.capacity === 5000)
    for (i <- 0 until 4000) {
      assert(vector(i) == i)
    }
    intercept[IllegalArgumentException] {
      vector(4000)
    }
  }
} 
Example 3
Source File: SizeTracker.scala    From spark1.52    with Apache License 2.0
package org.apache.spark.util.collection

import scala.collection.mutable

import org.apache.spark.util.SizeEstimator

// Only estimateSize() survived in the original excerpt; the trait header and
// the sampling state it reads are restored below for context. The sampling
// logic itself (which periodically calls SizeEstimator.estimate) is elided.
private[spark] trait SizeTracker {
  import SizeTracker.Sample

  // Samples of (estimated size, update count), most recent last.
  private val samples = new mutable.Queue[Sample]
  // Recent average number of bytes added per update, derived from the samples.
  private var bytesPerUpdate: Double = 0.0
  // Total number of updates seen so far.
  private var numUpdates: Long = 0L

  def estimateSize(): Long = {
    assert(samples.nonEmpty) // there must be at least one sample
    // Extrapolate from the most recent sample: multiply the updates seen since
    // that sample by the recent bytes-per-update rate, then add the size
    // recorded in that sample.
    val extrapolatedDelta = bytesPerUpdate * (numUpdates - samples.last.numUpdates)
    (samples.last.size + extrapolatedDelta).toLong
  }
}

private object SizeTracker {
  case class Sample(size: Long, numUpdates: Long)
}
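As a worked example of the extrapolation: if the most recent sample recorded size = 1000 bytes at numUpdates = 100, bytesPerUpdate is 8.0, and 50 more updates have arrived since (numUpdates = 150), then estimateSize() returns 1000 + 8.0 * 50 = 1400 bytes.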
Example 4
Source File: JoinOptimizerChromosome.scala    From bdg-sequila    with Apache License 2.0
package org.biodatageeks.sequila.rangejoins.optimizer

import jdk.nashorn.internal.ir.debug.ObjectSizeCalculator
import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.util.SizeEstimator
import org.biodatageeks.sequila.rangejoins.IntervalTree.{Interval, IntervalWithRow}
import org.biodatageeks.sequila.rangejoins.optimizer.RangeJoinMethod.RangeJoinMethod


class JoinOptimizerChromosome(spark: SparkSession, rdd: RDD[(String, Interval[Int], InternalRow)], rddCount: Long) {

  val logger = Logger.getLogger(this.getClass.getCanonicalName)
  val maxBroadcastSize = spark.sqlContext
    .getConf("spark.biodatageeks.rangejoin.maxBroadcastSize", "0") match {
    // Default: 0.1 * max(spark.driver.memory, 1 GiB).
    case "0" => 0.1 * scala.math.max(spark.sparkContext.getConf.getSizeAsBytes("spark.driver.memory", "0"), 1024 * (1024 * 1024))
    case conf => conf.toLong
  }
  val estBroadcastSize = estimateBroadcastSize(rdd, rddCount)


  private def estimateBroadcastSize(rdd: RDD[(String, Interval[Int], InternalRow)], rddCount: Long): Long = {
    try {
      (ObjectSizeCalculator.getObjectSize(rdd.first()) * rddCount) / 10
    } catch {
      case _: NoClassDefFoundError | _: ExceptionInInitializerError =>
        logger.warn("Method ObjectSizeCalculator.getObjectSize not available, falling back to Spark's SizeEstimator")
        SizeEstimator.estimate(rdd.first()) * rddCount
    }
    // FIXME: unclear why the raw estimate is ~10x the actual size (Spark's
    // row representation, or getObjectSize reporting bits?); hence the /10.
  }

  def debugInfo = {
    s"""
       |Broadcast structure size is ~ ${math.rint(100 * estBroadcastSize / 1024.0) / 100} kb
       |spark.biodatageeks.rangejoin.maxBroadcastSize is set to ${(maxBroadcastSize / 1024).toInt} kb
       |Using ${getRangeJoinMethod.toString} join method
     """.stripMargin
  }

  private def estimateRDDSizeSpark(rdd: RDD[(String,Interval[Int],InternalRow)]): Long = {
    math.round(SizeEstimator.estimate(rdd)/1024.0)
  }

  
  def getRangeJoinMethod: RangeJoinMethod = {
    if (estimateBroadcastSize(rdd, rddCount) <= maxBroadcastSize)
      RangeJoinMethod.JoinWithRowBroadcast
    else
      RangeJoinMethod.TwoPhaseJoin
  }
}
Example 5
Source File: JoinOptimizer.scala    From bdg-sequila    with Apache License 2.0
package org.biodatageeks.sequila.rangejoins.optimizer

import jdk.nashorn.internal.ir.debug.ObjectSizeCalculator
import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.util.SizeEstimator
import org.biodatageeks.sequila.rangejoins.IntervalTree.IntervalWithRow
import org.biodatageeks.sequila.rangejoins.optimizer.RangeJoinMethod.RangeJoinMethod


class JoinOptimizer(sc: SparkContext, rdd: RDD[IntervalWithRow[Int]], rddCount: Long) {

  val logger = Logger.getLogger(this.getClass.getCanonicalName)

  val maxBroadcastSize = sc
    .getConf
    .getOption("spark.biodatageeks.rangejoin.maxBroadcastSize") match {
      case Some(size) => size.toLong
      // Default: 0.1 * max(spark.driver.memory, 1 GiB).
      case _ => 0.1 * scala.math.max(sc.getConf.getSizeAsBytes("spark.driver.memory", "0"), 1024 * (1024 * 1024))
    }
  val estBroadcastSize = estimateBroadcastSize(rdd, rddCount)


  private def estimateBroadcastSize(rdd: RDD[IntervalWithRow[Int]], rddCount: Long): Long = {
    try {
      (ObjectSizeCalculator.getObjectSize(rdd.first()) * rddCount) / 10
    } catch {
      case _: NoClassDefFoundError | _: ExceptionInInitializerError =>
        logger.warn("Method ObjectSizeCalculator.getObjectSize not available, falling back to Spark's SizeEstimator")
        SizeEstimator.estimate(rdd.first()) * rddCount
    }
    // FIXME: unclear why the raw estimate is ~10x the actual size (Spark's
    // row representation, or getObjectSize reporting bits?); hence the /10.
  }

  def debugInfo = {
    s"""
       |Broadcast structure size is ~ ${estBroadcastSize / 1024} kb
       |spark.biodatageeks.rangejoin.maxBroadcastSize is set to ${maxBroadcastSize / 1024} kb
       |Using ${getRangeJoinMethod.toString} join method
     """.stripMargin
  }

  private def estimateRDDSizeSpark(rdd: RDD[IntervalWithRow[Int]]): Long = {
    math.round(SizeEstimator.estimate(rdd)/1024.0)
  }

  
  def getRangeJoinMethod: RangeJoinMethod = {
    if (estimateBroadcastSize(rdd, rddCount) <= maxBroadcastSize)
      RangeJoinMethod.JoinWithRowBroadcast
    else
      RangeJoinMethod.TwoPhaseJoin
  }
}
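Both bdg-sequila optimizers above share the same sizing strategy: try the JDK-internal Nashorn ObjectSizeCalculator first, and fall back to Spark's SizeEstimator when that class cannot be loaded (for example, on JDK builds that do not ship the Nashorn internals). A standalone sketch of that pattern (the SizeFallback object and the estimateObjectSize name are ours):

import org.apache.spark.util.SizeEstimator

object SizeFallback {
  // Estimate the size in bytes of any object graph, preferring the
  // JDK-internal calculator and degrading gracefully when it is absent.
  def estimateObjectSize(obj: AnyRef): Long =
    try {
      jdk.nashorn.internal.ir.debug.ObjectSizeCalculator.getObjectSize(obj)
    } catch {
      case _: NoClassDefFoundError | _: ExceptionInInitializerError =>
        SizeEstimator.estimate(obj)
    }
}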