org.apache.spark.partial.PartialResult Scala Examples

The following examples show how to use org.apache.spark.partial.PartialResult. They are drawn from open-source projects; the source file, project, and license are listed above each example.
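
Before the project examples, here is a minimal sketch of the API itself (not taken from any project below; the object name, master URL, and data sizes are illustrative): countApprox launches an approximate count job and returns a PartialResult[BoundedDouble] whose initialValue can be read without waiting for the full job to finish.

import org.apache.spark.partial.{BoundedDouble, PartialResult}
import org.apache.spark.sql.SparkSession

object PartialResultBasics {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("partial-result-demo").getOrCreate()
    val rdd = spark.sparkContext.parallelize(1 to 1000000)

    // Wait at most 200 ms for the count job; the estimate comes back with confidence bounds.
    val approx: PartialResult[BoundedDouble] = rdd.countApprox(timeout = 200, confidence = 0.95)
    val estimate: BoundedDouble = approx.initialValue
    println(s"estimate=${estimate.mean} in [${estimate.low}, ${estimate.high}]")

    // getFinalValue blocks until the underlying job completes, then returns the final value.
    println(s"final=${approx.getFinalValue().mean}")

    spark.stop()
  }
}
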
Example 1
Source File: Converters.scala    From sparker   with GNU General Public License v3.0
package SparkER.Utilities

import SparkER.BlockBuildingMethods.TokenBlocking
import org.apache.spark.rdd.RDD
import SparkER.DataStructures._
import org.apache.spark.partial.PartialResult


object Converters {

  /** Converts an RDD of profiles-with-blocks into an RDD of blocks (dirty or clean). */
  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks], separatorIDs: Array[Long] = Array.emptyLongArray): RDD[BlockAbstract] = {

    // Explode each profile into (blockID, profileID) pairs.
    val blockIDProfileID = profilesBlocks flatMap {
      profileWithBlocks =>
        val profileID = profileWithBlocks.profileID
        profileWithBlocks.blocks map {
          BlockWithSize =>
            (BlockWithSize.blockID, profileID)
        }
    }

    val blocks = blockIDProfileID.groupByKey().map {
      block =>
        val blockID = block._1
        val profilesID = block._2.toSet

        if (separatorIDs.isEmpty) {
          BlockDirty(blockID, Array(profilesID))
        }
        else {
          BlockClean(blockID, TokenBlocking.separateProfiles(profilesID, separatorIDs))
        }
    }

    // Keep only blocks that yield at least one comparison; map(x => x) re-types the result as RDD[BlockAbstract].
    blocks.filter(_.getComparisonSize() > 0).map(x => x)

  }
} 
Example 2
Source File: Converters.scala    From sparker   with GNU General Public License v3.0
package Utilities

import BlockBuildingMethods.TokenBlocking
import org.apache.spark.rdd.RDD
import DataStructures._
import org.apache.spark.partial.PartialResult


object Converters {

  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks], separatorIDs: Array[Long] = Array.emptyLongArray): RDD[BlockAbstract] = {

    val blockIDProfileID = profilesBlocks flatMap {
      profileWithBlocks =>
        val profileID = profileWithBlocks.profileID
        profileWithBlocks.blocks map {
          BlockWithSize =>
            (BlockWithSize.blockID, profileID)
        }
    }

    val blocks = blockIDProfileID.groupByKey().map {
      block =>
        val blockID = block._1
        val profilesID = block._2.toSet

        if (separatorIDs.isEmpty) {
          BlockDirty(blockID, Array(profilesID))
        }
        else {
          BlockClean(blockID, TokenBlocking.separateProfiles(profilesID, separatorIDs))
        }
    }

    blocks.filter(_.getComparisonSize() >= 1).map(x => x)

  }
} 
Example 3
Source File: Converters.scala    From sparker   with GNU General Public License v3.0
package Utilities

import org.apache.spark.rdd.RDD
import DataStructures._
import org.apache.spark.partial.PartialResult


object Converters {

  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks], separatorID: Long = -1): RDD[BlockAbstract] = {

    val blockIDProfileID = profilesBlocks flatMap {
      profileWithBlocks =>
        val profileID = profileWithBlocks.profileID
        profileWithBlocks.blocks map {
          BlockWithSize =>
            (BlockWithSize.blockID, profileID)
        }
    }

    val blocks = blockIDProfileID.groupByKey().map {
      block =>
        val blockID = block._1
        val profilesID = block._2.toSet

        if (separatorID < 0) {
          BlockDirty(blockID, (profilesID, Set.empty))
        }
        else {
          BlockClean(blockID, profilesID.partition(_ <= separatorID))
        }
    }

    blocks.filter(_.getComparisonSize() >= 1).map(x => x)

  }
}
Example 4
Source File: DatasourceRDD.scala    From datasource-receiver   with Apache License 2.0
package org.apache.spark.streaming.datasource.receiver

import org.apache.spark.partial.{BoundedDouble, CountEvaluator, PartialResult}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.streaming.datasource.config.ParametersUtils
import org.apache.spark.streaming.datasource.models.{InputSentences, OffsetOperator}
import org.apache.spark.{Logging, Partition, TaskContext}

private[datasource]
class DatasourceRDD(
                     @transient sqlContext: SQLContext,
                     inputSentences: InputSentences,
                     datasourceParams: Map[String, String]
                   ) extends RDD[Row](sqlContext.sparkContext, Nil) with Logging with ParametersUtils {

  private var totalCalculated: Option[Long] = None

  private val InitTableName = "initTable"
  private val LimitedTableName = "limitedTable"
  private val TempInitQuery = s"select * from $InitTableName"

  val dataFrame = inputSentences.offsetConditions.fold(sqlContext.sql(inputSentences.query)) { case offset =>
    val parsedQuery = parseInitialQuery
    val conditionsSentence = offset.fromOffset.extractConditionSentence(parsedQuery)
    val orderSentence = offset.fromOffset.extractOrderSentence(parsedQuery, inverse = offset.limitRecords.isEmpty)
    val limitSentence = inputSentences.extractLimitSentence

    sqlContext.sql(parsedQuery + conditionsSentence + orderSentence + limitSentence)
  }

  private def parseInitialQuery: String = {
    if (inputSentences.query.toUpperCase.contains("WHERE") ||
      inputSentences.query.toUpperCase.contains("ORDER") ||
      inputSentences.query.toUpperCase.contains("LIMIT")
    ) {
      sqlContext.sql(inputSentences.query).registerTempTable(InitTableName)
      TempInitQuery
    } else inputSentences.query
  }

  def progressInputSentences: InputSentences = {
    if (!dataFrame.rdd.isEmpty()) {
      inputSentences.offsetConditions.fold(inputSentences) { case offset =>

        val offsetValue = if (offset.limitRecords.isEmpty)
          dataFrame.rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name))
        else {
          dataFrame.registerTempTable(LimitedTableName)
          val limitedQuery = s"select * from $LimitedTableName order by ${offset.fromOffset.name} " +
            s"${OffsetOperator.toInverseOrderOperator(offset.fromOffset.operator)} limit 1"

          sqlContext.sql(limitedQuery).rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name))
        }

        inputSentences.copy(offsetConditions = Option(offset.copy(fromOffset = offset.fromOffset.copy(
          value = Option(offsetValue),
          operator = OffsetOperator.toProgressOperator(offset.fromOffset.operator)))))
      }
    } else inputSentences
  }

  
  override def isEmpty(): Boolean = {
    totalCalculated.fold {
      withScope {
        partitions.length == 0 || take(1).length == 0
      }
    } { total => total == 0L }
  }

  override def getPartitions: Array[Partition] = dataFrame.rdd.partitions

  override def compute(thePart: Partition, context: TaskContext): Iterator[Row] = dataFrame.rdd.compute(thePart, context)

  override def getPreferredLocations(thePart: Partition): Seq[String] = dataFrame.rdd.preferredLocations(thePart)
} 
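
The imports of CountEvaluator, BoundedDouble and PartialResult above suggest the class also exposes an approximate count, which is not shown here. As a sketch only (the object and method names below are assumptions, not code from datasource-receiver), an already-known exact count can be exposed as a final PartialResult like this:

import org.apache.spark.partial.{BoundedDouble, PartialResult}

object ApproxCountSketch {
  // Wrap an exact count as a PartialResult: confidence 1.0, zero-width bounds,
  // and a final result so callers never block waiting for a better estimate.
  def exactCountAsPartialResult(count: Long): PartialResult[BoundedDouble] =
    new PartialResult(new BoundedDouble(count, 1.0, count, count), true)
}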
Example 5
Source File: RDDSamplers.scala    From sparkplug   with MIT License
package springnz.sparkplug.testkit

import java.lang.Math._

import com.typesafe.scalalogging.LazyLogging
import org.apache.spark.partial.{ BoundedDouble, PartialResult }
import org.apache.spark.rdd.RDD
import springnz.sparkplug.util.SerializeUtils

import scala.reflect.ClassTag

object RDDSamplers extends LazyLogging {
  def identitySampler[A: ClassTag](rdd: RDD[A]): RDD[A] = rdd

  def shrinkingSampler[A: ClassTag](sampleParams: RDDShrinkingSamplerParams = sourceRDDParams)(rdd: RDD[A]): RDD[A] =
    shrinkingSample(rdd, sampleParams)

  def takeSampler[A: ClassTag](count: Int, partitions: Int = -1)(rdd: RDD[A]): RDD[A] = {
    val sc = rdd.sparkContext
    val parts = if (partitions > 0) partitions else sc.defaultParallelism
    sc.parallelize(rdd.take(count), parts)
  }

  val sourceRDDParams = RDDShrinkingSamplerParams(
    testerFraction = 0.0001,
    scaleParam = 3000.0,
    scalePower = 0.30102999566398,
    minimum = 1000000.0,
    sequential = false)

  val derivedRDDParams = RDDShrinkingSamplerParams(
    testerFraction = 0.1,
    scaleParam = 1.0,
    scalePower = 1.0,
    minimum = 1000000.0,
    sequential = false)

  private[sparkplug] def shrinkFactor(
    testerFraction: Double,
    scaleParam: Double,
    scalePower: Double,
    minimum: Double,
    testerLength: Double): Double = {

    val fullLength = testerLength / testerFraction
    val calcFrac = Math.pow(fullLength, scalePower) / fullLength * scaleParam
    // don't bother shrinking to less than the minimum, but cap at 1.0
    min(if (minimum > 0) max(calcFrac, minimum / fullLength) else calcFrac, 1.0)
  }

  private[sparkplug] def shrinkingSample[A: ClassTag](rdd: RDD[A], params: RDDShrinkingSamplerParams): RDD[A] = {

    def getSample(params: RDDShrinkingSamplerParams): RDD[A] = {
      // Estimate the RDD's size with countApprox (waits at most 60 s) instead of an exact count.
      val approxSize: PartialResult[BoundedDouble] = rdd.countApprox(60000, 0.95)
      val sampleLength = approxSize.initialValue.mean * params.testerFraction
      if (sampleLength < 50) {
        // take a bigger shrinkingSample
        val updatedTesterFraction = params.testerFraction * 50 / sampleLength
        getSample(params.copy(testerFraction = updatedTesterFraction))
      } else {
        val sample = rdd.take(sampleLength.toInt)
        val tester = SerializeUtils.serialize(sample)
        val sampleFraction = shrinkFactor(params.testerFraction, params.scaleParam, params.scalePower,
          params.minimum, tester.length)
        val fullCount = sample.length / params.testerFraction

        val reSampled = if (params.sequential)
          rdd.sparkContext.parallelize(rdd.take((fullCount * sampleFraction).toInt), 10)
        else
          rdd.sample(withReplacement = true, sampleFraction, 0)
        reSampled
      }
    }

    if (params.scaleParam == 1.0 && params.scalePower == 1.0) {
      logger.info("Not sampling RDD since scaleParam==scalePower==1.0")
      rdd
    } else {
      logger.info(s"Sampling RDD with $params ...")
      getSample(params)
    }
  }

  case class RDDShrinkingSamplerParams(
      testerFraction: Double,
      scaleParam: Double,
      scalePower: Double,
      minimum: Double,
      sequential: Boolean) {

    def withSequential(newSequential: Boolean): RDDShrinkingSamplerParams = {
      RDDShrinkingSamplerParams(testerFraction, scaleParam, scalePower, minimum, newSequential)
    }
  }

} 
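
A hypothetical spark-shell usage of the shrinking sampler (the input path and the existing SparkContext sc are assumptions):

import org.apache.spark.rdd.RDD
import springnz.sparkplug.testkit.RDDSamplers

// Shrink a large source RDD for test use with the default sourceRDDParams; internally the
// sampler sizes the RDD with countApprox(60000, 0.95) rather than an exact count.
val source: RDD[String] = sc.textFile("hdfs:///path/to/large/input")
val shrunk: RDD[String] = RDDSamplers.shrinkingSampler[String]()(source)
println(s"sampled down to ${shrunk.count()} records")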
Example 6
Source File: DoubleDCFunctions.scala    From spark-flow   with Apache License 2.0
package com.bloomberg.sparkflow.dc

import org.apache.spark.partial.{BoundedDouble, PartialResult}
import org.apache.spark.util.StatCounter


class DoubleDCFunctions(self: DC[Double]) {

  def sum: DR[Double] = {
    self.mapToResult(_.sum)
  }

  def stats: DR[StatCounter] = {
    self.mapToResult(_.stats)
  }

  def mean: DR[Double] = {
    self.mapToResult(_.mean)
  }

  def variance: DR[Double] = {
    self.mapToResult(_.variance)
  }

  def stdev: DR[Double] = {
    self.mapToResult(_.stdev)
  }

  def sampleStdev: DR[Double] = {
    self.mapToResult(_.sampleStdev)
  }

  def sampleVariance: DR[Double] = {
    self.mapToResult(_.sampleVariance)
  }

  //  Experimental
  def meanApprox(timeout: Long,
                 confidence: Double = 0.95): DR[PartialResult[BoundedDouble]] = {
    self.mapToResult(_.meanApprox(timeout, confidence))
  }

  //  Experimental
  def sumApprox(timeout: Long,
                confidence: Double = 0.95): DR[PartialResult[BoundedDouble]] = {
    self.mapToResult(_.sumApprox(timeout, confidence))
  }

  def histogram(bucketCount: Int): DR[(Array[Double], Array[Long])] = {
    self.mapToResult(_.histogram(bucketCount))
  }

  def histogram(buckets: Array[Double], evenBuckets: Boolean = false): DR[Array[Long]] = {
    self.mapToResult(_.histogram(buckets, evenBuckets))
  }

}
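
The DC wrapper above delegates to Spark's DoubleRDDFunctions. A minimal spark-shell sketch of the underlying calls on a plain RDD[Double] (an existing SparkContext sc is assumed):

import org.apache.spark.partial.{BoundedDouble, PartialResult}

val values = sc.parallelize((1 to 100000).map(_.toDouble))

// Approximate mean with a 500 ms budget; the PartialResult carries confidence bounds.
val approxMean: PartialResult[BoundedDouble] = values.meanApprox(500, confidence = 0.95)
val bound: BoundedDouble = approxMean.initialValue
println(s"mean ~ ${bound.mean} in [${bound.low}, ${bound.high}]")

// sumApprox behaves the same way; onComplete fires once the final value is available.
values.sumApprox(500).onComplete(result => println(s"sum ~ ${result.mean}"))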