org.apache.spark.rdd.RDD Scala Examples

Example 1
package io.univalence.deltaqa.kpialgebra

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import shapeless.contrib.spire._
import spire.algebra._
import spire.implicits._

import scala.reflect.ClassTag

/**
 * Aggregate for the keys that exist on only one side of a comparison.
 *
 * @param count number of records folded into this part
 * @param part  accumulated value of those records
 * @tparam T value type; must have an additive monoid so parts can be summed
 */
case class DeltaPart[T: AdditiveMonoid](
  count: Long,
  part: T
)

/**
 * Aggregate for the keys that exist on both sides of a comparison.
 *
 * @param count     number of keys compared
 * @param countZero number of keys whose difference equals the additive identity
 * @param diff      accumulated `left - right`
 * @param error     accumulated squared difference (`diff * diff` per key)
 * @param left      accumulated left-hand values
 * @param right     accumulated right-hand values
 * @tparam T value type; must have an additive monoid so aggregates can be summed
 */
case class DeltaCommon[T: AdditiveMonoid](
  count: Long,
  countZero: Long,
  diff: T,
  error: T,
  left: T,
  right: T
)

/**
 * Result of comparing two keyed datasets.
 *
 * @param left   aggregate of the records whose key exists only on the left
 * @param right  aggregate of the records whose key exists only on the right
 * @param common aggregate of the records whose key exists on both sides
 */
case class Delta[L: AdditiveMonoid, R: AdditiveMonoid, C: AdditiveMonoid](
  left: DeltaPart[L],
  right: DeltaPart[R],
  common: DeltaCommon[C]
)

/**
 * Compares two keyed RDDs and summarises the differences as a [[Delta]].
 *
 * NOTE(review): the excerpt this was restored from had lost the `DeltaCommon(`
 * constructor call, the `](` closing the type-parameter list of `compare`, the
 * `fullOuterJoin(right).map({` chain and every closing brace. They are
 * reconstructed from the declared return types, the named arguments and the
 * `(Option, Option)` patterns — confirm against the upstream source.
 */
object KpiAlgebra {

  /**
   * Builds the [[DeltaCommon]] contribution of a single key present on both sides.
   *
   * @param left  value derived from the left record
   * @param right value derived from the right record
   * @return a `DeltaCommon` with `count = 1`, the difference `left - right`, its
   *         square as `error`, and `countZero = 1` when the difference is the
   *         additive identity
   */
  def computeCommon[LRC: AdditiveAbGroup: MultiplicativeSemigroup](left: LRC, right: LRC): DeltaCommon[LRC] = {
    val diff  = left - right
    val error = diff * diff
    DeltaCommon(
      count     = 1,
      countZero = if (diff == Monoid.additive[LRC].id) 1 else 0,
      diff      = diff,
      error     = error,
      left      = left,
      right     = right
    )
  }

  /** Additive monoid used to combine per-key [[Delta]] values. */
  def monoid[LM: AdditiveMonoid, RM: AdditiveMonoid, LRC: AdditiveMonoid]: Monoid[Delta[LM, RM, LRC]] =
    Monoid.additive[Delta[LM, RM, LRC]]

  /**
   * Joins `left` and `right` by key and folds every key into one [[Delta]].
   *
   * @param left  left-hand keyed records
   * @param right right-hand keyed records
   * @param flm   measure taken from a record found only on the left
   * @param frm   measure taken from a record found only on the right
   * @param flc   measure taken from the left record of a key found on both sides
   * @param frc   measure taken from the right record of a key found on both sides
   * @return the monoidal sum of all per-key deltas; `RDD.reduce` is used, so an
   *         empty join result raises the exception Spark throws for empty RDDs
   */
  def compare[
    K: ClassTag,
    L: ClassTag,
    R: ClassTag,
    LM: AdditiveMonoid: ClassTag,
    RM: AdditiveMonoid: ClassTag,
    LRC: AdditiveAbGroup: MultiplicativeSemigroup: ClassTag
  ](
    left: RDD[(K, L)],
    right: RDD[(K, R)]
  )(flm: L => LM, frm: R => RM, flc: L => LRC, frc: R => LRC): Delta[LM, RM, LRC] = {

    // A full outer join yields (Option, Option) pairs in which at least one side
    // is defined, so the three cases below cover every joined record.
    val map: RDD[Delta[LM, RM, LRC]] = left
      .fullOuterJoin(right)
      .map({
        case (_, (Some(l), None)) =>
          monoid[LM, RM, LRC].id
            .copy(left = DeltaPart(count = 1, part = flm(l)))
        case (_, (None, Some(r))) =>
          monoid[LM, RM, LRC].id
            .copy(right = DeltaPart(count = 1, part = frm(r)))
        case (_, (Some(l), Some(r))) =>
          monoid[LM, RM, LRC].id.copy(common = computeCommon(flc(l), frc(r)))
      })

    map.reduce((x, y) => monoid[LM, RM, LRC].op(x, y))
  }
}

/** Sample record carrying three `Long` measures, used by the smoke test below. */
case class KpiLeaf(
  l1: Long,
  l2: Long,
  l3: Long
)

/** Local smoke test that builds small keyed RDDs for [[KpiAlgebra]]. */
object KpiAlgebraTest {

  /**
   * Starts a local Spark context, builds the sample RDDs and stops the context.
   *
   * @param args ignored
   */
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("smoketest"))

    // Always release the local Spark context, even when a job fails.
    try {
      val parallelize: RDD[(Int, Int)] = sc.parallelize((1 to 4).zipWithIndex)

      // NOTE(review): the statement that produced the output quoted below is missing
      // from this excerpt — restore it from the upstream source.
      // Delta(DeltaPart(0,0),DeltaPart(0,0),DeltaCommon(4,4,0,0,6,6))

      val p2: RDD[(Int, KpiLeaf)] =
        sc.parallelize((1 to 4)).map(_ -> KpiLeaf(1, 2, 3))

      import spire.implicits._
      import shapeless.contrib.spire._

      ////println(((, p2)(identity, identity, identity, identity))
    } finally {
      sc.stop()
    }
  }
}

Example 2
Source File: Test1.scala    From BigData-News   with Apache License 2.0 7 votes vote down vote up
package com.vita.spark.test

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

/** Minimal demo: joins two keyed RDDs and prints one line per matching key. */
object Test1 {

  /**
   * Builds the sample RDDs, joins names with scores by student id and prints them.
   *
   * The `SparkConf` is left empty, so master and application name must be
   * supplied externally (for example by `spark-submit`).
   *
   * @param args ignored
   */
  def main(args: Array[String]): Unit = {

    val conf: SparkConf = new SparkConf()
    val sc: SparkContext = new SparkContext(conf)

    // Stop the context on every exit path so the application does not leak it.
    try {
      val list: List[String] = List("张无忌", "赵敏", "周芷若")
      val rdd: RDD[String] = sc.parallelize(list)

      val list1: List[(Int, String)] = List((1, "东方不败"), (2, "令狐冲"), (3, "林平之"))
      val list2: List[(Int, Int)] = List((1, 99), (2, 98), (3, 97))

      val rdd1: RDD[(Int, String)] = sc.parallelize(list1)
      val rdd2: RDD[(Int, Int)] = sc.parallelize(list2)

      // Destructure (id, (name, score)) instead of chaining tuple accessors;
      // the printed text is unchanged.
      rdd1.join(rdd2).foreach { case (id, (name, score)) =>
        println(s"学号: ${id}名字:${name} 分数:${score}")
      }
    } finally {
      sc.stop()
    }
  }
}

Example 3
Source File: SqlNetworkWordCount.scala    From drizzle-spark   with Apache License 2.0 6 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

/**
 * Lazily instantiated singleton instance of SparkSession.
 *
 * NOTE(review): the excerpt was cut off after `instance = SparkSession`; the
 * builder chain and closing braces are restored from the upstream Spark
 * `SqlNetworkWordCount` example — confirm against that source.
 */
object SparkSessionSingleton {

  // Not serialized with closures; re-created on first use in each JVM.
  @transient  private var instance: SparkSession = _

  /**
   * Returns the shared session, creating it from `sparkConf` on first use.
   *
   * @param sparkConf configuration used only when the session is first created
   */
  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder
        .config(sparkConf)
        .getOrCreate()
    }
    instance
  }
}
// scalastyle:on println 
Example 4
Source File: LocalTableScanExec.scala    From drizzle-spark   with Apache License 2.0 6 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.execution.metric.SQLMetrics

/**
 * Leaf execution node over a sequence of rows supplied directly to the node.
 *
 * NOTE(review): this excerpt is damaged — several expressions and every closing
 * brace of the member bodies are missing, so it does not compile as shown.
 * Restore it from the upstream Spark source before reuse.
 *
 * @param output attributes passed to `UnsafeProjection.create` and reported by `stringArgs`
 * @param rows   the rows to scan
 */
case class LocalTableScanExec(
    output: Seq[Attribute],
    rows: Seq[InternalRow]) extends LeafExecNode {

  // Registers the single "numOutputRows" metric used by this node.
  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  // NOTE(review): the empty-input branch has lost its value, and the else branch has
  // lost the receiver of `=> proj(r).copy()).toArray` (presumably a map over `rows`).
  private val unsafeRows: Array[InternalRow] = {
    if (rows.isEmpty) {
    } else {
      val proj = UnsafeProjection.create(output, output) => proj(r).copy()).toArray

  // At least 1; NOTE(review): the second argument of `math.min` is missing.
  private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1),

  // RDD over the projected rows, split into `numParallelism` slices.
  private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism)

  // NOTE(review): the call that takes the `{ r => ... }` function (presumably a map
  // over `rdd`) and the value returned per row are missing.
  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows") { r =>
      numOutputRows += 1

  // Marks an empty scan with "<empty>"; NOTE(review): the else branch is missing.
  override protected def stringArgs: Iterator[Any] = {
    if (rows.isEmpty) {
      Iterator("<empty>", output)
    } else {

  // NOTE(review): body missing from the excerpt.
  override def executeCollect(): Array[InternalRow] = {

  // Takes the first `limit` projected rows; NOTE(review): the rest of the body is missing.
  override def executeTake(limit: Int): Array[InternalRow] = {
    val taken = unsafeRows.take(limit)
Example 5
Source File: GraphGeneration.scala    From Mastering-Machine-Learning-with-Spark-2.x   with MIT License 6 votes vote down vote up
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.lib.TriangleCount
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

// Script-style walkthrough that builds several GraphX graphs (loaded from edge lists,
// generated synthetically) and runs connected components, triangle counting and PageRank.
// NOTE(review): this excerpt is damaged — closing delimiters and parts of several
// expressions are missing (flagged below), so it does not compile as shown.
object GraphGeneration extends App {

  val conf = new SparkConf()
    .setAppName("Graph generation")
  val sc = new SparkContext(conf)

  // Load a graph directly from an edge-list text file.
  val edgeListGraph = GraphLoader.edgeListFile(sc, "./edge_list.txt")

  // Parse the same file by hand into (source, destination) vertex-id pairs.
  // NOTE(review): the brace closing this `map { ... }` is missing.
  val rawEdges: RDD[(VertexId, VertexId)] = sc.textFile("./edge_list.txt").map {
    line =>
      val field = line.split(" ")
      (field(0).toLong, field(1).toLong)
  // Build a graph from the raw pairs, giving every vertex the empty string as attribute.
  val edgeTupleGraph = Graph.fromEdgeTuples(
    rawEdges=rawEdges, defaultValue="")

  // Synthetic graphs from the GraphX generators.
  val gridGraph = GraphGenerators.gridGraph(sc, 5, 5)
  val starGraph = GraphGenerators.starGraph(sc, 11)
  // NOTE(review): the closing parenthesis of this call is missing.
  val logNormalGraph  = GraphGenerators.logNormalGraph(
    sc, numVertices = 20, mu=1, sigma = 3

  // NOTE(review): the closing parenthesis of this call is missing.
  val actorGraph = GraphLoader.edgeListFile(
    sc, "./ca-hollywood-2009.txt", true

  val actorComponents = actorGraph.connectedComponents().cache

  // Counts vertices per `v._2` value.
  // NOTE(review): the right-hand side up to `v => ...` is missing; presumably a map
  // over `actorComponents.vertices` — confirm upstream.
  val clusterSizes
    v => (v._2, 1)).reduceByKey(_ + _)

  val smallActorGraph = GraphLoader.edgeListFile(sc, "./ca-hollywood-2009.txt")
  val strongComponents = smallActorGraph.stronglyConnectedComponents(numIter = 5)

  // Canonicalise and partition the actor graph before counting triangles.
  val canonicalGraph = actorGraph.mapEdges(e => 1).removeSelfEdges().convertToCanonicalEdges()
  val partitionedGraph = canonicalGraph.partitionBy(PartitionStrategy.RandomVertexCut)

  val triangles = TriangleCount.runPreCanonicalized(partitionedGraph)

  // PageRank with tolerance 0.0001; the reduce keeps the vertex with the larger rank.
  // NOTE(review): the delimiters closing this `reduce` are missing.
  val actorPrGraph: Graph[Double, Double] = actorGraph.pageRank(0.0001)
  actorPrGraph.vertices.reduce((v1, v2) => {
    if (v1._2 > v2._2) v1 else v2

  // NOTE(review): this line appears to fuse two statements (the trailing `>= 62).count`
  // has lost its beginning) — restore from the upstream source.
  actorPrGraph.inDegrees.filter(v => v._1 == 33024L).collect.foreach(println) >= 62).count

Example 6
Source File: PipePrintSampleCorpus.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.reading.corpus

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.PipeSampler
import de.unihamburg.vsis.sddf.visualisation.Table
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

/**
 * Pass-through pipe element that takes a random sample from the corpus held in the
 * pipe context and renders it as a table.
 *
 * NOTE(review): the excerpt is damaged — the call that consumed the
 * "Corpus sample of ..." message, any further cases of the match and all closing
 * braces are missing.
 *
 * @param count    number of tuples to sample
 * @param fIdNameM feature id/name mapping made available implicitly
 */
class PipePrintSampleCorpus(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping)
  extends PipeElementPassthrough[RDD[Tuple]] with PipeSampler {

  // Note: the sample is drawn from `pc.corpus`, not from `input`.
  def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: CorpusContext => {
        // Sample without replacement.
        val sample: Array[Tuple] = pc.corpus.takeSample(false, count)
        // NOTE(review): two statements are fused on the next line; the receiver of the
        // message string is missing.
        val table: Seq[Seq[String]] = createTupleTable(sample)"Corpus sample of " + sample.size + " tuples: ")


/** Companion offering a `new`-free constructor for [[PipePrintSampleCorpus]]. */
object PipePrintSampleCorpus {

  /**
   * @param count    number of tuples to sample
   * @param fIdNameM feature id/name mapping forwarded to the pipe element
   */
  def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping): PipePrintSampleCorpus = {
    new PipePrintSampleCorpus(count)
  }
}

Example 7
Source File: PipeContextReadCorpus.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.reading.corpus

import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.pipe.PipeElement
import scala.reflect.ClassTag

/**
 * Pipe element that discards its input and emits the corpus stored in the pipe context.
 *
 * @tparam A element type of the (ignored) input RDD
 */
class PipeContextReadCorpus[A: ClassTag] extends PipeElement[RDD[A], RDD[Tuple]] {

  /**
   * @param input       ignored
   * @param pipeContext must be a `CorpusContext`
   * @return the corpus held by the context
   * @throws Exception if the context is not a `CorpusContext`
   */
  def step(input: RDD[A])(implicit pipeContext: AbstractPipeContext): RDD[Tuple] = {
    pipeContext match {
      case pc: CorpusContext => pc.corpus
      // Fail with the same message the other pipe elements use instead of a bare MatchError.
      case _ => throw new Exception("Wrong AbstractPipeContext type.")
    }
  }
}

/** Companion offering a `new`-free constructor for [[PipeContextReadCorpus]]. */
object PipeContextReadCorpus {

  /**
   * Creates a reader whose input element type is `A`.
   *
   * The type argument is now forwarded to the class; previously it was ignored and
   * the result was always a `PipeContextReadCorpus[Nothing]`. Calls without an
   * explicit type argument still infer `Nothing` as before.
   */
  def apply[A: ClassTag](): PipeContextReadCorpus[A] = new PipeContextReadCorpus[A]()
}

Example 8
Source File: PipeAnalyseCorpus.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.reading.corpus

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.IdConverter
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.reading.TupleArray
import de.unihamburg.vsis.sddf.visualisation.model.ReadingModel
import de.unihamburg.vsis.sddf.pipe.context.ResultContext

/**
 * Pass-through pipe element that publishes its [[ReadingModel]] on the result context.
 *
 * NOTE(review): the excerpt had lost its closing braces; only those are restored.
 * `input` is not used by the visible code.
 */
class PipeAnalyseCorpus
  extends PipeElementPassthrough[RDD[Tuple]]
  with Serializable {

  override val _analysable = new ReadingModel

  /**
   * @param input       the corpus passing through (unused here)
   * @param pipeContext must be a `ResultContext`
   * @throws Exception if the context is not a `ResultContext`
   */
  def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: ResultContext =>
        pc.readingModel = Some(_analysable)
      case _ =>
        throw new Exception("Wrong AbstractPipeContext type.")
    }
  }
}


/** Companion offering a `new`-free constructor for [[PipeAnalyseCorpus]]. */
object PipeAnalyseCorpus {
  def apply(): PipeAnalyseCorpus = {
    new PipeAnalyseCorpus()
  }
}
Example 9
Source File: PipeStoreInContextGoldstandard.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

/** Pass-through pipe element that stores the incoming gold standard in the pipe context. */
class PipeStoreInContextGoldstandard extends PipeElementPassthrough[RDD[SymPair[Tuple]]] {

  /**
   * @param input       gold-standard pairs to store
   * @param pipeContext must be a `GoldstandardContext`
   * @throws Exception if the context is not a `GoldstandardContext`
   */
  def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: GoldstandardContext => pc.goldstandard = input
      // Fail with the same message the other pipe elements use instead of a bare MatchError.
      case _ => throw new Exception("Wrong AbstractPipeContext type.")
    }
  }
}

/** Companion offering a `new`-free constructor for [[PipeStoreInContextGoldstandard]]. */
object PipeStoreInContextGoldstandard {
  def apply(): PipeStoreInContextGoldstandard = new PipeStoreInContextGoldstandard()
}

Example 10
Source File: PipeReaderGoldstandardIdsPairs.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.IdConverter
import de.unihamburg.vsis.sddf.reading.IdConverterBasic
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

/**
 * Parses delimited text lines into symmetric pairs of tuple ids.
 *
 * NOTE(review): the excerpt had lost the `inputRdd.map(line` receiver and the
 * closing delimiters; they are restored from the parameter name and the use of
 * `line` in the body.
 *
 * @param separator   column separator of the input lines
 * @param idIndex1    column index of the first id
 * @param idIndex2    column index of the second id
 * @param idConverter converts the digit-only id string into a `Long`
 */
class PipeReaderGoldstandardIdsPairs(
    separator: Char = ',',
    idIndex1: Int = 0,
    idIndex2: Int = 1,
    idConverter: IdConverter = IdConverterBasic)
  extends PipeElement[RDD[String], RDD[SymPair[Long]]] {

  /**
   * @param inputRdd one line per gold-standard pair
   * @return one `SymPair` per input line
   */
  override def step(inputRdd: RDD[String])(implicit pipeContext: AbstractPipeContext): RDD[SymPair[Long]] = {
    inputRdd.map(line => {
      val parts = line.split(separator)
      // Strip every non-digit character before converting the id.
      val tupleId1 = idConverter.convert(parts(idIndex1).replaceAll("[^0-9]",""))
      val tupleId2 = idConverter.convert(parts(idIndex2).replaceAll("[^0-9]",""))
      new SymPair(tupleId1, tupleId2)
    })
  }
}


/** Companion offering a `new`-free constructor for [[PipeReaderGoldstandardIdsPairs]]. */
object PipeReaderGoldstandardIdsPairs {
  def apply(
      separator: Char = ',',
      idIndex1: Int = 0,
      idIndex2: Int = 1,
      idConverter: IdConverter = IdConverterBasic): PipeReaderGoldstandardIdsPairs = {
    new PipeReaderGoldstandardIdsPairs(separator, idIndex1, idIndex2, idConverter)
  }
}

Example 11
Source File: PipeReaderGoldstandard.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.Pipeline
import de.unihamburg.vsis.sddf.reading.IdConverter
import de.unihamburg.vsis.sddf.reading.IdConverterBasic
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

// Factory for a pipeline that reads gold-standard pairs from text lines.
// NOTE(review): the body is truncated. The visible expression builds a
// PipeReaderGoldstandardIdsPairs, whose `step` yields RDD[SymPair[Long]], while the
// declared result is a Pipeline producing RDD[SymPair[Tuple]] — presumably a further
// stage was appended after it. The closing braces are also missing.
object PipeReaderGoldstandardPairs {

  def apply(
    separator: Char = ',',
    idIndex1: Int = 0,
    idIndex2: Int = 1,
    idConverter: IdConverter = IdConverterBasic): Pipeline[RDD[String], RDD[SymPair[Tuple]]] = {
    PipeReaderGoldstandardIdsPairs(separator, idIndex1, idIndex2, idConverter)


// Factory for a pipeline that reads a cluster-style gold standard from text lines.
// NOTE(review): the body is truncated. `PipeReaderGoldstandardIdsCluster` is not
// defined or imported in this excerpt, and the rest of the expression that would
// produce the declared Pipeline type, together with the closing braces, is missing.
object PipeReaderGoldstandardCluster {

  def apply(
      separator: Char = ',',
      clusterIdIndex: Int = 0,
      tupleIdIndex: Int = 1,
      idConverter: IdConverter = IdConverterBasic): Pipeline[RDD[String], RDD[SymPair[Tuple]]] = {
    PipeReaderGoldstandardIdsCluster(separator, clusterIdIndex, tupleIdIndex, idConverter)

Example 12
Source File: PipeAnalyseGoldstandardCluster.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.ResultContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.visualisation.model.GoldstandardClusterModel

/**
 * Pass-through pipe element that records the clustered gold standard in its
 * [[GoldstandardClusterModel]] and publishes that model on the result context.
 */
class PipeAnalyseGoldstandardCluster extends PipeElementPassthrough[RDD[Seq[Long]]] {

  override val _analysable = new GoldstandardClusterModel

  /**
   * @param input       gold-standard clusters, each a sequence of tuple ids
   * @param pipeContext must be a `ResultContext`
   * @throws Exception if the context is not a `ResultContext`; the model has
   *                   already been given `input` at that point
   */
  def substep(input: RDD[Seq[Long]])(implicit pipeContext: AbstractPipeContext): Unit = {
    _analysable.goldstandard = input
    pipeContext match {
      case pc: ResultContext =>
        pc.goldstandardModelCluster = Some(_analysable)
      case _ =>
        throw new Exception("Wrong AbstractPipeContext type.")
    }
  }
}


/** Companion offering a `new`-free constructor for [[PipeAnalyseGoldstandardCluster]]. */
object PipeAnalyseGoldstandardCluster {

  def apply(): PipeAnalyseGoldstandardCluster = new PipeAnalyseGoldstandardCluster()
}

Example 13
Source File: PipePrintSampleGoldstandard.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.PipeSampler
import de.unihamburg.vsis.sddf.visualisation.Table
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

/**
 * Pass-through pipe element that takes a random sample from the gold standard held
 * in the pipe context and renders it as a table.
 *
 * NOTE(review): the excerpt is damaged — the call that consumed the
 * "Goldstandard sample of ..." message, any further cases of the match and all
 * closing braces are missing.
 *
 * @param count    number of pairs to sample
 * @param fIdNameM feature id/name mapping made available implicitly
 */
class PipePrintSampleGoldstandard(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping)
  extends PipeElementPassthrough[RDD[Tuple]] with PipeSampler {

  // Note: the sample is drawn from `pc.goldstandard`, not from `input`.
  def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: GoldstandardContext => {
        // Sample without replacement.
        val sample: Array[SymPair[Tuple]] = pc.goldstandard.takeSample(false, count)
        val table: Seq[Seq[String]] = createSymPairTable(sample)
        // NOTE(review): the receiver of the message string below is missing.
       "Goldstandard sample of " + sample.size + " tuples: ")


/** Companion offering a `new`-free constructor for [[PipePrintSampleGoldstandard]]. */
object PipePrintSampleGoldstandard {
  /**
   * @param count    number of pairs to sample
   * @param fIdNameM feature id/name mapping forwarded to the pipe element
   */
  def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping): PipePrintSampleGoldstandard = {
    new PipePrintSampleGoldstandard(count)
  }
}

Example 14
Source File: PipeReaderGoldstandardClusterOutput.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.reading.goldstandard

import java.util.regex.PatternSyntaxException

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.SddfContext.rddToRdd
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.IdConverter
import de.unihamburg.vsis.sddf.reading.IdConverterBasic
import de.unihamburg.vsis.sddf.reading.SymPair

/**
 * Parses delimited text lines of (cluster id, tuple id) columns into clusters of
 * tuple ids.
 *
 * NOTE(review): the excerpt is damaged — the receiver of the `=> { ... }` function
 * (presumably a map over `inputRdd`), the step that turns the (clusterId, tupleId)
 * pairs into the declared RDD[Seq[Long]] and all closing delimiters are missing.
 *
 * @param separator      column separator of the input lines
 * @param clusterIdIndex column index of the cluster id
 * @param tupleIdIndex   column index of the tuple id
 * @param idConverter    converts the digit-only id string
 */
class PipeReaderGoldstandardClusterOutput(
  separator: Char = ',',
  clusterIdIndex: Int = 0,
  tupleIdIndex: Int = 1,
  idConverter: IdConverter = IdConverterBasic)
  extends PipeElement[RDD[String], RDD[Seq[Long]]] {

  override def step(inputRdd: RDD[String])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Long]] = {
    // parse tuple ids
    val clusterIdTupleIdRdd = => {
      val parts = line.split(separator)
      // Strip every non-digit character before converting the ids.
      val tupleId = idConverter.convert(parts(tupleIdIndex).replaceAll("[^0-9]",""))
      val clusterId = idConverter.convert(parts(clusterIdIndex).replaceAll("[^0-9]",""))
      (clusterId, tupleId)


/** Companion offering a `new`-free constructor for [[PipeReaderGoldstandardClusterOutput]]. */
object PipeReaderGoldstandardClusterOutput {
  def apply(
      separator: Char = ',',
      clusterIdIndex: Int = 0,
      tupleIdIndex: Int = 1,
      idConverter: IdConverter = IdConverterBasic): PipeReaderGoldstandardClusterOutput = {
    new PipeReaderGoldstandardClusterOutput(separator, clusterIdIndex, tupleIdIndex, idConverter)
  }
}

Example 15
Source File: PipeAnalyseGoldstandard.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.ResultContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.GoldstandardModel

/**
 * Pass-through pipe element that records the gold standard in its
 * [[GoldstandardModel]] and publishes that model on the result context.
 */
class PipeAnalyseGoldstandard extends PipeElementPassthrough[RDD[SymPair[Tuple]]] {

  override val _analysable = new GoldstandardModel

  /**
   * @param input       gold-standard pairs
   * @param pipeContext must be a `ResultContext`
   * @throws Exception if the context is not a `ResultContext`; the model has
   *                   already been given `input` at that point
   */
  def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    _analysable.goldstandard = input
    pipeContext match {
      case pc: ResultContext =>
        pc.goldstandardModel = Some(_analysable)
      case _ =>
        throw new Exception("Wrong AbstractPipeContext type.")
    }
  }
}


/** Companion offering a `new`-free constructor for [[PipeAnalyseGoldstandard]]. */
object PipeAnalyseGoldstandard {

  def apply(): PipeAnalyseGoldstandard = new PipeAnalyseGoldstandard()
}

Example 16
Source File: PipePrintHeadGoldstandard.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.PipeSampler
import de.unihamburg.vsis.sddf.visualisation.Table
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

/**
 * Pass-through pipe element that takes the first `count` pairs of the gold standard
 * held in the pipe context and renders them as a table.
 *
 * NOTE(review): the excerpt is damaged — the call that consumed the
 * "Goldstandard sample of ..." message, any further cases of the match and all
 * closing braces are missing.
 *
 * @param count    number of pairs to take
 * @param fIdNameM feature id/name mapping made available implicitly
 */
class PipePrintHeadGoldstandard(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping)
  extends PipeElementPassthrough[RDD[SymPair[Tuple]]] with PipeSampler {

  // Note: the pairs are taken from `pc.goldstandard`, not from `input`.
  def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: GoldstandardContext => {
        val sample: Array[SymPair[Tuple]] = pc.goldstandard.take(count)
        val table: Seq[Seq[String]] = createSymPairTable(sample)
        // NOTE(review): the receiver of the message string below is missing.
       "Goldstandard sample of " + sample.size + " tuples: ")


/** Companion offering a `new`-free constructor for [[PipePrintHeadGoldstandard]]. */
object PipePrintHeadGoldstandard {
  /**
   * @param count    number of pairs to take
   * @param fIdNameM feature id/name mapping forwarded to the pipe element
   */
  def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping): PipePrintHeadGoldstandard = {
    new PipePrintHeadGoldstandard(count)
  }
}

Example 17
Source File: PipePrintHeadTuple.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.print

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.PipeSampler
import de.unihamburg.vsis.sddf.visualisation.Table

/**
 * Pass-through pipe element that takes the first `count` tuples of its input and
 * renders them as a table.
 *
 * NOTE(review): the excerpt is damaged — the call that consumed the
 * "Sample of ..." message and the closing braces are missing.
 *
 * @param count    number of tuples to take
 * @param fIdNameM feature id/name mapping made available implicitly
 */
class PipePrintHeadTuple(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping)
  extends PipeElementPassthrough[RDD[Tuple]] with PipeSampler {

  def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = {
    val sample: Array[Tuple] = input.take(count)
    // NOTE(review): two statements are fused on the next line; the receiver of the
    // message string is missing.
    val table: Seq[Seq[String]] = createTupleTable(sample)"Sample of " + sample.size + " tuples: ")


/** Companion offering a `new`-free constructor for [[PipePrintHeadTuple]]. */
object PipePrintHeadTuple {

  /**
   * @param count    number of tuples to take
   * @param fIdNameM feature id/name mapping forwarded to the pipe element
   */
  def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping): PipePrintHeadTuple = {
    new PipePrintHeadTuple(count)
  }
}

Example 18
Source File: PipeWordcount.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.examples

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._
import com.rockymadden.stringmetric.StringMetric
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

/**
 * Pipe element that counts how often each space-separated word occurs in the input lines.
 *
 * NOTE(review): the excerpt had lost the `words.map(word` receiver and the closing
 * braces; they are restored from the surrounding names and comments.
 */
class PipeWordcount()
  extends PipeElement[RDD[String], RDD[(String, Int)]] {

  /**
   * @param input lines of text
   * @return one (word, occurrences) pair per distinct word
   */
  def step(input: RDD[String])(implicit pipeContext: AbstractPipeContext): RDD[(String, Int)] = {
    // flatten the collection of word arrays
    val words = input.flatMap(line => line.split(" "))
    // initialize the counter of each word with one
    val wordsWithCounter = words.map(word => (word, 1))
    // add up all counters of the same word
    wordsWithCounter.reduceByKey(_ + _)
  }
}


// companion object for a better usability
object PipeWordcount {
  /** Creates a word-count pipe element without `new`. */
  def apply(): PipeWordcount = new PipeWordcount()
}
Example 19
Source File: AbstractPipeClusteringGraph.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.clustering

import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.VertexId
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.similarity.aggregator.Mean
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

/**
 * Base class for clustering pipe elements that turn scored tuple pairs into a GraphX
 * graph (tuples as vertices, aggregated similarity as edge attribute) and delegate the
 * clustering itself to `cluster`.
 *
 * NOTE(review): the excerpt is damaged — the receivers of the three lambdas below,
 * the two id arguments of `Edge(...)`, the key in `(, tuple)`, the final expression
 * of `step` and the closing braces are missing.
 */
abstract class AbstractPipeClusteringGraph
  extends PipeElement[RDD[(SymPair[Tuple], Array[Double])], RDD[Set[Tuple]]]
  with Serializable {
  // Implemented by subclasses: derive the clusters from the similarity graph.
  def cluster(graph: Graph[Tuple, Double]): RDD[Set[Tuple]]

  def step(input: RDD[(SymPair[Tuple], Array[Double])])(implicit pipeContext: AbstractPipeContext): RDD[Set[Tuple]] = {
    // Collapse each pair's similarity vector into one value via Mean.agrSimilarity.
    val duplicatePairsWithSimilarity =
      pair => (pair._1, Mean.agrSimilarity(pair._2))
    // One edge per pair, carrying the aggregated similarity as attribute.
    val edges: RDD[Edge[Double]] =
      pair => { Edge(,, pair._2) }

    // TODO optimize: it would be nice to build the graph only by using edge triplets
    // but as far as I know that's not possible
    val verticesNotUnique: RDD[(VertexId, Tuple)] =
      tuplePair => Seq(tuplePair._1, tuplePair._2)
    ).map(tuple => (, tuple))

    // delete all duplicate vertices
    val vertices = verticesNotUnique.distinct()

    // NOTE(review): the original comment called the edge type "Boolean"; the declared
    // edge type is Double. `null` is passed as the third argument of Graph.apply.
    val graph: Graph[Tuple, Double] = Graph.apply(vertices, edges, null)

Example 20
Source File: PipeAnalyseClustering.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.clustering

import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.ClusterModel
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.pipe.context.ResultContext

/**
 * Pass-through pipe element that fills its [[ClusterModel]] with the computed
 * clusters and the gold standard, then publishes the model on the result context.
 */
class PipeAnalyseClustering extends PipeElementPassthrough[RDD[Set[Tuple]]] {

  override val _analysable = new ClusterModel

  /**
   * @param input       computed clusters
   * @param pipeContext must be both a `GoldstandardContext` and a `ResultContext`
   * @throws Exception if the context does not have both traits
   */
  def substep(input: RDD[Set[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: GoldstandardContext with ResultContext =>
        _analysable.clusters = input
        _analysable.goldstandard = pc.goldstandard
        pc.clusterModel = Some(_analysable)
      case _ =>
        throw new Exception("Wrong AbstractPipeContext type.")
    }
  }
}


/** Companion offering a `new`-free constructor for [[PipeAnalyseClustering]]. */
object PipeAnalyseClustering {
  def apply(): PipeAnalyseClustering = {
    new PipeAnalyseClustering()
  }
}
Example 21
Source File: PipeWriterTupleCluster.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.writing


import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

/**
 * Pass-through pipe element that collects all clusters to the driver and iterates
 * over their tuples using a `TupleWriterFile` created for `file`.
 *
 * NOTE(review): the excerpt is damaged — the statements inside the two `foreach`
 * loops, anything after them and the closing braces are missing. `File` is used but
 * not imported in this excerpt.
 *
 * @param file      target file
 * @param separator column separator handed to the writer
 */
class PipeWriterTupleCluster(file: File, separator: Char = ',')
  extends PipeElementPassthrough[RDD[Set[Tuple]]] {

  def substep(input: RDD[Set[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    val writer = new TupleWriterFile(file, separator)
    // TODO write tuples to hdfs in parallel and merge them afterwards
    // collect() brings every cluster to the driver.
    val collected = input.collect()
    collected.foreach(set => {
      set.foreach(tuple => {


/** Companion offering a `new`-free constructor for [[PipeWriterTupleCluster]]. */
object PipeWriterTupleCluster {

  /**
   * @param file      target file
   * @param separator column separator
   */
  def apply(file: File, separator: Char = ','): PipeWriterTupleCluster = {
    new PipeWriterTupleCluster(file, separator)
  }
}

Example 22
Source File: PipeWriterTuplePairs.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.writing

import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

/**
 * Pass-through pipe element that collects all tuple pairs to the driver and iterates
 * over them using a `TupleWriterFile` created for `file`.
 *
 * NOTE(review): the excerpt is damaged — the statements inside the `foreach`,
 * anything after it and the closing braces are missing. `File` is used but not
 * imported in this excerpt.
 *
 * @param file      target file
 * @param separator column separator handed to the writer
 */
class PipeWriterTuplePairs(file: File, separator: Char = ',') extends PipeElementPassthrough[RDD[SymPair[Tuple]]] {

  def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    val writer = new TupleWriterFile(file, separator)
    // collect() brings every pair to the driver.
    val collected = input.collect()
    collected.foreach(pair => {


/** Companion offering a `new`-free constructor for [[PipeWriterTuplePairs]]. */
object PipeWriterTuplePairs {
  /**
   * @param file      target file
   * @param separator column separator
   */
  def apply(file: File, separator: Char = ','): PipeWriterTuplePairs = {
    new PipeWriterTuplePairs(file, separator)
  }
}

Example 23
Source File: ClusterWriterCsvFile.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.writing


import org.apache.spark.rdd.RDD

import com.opencsv.CSVWriter

import de.unihamburg.vsis.sddf.reading.Tuple

/**
 * Writes clusters of tuples to a CSV file through opencsv's `CSVWriter`.
 *
 * NOTE(review): the excerpt is damaged — the folder-creation code announced by the
 * "create folders" comment, the right-hand side of `tupleIdSet`, the rest of `write`
 * (including any call that closes the writer) and all closing braces are missing.
 * `File` and `FileWriter` are used but not imported in this excerpt.
 *
 * @param file      target file
 * @param separator column separator
 */
class ClusterWriterCsvFile(file: File, separator: Char = ',') {

  // create folders

  // Convenience constructor taking the target path as a string.
  def this(path: String) = {
    this(new File(path))

  // Convenience constructor taking the parent folder and the file name.
  def this(folder: String, file: String) = {
    this(new File(folder, file))

  // Collects all clusters to the driver before writing.
  def write(clusterRdd: RDD[Set[Tuple]]): Unit = {
    val collectedClusters = clusterRdd.collect()
    val writer = new CSVWriter(new FileWriter(file), separator);
    // feed in your array (or convert your data to an array)
    collectedClusters.foreach(set => {
      // NOTE(review): the expression deriving the id strings from `set` is missing.
      val tupleIdSet: Set[String] = =>
      val tupleIdArray: Array[String] = tupleIdSet.toArray
Example 24
Source File: TupleWriterFile.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.writing


import org.apache.spark.rdd.RDD

import com.opencsv.CSVWriter

import de.unihamburg.vsis.sddf.reading.Tuple

/**
 * Writes tuples to a CSV file through opencsv's `CSVWriter`, which is opened when the
 * instance is constructed.
 *
 * NOTE(review): the excerpt is damaged — both operands of every `+:` expression, the
 * bodies of `close` and `blankLine` and all closing delimiters are missing. `File`
 * and `FileWriter` are used but not imported in this excerpt.
 *
 * @param file      target file
 * @param separator column separator
 */
class TupleWriterFile(file: File, separator: Char = ',') {

  val writer = new CSVWriter(new FileWriter(file), separator);

  // Writes a single tuple as one CSV row.
  def writeTuple[A <: Tuple](tuple: A): Unit = {
    writer.writeNext( +:

  // NOTE(review): body missing.
  def close() = {
  // NOTE(review): body missing.
  def blankLine() = {
  // Writes every tuple of a local collection, one row each.
  def writeTuple[A <: Tuple](tuples: Traversable[A]): Unit = {
    tuples.foreach(tuple => {
      writer.writeNext( +:

  // Collects the RDD to the driver, then writes one row per tuple.
  def writeTuple[A <: Tuple](tuples: RDD[A]): Unit = {
    val collectedTuples = tuples.collect()
    collectedTuples.foreach(tuple => {
      writer.writeNext( +:
Example 25
Source File: DummyIndexer.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.indexing

import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

/**
 * Indexer that pairs every tuple with every other tuple (full cartesian product),
 * without any blocking.
 */
class PipeIndexerDummy extends IndexingPipe {

  override val name = "DummyIndexer"

  /**
   * @param input the corpus
   * @return all distinct pairs whose two members differ; the cartesian product makes
   *         this quadratic in the size of `input`
   */
  def step(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): RDD[SymPair[Tuple]] = {
    val cartesian = input.cartesian(input).map(new SymPair(_))
    // filter identities like (a,a) and symmetric duplicates like (a,b) && (b,a)
    // (removing symmetric duplicates presumably relies on SymPair equality ignoring order — confirm)
    cartesian.filter(pair => pair._1 != pair._2).distinct()
  }
}

/** Companion offering a `new`-free constructor for [[PipeIndexerDummy]]. */
object PipeIndexerDummy {
  def apply(): PipeIndexerDummy = {
    new PipeIndexerDummy()
  }
}
Example 26
Source File: PipeAnalyseIndexer.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.indexing

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.pipe.context.ResultContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.IndexingModel

/**
 * Pass-through pipe element that fills its [[IndexingModel]] with the candidate pairs
 * and the corpus, then publishes the model on the result context.
 */
class PipeAnalyseIndexer extends PipeElementPassthrough[RDD[SymPair[Tuple]]] {

  override val _analysable: IndexingModel = new IndexingModel

  /**
   * @param input       candidate pairs produced by the indexer
   * @param pipeContext must be both a `CorpusContext` and a `ResultContext`
   * @throws Exception if the context does not have both traits
   */
  def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: CorpusContext with ResultContext =>
        _analysable.pairs = input
        _analysable.corpus = pc.corpus
        pc.indexingModel = Some(_analysable)
      case _ =>
        throw new Exception("Wrong AbstractPipeContext type.")
    }
  }
}


/** Companion offering a `new`-free constructor for [[PipeAnalyseIndexer]]. */
object PipeAnalyseIndexer {
  def apply(): PipeAnalyseIndexer = new PipeAnalyseIndexer
}
Example 27
Source File: PipeIndexerSortedNeighborhood.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.indexing

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.indexing.blocking.PipeBlockerSortedNeighborhood
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

// Factory object for the sorted-neighborhood indexer.
object PipeIndexerSortedNeighborhood {
  // Takes the window size (default 10) and an implicit blocking-key builder.
  // NOTE(review): no body follows the opening brace here, so what this factory
  // constructs cannot be told from this file — restore the body before compiling.
  def apply(windowSize: Int = 10)(implicit bkvBuilder: BlockingKeyBuilder) = {

  /**
   * Number of pairs produced by sliding a window of `windowSize` over
   * `elementCount` elements.
   *
   * The first window contributes `w * (w - 1) / 2` pairs; every later window adds the
   * `w - 1` pairs formed with its one newly entered element.
   *
   * The effective window `w` is capped at `elementCount` and floored at 1, so inputs
   * shorter than the window (or a non-positive window size) give a non-negative count
   * instead of the negative value the uncapped formula produces. Results for
   * `1 <= windowSize <= elementCount` are unchanged.
   *
   * @param elementCount number of elements being windowed
   * @param windowSize   requested window size
   * @return the pair count; arithmetic is plain `Int`, so very large inputs can overflow
   */
  def calcPairCount(elementCount: Int, windowSize: Int): Int = {
    val effectiveWindow = math.max(1, math.min(windowSize, elementCount))
    val windowCount = elementCount - effectiveWindow + 1
    val firstWindowPairs = (effectiveWindow * (effectiveWindow - 1)) / 2
    val laterWindowPairs = (windowCount - 1) * (effectiveWindow - 1)
    firstWindowPairs + laterWindowPairs
  }
Example 28
Source File: PipeAnalyseIndexerExtended.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.indexing

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.pipe.context.ResultContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.IndexingModelExtended

/**
 * Pass-through pipe element that records the pairs it receives, together with the
 * context's gold standard and corpus, in an [[IndexingModelExtended]].
 */
class PipeAnalyseIndexerExtended extends PipeElementPassthrough[RDD[SymPair[Tuple]]] {

  override val _analysable: IndexingModelExtended = new IndexingModelExtended

  /**
   * Fills `_analysable` with `input`, the gold standard and the corpus, then publishes
   * it as the context's `indexingModel`.
   *
   * @param input       the pairs to analyse
   * @param pipeContext must mix in `GoldstandardContext`, `CorpusContext` and `ResultContext`
   * @throws Exception if `pipeContext` does not provide all three traits
   */
  def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit =
    pipeContext match {
      case pc: GoldstandardContext with CorpusContext with ResultContext =>
        _analysable.pairs = input
        _analysable.goldstandard = pc.goldstandard
        _analysable.corpus = pc.corpus
        pc.indexingModel = Some(_analysable)
      case _ =>
        throw new Exception("Wrong AbstractPipeContext type.")
    }
}


/** Companion factory for [[PipeAnalyseIndexerExtended]]. */
object PipeAnalyseIndexerExtended {

  /** Creates a fresh [[PipeAnalyseIndexerExtended]]. */
  def apply(): PipeAnalyseIndexerExtended = new PipeAnalyseIndexerExtended
}
Example 29
Source File: PipeAnalyseBlocker.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.visualisation.model.IndexingModel
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.ResultContext
import de.unihamburg.vsis.sddf.visualisation.model.BlockingModel

/**
 * Pass-through pipe element that records the blocks it receives in a
 * [[BlockingModel]] and registers that model on the pipe context.
 */
class PipeAnalyseBlocker extends PipeElementPassthrough[RDD[Seq[Tuple]]] {

  override val _analysable: BlockingModel = new BlockingModel

  /**
   * Stores `input` as the model's blocks and publishes the model as the context's
   * `blockingModel`.
   *
   * @param input       the blocks to analyse
   * @param pipeContext must mix in `GoldstandardContext`, `CorpusContext` and `ResultContext`
   * @throws Exception if `pipeContext` does not provide all three traits
   */
  def substep(input: RDD[Seq[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit =
    pipeContext match {
      case pc: GoldstandardContext with CorpusContext with ResultContext =>
        _analysable.blocks = input
        pc.blockingModel = Some(_analysable)
      case _ =>
        throw new Exception("Wrong AbstractPipeContext type.")
    }
}


/** Companion factory for [[PipeAnalyseBlocker]]. */
object PipeAnalyseBlocker {

  /** Creates a fresh [[PipeAnalyseBlocker]]. */
  def apply(): PipeAnalyseBlocker = new PipeAnalyseBlocker
}
Example 30
Source File: PipeBlockerStandard.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

  // Builds (blocking key, tuple) pairs and groups them by key.
  // NOTE(review): both assignments below are syntactically incomplete as written (the
  // first has no expression before `=>`, the second ends in a dangling `> 1)`), and no
  // result expression or closing brace follows. Restore the missing code before compiling.
  def step(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Tuple]] = {
    // Each tuple `t` is paired with bkvBuilder.buildBlockingKey(t).
    val bkvTuplePairs: RDD[(String, Tuple)] = => (bkvBuilder.buildBlockingKey(t), t))
    // Tuples sharing a key are grouped; presumably small groups were filtered out — TODO confirm.
    val keyBlocks: RDD[(String, Iterable[Tuple])] = bkvTuplePairs.groupByKey > 1)

  // Analysis model of this element; marked @transient.
  @transient override val _analysable = new AlgoAnalysable
  // NOTE(review): the trailing `=` leaves this statement incomplete.
  _analysable.algo = this =
  override val name = "StandardBlocker"
  // Parameters reported for this algorithm: only the blocking-key builder.
  override val paramMap = Map("BlockingKeyBuilder" -> bkvBuilder)


/** Factory object for `PipeBlockerStandard`. */
object PipeBlockerStandard {

  /**
   * Creates a `PipeBlockerStandard`.
   *
   * @param bkvBuilder implicit blocking-key builder, resolved again implicitly by the
   *                   constructor call
   */
  def apply(implicit bkvBuilder: BlockingKeyBuilder) = new PipeBlockerStandard()
}

Example 31
Source File: PipeBlockerSortedNeighborhood.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.mllib.rdd.RDDFunctions.fromRDD
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

// Blocking pipe configured by a window size (default 10) and an implicit
// blocking-key builder.
class PipeBlockerSortedNeighborhood(windowSize: Int = 10)(implicit bkvBuilder: BlockingKeyBuilder)
    extends BlockingPipe
    with Parameterized {

  // Pairs tuples with their blocking key and orders the tuples by that key.
  // NOTE(review): the first assignment has no expression before `=>`, and nothing after
  // `sortedPairs` produces the declared RDD[Seq[Tuple]] result or uses `windowSize`.
  // Restore the missing code before compiling.
  def step(tuples: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Tuple]] = {
    // Each tuple `t` is paired with bkvBuilder.buildBlockingKey(t).
    val bkvTuplePairs: RDD[(String, Tuple)] = => (bkvBuilder.buildBlockingKey(t), t))
    // Sort by key, then drop the key and keep only the tuples.
    val sortedPairs = bkvTuplePairs.sortByKey().map(_._2)

  // Analysis model of this element; marked @transient.
  @transient override val _analysable = new AlgoAnalysable
  // NOTE(review): the trailing `=` leaves this statement incomplete.
  _analysable.algo = this =
  override val name = "SortedNeighborhoodBlocker"
  // Parameters reported for this algorithm: window size and blocking-key builder.
  override val paramMap = Map("windowSize" -> windowSize,
    "BlockingKeyBuilder" -> bkvBuilder)


/** Companion factory for [[PipeBlockerSortedNeighborhood]]. */
object PipeBlockerSortedNeighborhood {

  /**
   * Creates a sorted-neighborhood blocker.
   *
   * @param windowSize window size forwarded to the constructor (default 10)
   * @param bkvBuilder implicit blocking-key builder forwarded to the constructor
   */
  def apply(windowSize: Int = 10)(
    implicit bkvBuilder: BlockingKeyBuilder): PipeBlockerSortedNeighborhood =
    new PipeBlockerSortedNeighborhood(windowSize)
}

Example 32
Source File: PipeBlockerSuffixArray.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

  // Decides, from the number of tuples in a (suffix, tuples) pair, whether to keep it.
  // Three cases are distinguished: more than `maximumBlockSize` tuples, fewer than two
  // tuples, and everything in between.
  // NOTE(review): all three branch bodies are empty here, so the Boolean each case
  // yields cannot be read from this file — restore them before compiling.
  def filterBlocks(suffixTuplePair: (String, Seq[Tuple])): Boolean = {
    val tupleCount = suffixTuplePair._2.length
    if (tupleCount > maximumBlockSize) {
    } else if (tupleCount < 2) {
    } else {
/** Factory object for `PipeBlockerSuffixArray`. */
object PipeBlockerSuffixArray {

  /**
   * Creates a suffix-array blocker.
   *
   * @param minimumSuffixLength forwarded to the constructor (default 6)
   * @param maximumBlockSize    forwarded to the constructor (default 12)
   * @param bkvBuilder          implicit blocking-key builder forwarded to the constructor
   */
  def apply(minimumSuffixLength: Int = 6, maximumBlockSize: Int = 12)(
    implicit bkvBuilder: BlockingKeyBuilder) =
    new PipeBlockerSuffixArray(minimumSuffixLength, maximumBlockSize)
}

Example 33
Source File: SddfPipeContext.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.pipe.context

import org.apache.spark.rdd.RDD
import org.joda.time.Period

import de.unihamburg.vsis.sddf.visualisation.ModelRouter
import de.unihamburg.vsis.sddf.visualisation.logger.ModelRouterLogging

/**
 * Pipe context that mixes in corpus, gold-standard and result state.
 *
 * @param name        display name of the pipeline (default "Unnamed Pipeline")
 * @param modelRouter router handed to [[AbstractPipeContext]]; defaults to `ModelRouterLogging`
 */
class SddfPipeContext(
    val name: String = "Unnamed Pipeline",
    modelRouter: ModelRouter = ModelRouterLogging)
  extends AbstractPipeContext(modelRouter)
  with CorpusContext
  with GoldstandardContext
  with ResultContext {

  /** Runtime of the pipeline; `None` until something sets it. */
  var runtime: Option[Period] = None

  /** Optional file path associated with the pipeline; `None` until something sets it. */
  var filepath: Option[String] = None

  /** RDDs registered under a name, so later pipe elements can look them up. */
  val persistedRDDs = new scala.collection.mutable.HashMap[String, RDD[_]]()
}
Example 34
Source File: PipeOptimizeUnpersist.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.pipe.optimize

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext

// Pass-through pipe element that looks up an RDD registered under `rddname`
// in the context's `persistedRDDs` map.
class PipeOptimizeUnpersist[A](rddname: String) extends PipeElementPassthrough[RDD[A]] {

  // Requires an SddfPipeContext; any other context type raises an Exception.
  def substep(input: RDD[A])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: SddfPipeContext => {
        val rddOption = pc.persistedRDDs.get(rddname)
        // NOTE(review): the found branch only records the name in `analysable.values`;
        // no call that actually unpersists the RDD is visible here — confirm whether a
        // statement is missing.
        if (rddOption.isDefined) {
          analysable.values += ("RDD unpersisted" -> rddname)
        } else {
          // No RDD is registered under that name: warn instead of failing.
          log.warn("Can't unpersist RDD with the name " + rddname)
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")

/** Companion factory for [[PipeOptimizeUnpersist]]. */
object PipeOptimizeUnpersist {

  /**
   * Creates an unpersist element for the RDD registered under `rddname`.
   *
   * @tparam A element type of the RDD passed through the element
   * @param rddname key used to look the RDD up in the context
   */
  def apply[A](rddname: String): PipeOptimizeUnpersist[A] =
    new PipeOptimizeUnpersist[A](rddname)
}

Example 35
Source File: PipeOptimizePersistAndName.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.pipe.optimize

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext

// Pass-through pipe element that registers its input RDD under `rddname` in the
// context's `persistedRDDs` map.
// NOTE(review): `newLevel` is not used anywhere in the visible body, and no import of
// `StorageLevel` is visible in this file — confirm whether a persist call is missing.
class PipeOptimizePersistAndName[A](rddname: String = null, newLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends PipeElementPassthrough[RDD[A]] {
  // Requires an SddfPipeContext; any other context type raises an Exception.
  def substep(input: RDD[A])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: SddfPipeContext => {
        // Registration only happens when a name was supplied (null is the default).
        if(rddname != null){
          // NOTE(review): the next line has lost its assignment target and does not parse.
 = rddname
          pc.persistedRDDs += (rddname -> input)
          analysable.values += ("name" -> rddname)
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")

/** Companion factory for [[PipeOptimizePersistAndName]]. */
object PipeOptimizePersistAndName {

  /**
   * Creates a persist-and-name element.
   *
   * @tparam A element type of the RDD passed through the element
   * @param rddname  name to register the RDD under; `null` (the default) means no name
   * @param newLevel storage level forwarded to the constructor (default `MEMORY_ONLY`)
   */
  def apply[A](
    rddname: String = null,
    newLevel: StorageLevel = StorageLevel.MEMORY_ONLY): PipeOptimizePersistAndName[A] =
    new PipeOptimizePersistAndName[A](rddname, newLevel)
}

Example 36
Source File: RddUtils.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.sparkextensions

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD

/** Helpers for zipping RDDs whose sizes or partitioning differ. */
object RddUtils {

  /**
   * Zips two RDDs after moving each into a single partition and trimming the larger one
   * down to the size of the smaller one.
   *
   * Both inputs are repartitioned to one partition and counted, so this is only suitable
   * for data that fits a single partition.
   *
   * @return the element-wise pairs of the balanced RDDs
   */
  def securlyZipRdds[A, B: ClassTag](rdd1: RDD[A], rdd2: RDD[B]): RDD[(A, B)] = {
    val (left, right) = balanceRddSizes(rdd1.repartition(1), rdd2.repartition(1))
    // NOTE(review): the result expression was absent from the source; zipping the
    // balanced RDDs is what the declared RDD[(A, B)] return type requires.
    left.zip(right)
  }

  /**
   * Returns the two RDDs with randomly chosen elements removed from the larger one so
   * that both hold the same number of elements; equally sized inputs are returned as is.
   *
   * @throws IllegalArgumentException if the size difference does not fit into an `Int`
   */
  def balanceRddSizes[A, B](rdd1: RDD[A], rdd2: RDD[B]): (RDD[A], RDD[B]) = {
    val rdd1count = rdd1.count()
    val rdd2count = rdd2.count()
    if (rdd1count == rdd2count) {
      (rdd1, rdd2)
    } else {
      val gap = math.abs(rdd1count - rdd2count)
      // A silent Long -> Int narrowing would wrap around and request a bogus sample size.
      require(gap <= Int.MaxValue, s"RDD size difference $gap exceeds Int.MaxValue")
      val difference = gap.toInt
      if (rdd1count > rdd2count) (removeRandomElements(rdd1, difference), rdd2)
      else (rdd1, removeRandomElements(rdd2, difference))
    }
  }

  /**
   * Removes a random sample of `numberOfElements` elements from `rdd`.
   *
   * The sample is collected to the driver and removal is by value equality, so every
   * element equal to a sampled one is dropped; an RDD containing duplicates can
   * therefore lose more than `numberOfElements` elements.
   */
  def removeRandomElements[A](rdd: RDD[A], numberOfElements: Int): RDD[A] = {
    val sampled: Set[A] = rdd.takeSample(withReplacement = false, num = numberOfElements).toSet
    rdd.filter(element => !sampled.contains(element))
  }
}

Example 37
Source File: PipePrintHeadFalsePositives.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.rdd.RDD

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

// Prints the first `count` false positives (default 10); the shared printing logic
// lives in AbstractPipePrintFalseTuples.
class PipePrintHeadFalsePositives(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])])
  extends AbstractPipePrintFalseTuples(count) {

  // NOTE(review): the body is missing here, so how the false tuples are derived from the
  // gold standard and the input cannot be read from this file.
  def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = {

  // NOTE(review): the body is missing here; given the class name, presumably the first
  // `count` elements are taken — TODO confirm.
  def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = {
    // Message logged ahead of the printed table.
    // NOTE(review): "duplicate pairs which were not found" is the usual definition of
    // false NEGATIVES; check this text against selectFalseTuples and the sibling classes.
    def logMessage(count: Int): String = {
    "Printing " + count + " first false positives. (duplicate pairs which were not found)"

/** Companion factory for [[PipePrintHeadFalsePositives]]. */
object PipePrintHeadFalsePositives {

  /**
   * Creates the printer.
   *
   * @param count                number of tuples to print (default 10)
   * @param featureIdNameMapping implicit feature id/name mapping forwarded to the constructor
   * @param featureMeasures      implicit per-feature string metrics forwarded to the constructor
   */
  def apply(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])]): PipePrintHeadFalsePositives =
    new PipePrintHeadFalsePositives(count)
}

Example 38
Source File: PipeClassificationNaiveBayes.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import scala.beans.BeanInfo
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import org.apache.spark.mllib.classification.NaiveBayesModel

/**
 * Classification pipe element backed by MLlib's naive Bayes.
 *
 * @param lambda smoothing parameter handed to `NaiveBayes.train` (default 1.0)
 */
class PipeClassificationNaiveBayes(lambda: Double = 1.0) extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("lambda", lambda))

  /**
   * Trains a naive Bayes model on `trainingData` and predicts a label for every pair
   * in `symPairSim` from its similarity vector.
   *
   * @return each pair with its similarity vector and the predicted label
   */
  def trainModelAndClassify(
    trainingData: RDD[LabeledPoint],
    symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {
    val model = NaiveBayes.train(trainingData, lambda)

    log.debug("Classification Model:" + model)
    log.debug("Classification Model labels :" + model.labels.mkString(" "))
    log.debug("Classification Model pi:     " + model.pi.mkString(" "))
    // Render every theta row explicitly: `foreach` returns Unit, which would log "()".
    log.debug("Classification Model theta:  " + model.theta.map(_.mkString(" ")).mkString(" | "))

    // Marking Missing Values as Not Equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }
}


/** Companion factory for [[PipeClassificationNaiveBayes]]. */
object PipeClassificationNaiveBayes {

  /** Creates a naive Bayes classification element with the given smoothing `lambda`. */
  def apply(lambda: Double = 1.0): PipeClassificationNaiveBayes =
    new PipeClassificationNaiveBayes(lambda)
}
Example 39
Source File: PipeClassificationTrainingDataGenerator.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import scala.compat.Platform

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.similarity.SimilarityCalculator
import de.unihamburg.vsis.sddf.sparkextensions.RddUtils.securlyZipRdds
import de.unihamburg.vsis.sddf.visualisation.model.TrainingSetModel
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

/**
 * Pipe element that derives sampling fractions from the requested numbers of true
 * positive / true negative training pairs and passes them to `generateTrainingData`.
 *
 * @param truePositiveCount requested number of true positive pairs (default 500)
 * @param trueNegativeCount requested number of true negative pairs (default 500)
 * @param featureMeasures   implicit per-feature string metrics
 */
class PipeClassificationTrainingDataGenerator(
  truePositiveCount: Int = 500,
  trueNegativeCount: Int = 500)(
  implicit featureMeasures: Array[(Int, StringMetric[Double])])
  extends PipeElement[SymPairSim, (SymPairSim, RDD[LabeledPoint])]
  with Logging {

  /**
   * Computes the fractions of the gold standard and of the corpus that correspond to
   * the requested counts, caps each at 1.0, and returns the input together with the
   * generated training data.
   *
   * NOTE(review): `generateTrainingData` is not defined in the visible part of this
   * class; it has to be supplied elsewhere.
   *
   * @throws Exception if `pipeContext` lacks `GoldstandardContext` or `CorpusContext`
   */
  override def step(input: SymPairSim)(implicit pipeContext: AbstractPipeContext) = {
    pipeContext match {
      case pc: GoldstandardContext with CorpusContext =>
        val requestedPositiveFraction = truePositiveCount / pc.goldstandard.count.toDouble
        val requestedNegativeFraction = trueNegativeCount / pc.corpus.count.toDouble
        log.debug("True positive pair fraction taken from the gold standard for training purposes: " + requestedPositiveFraction)
        log.debug("True negative pair fraction taken from the corpus for training purposes: " + requestedNegativeFraction)

        // A fraction above 1.0 means more pairs were requested than exist.
        val truePositiveFraction =
          if (requestedPositiveFraction > 1.0) {
            log.debug("True positive pair fraction limited to 1.0")
            1.0
          } else requestedPositiveFraction
        val trueNegativeFraction =
          if (requestedNegativeFraction > 1.0) {
            log.debug("True negative pair fraction limited to 1.0")
            1.0
          } else requestedNegativeFraction

        val result = generateTrainingData(pc.corpus, pc.goldstandard,
          truePositiveFraction, trueNegativeFraction)
        (input, result)
      case _ =>
        throw new Exception("Wrong AbstractPipeContext type.")
    }
  }
}

/** Companion factory for [[PipeClassificationTrainingDataGenerator]]. */
object PipeClassificationTrainingDataGenerator {

  /** Sentinel count value; presumably stands for "use all available pairs" — TODO confirm. */
  val All = -1

  /**
   * Creates a training-data generator.
   *
   * @param truePositiveCount requested number of true positive pairs (default 500)
   * @param trueNegativeCount requested number of true negative pairs (default 500)
   * @param featureMeasures   implicit per-feature string metrics forwarded to the constructor
   */
  def apply(
      truePositiveCount: Int = 500,
      trueNegativeCount: Int = 500)(
      implicit featureMeasures: Array[(Int, StringMetric[Double])]): PipeClassificationTrainingDataGenerator =
    new PipeClassificationTrainingDataGenerator(truePositiveCount, trueNegativeCount)
}

Example 40
Source File: PipeClassificationDecisionTree.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable
import de.unihamburg.vsis.sddf.Parameterized
import org.apache.spark.mllib.classification.ClassificationModel

/**
 * Classification pipe element backed by an MLlib decision tree.
 *
 * @param impurity impurity measure passed to the trainer (default "gini")
 * @param maxDepth maximum tree depth (default 5)
 * @param maxBins  maximum number of bins (default 32)
 */
class PipeClassificationDecisionTree(
    impurity: String = "gini",
    maxDepth: Int = 5,
    maxBins: Int = 32)
  extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("impurity", impurity), ("maxDepth", maxDepth), ("maxBins", maxBins))

  /**
   * Trains a two-class decision tree (no categorical features) on `trainingData` and
   * predicts a label for every pair in `symPairSim` from its similarity vector.
   *
   * @return each pair with its similarity vector and the predicted label
   */
  def trainModelAndClassify(
    trainingData: RDD[LabeledPoint],
    symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {
    val model = DecisionTree.trainClassifier(trainingData, numClasses = 2,
      categoricalFeaturesInfo = Map[Int, Int](), impurity, maxDepth, maxBins)

    log.debug("Decision Tree Model:" + model)
    log.debug("Decision Tree:" + model.toDebugString)

    // Marking Missing Values as Not Equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }
}


/** Companion factory for [[PipeClassificationDecisionTree]]. */
object PipeClassificationDecisionTree {

  /**
   * Creates a decision-tree classification element.
   *
   * @param impurity impurity measure (default "gini")
   * @param maxDepth maximum tree depth (default 5)
   * @param maxBins  maximum number of bins (default 32)
   */
  def apply(
    impurity: String = "gini",
    maxDepth: Int = 5,
    maxBins: Int = 32): PipeClassificationDecisionTree =
    new PipeClassificationDecisionTree(impurity, maxDepth, maxBins)
}
Example 41
Source File: PipeClassificationSvm.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import scala.beans.BeanInfo
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import org.apache.spark.mllib.classification.SVMWithSGD

/**
 * Classification pipe element backed by MLlib's SVM trained with SGD.
 *
 * @param numIterations number of iterations handed to `SVMWithSGD.train` (default 100)
 */
class PipeClassificationSvm(numIterations: Int = 100) extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("numIterations", numIterations))

  /**
   * Trains an SVM on `trainingData` and predicts a label for every pair in
   * `symPairSim` from its similarity vector.
   *
   * @return each pair with its similarity vector and the predicted label
   */
  def trainModelAndClassify(
    trainingData: RDD[LabeledPoint],
    symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {
    val model = SVMWithSGD.train(trainingData, numIterations)

    log.debug("Classification Model:" + model)

    // Marking Missing Values as Not Equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }
}


/** Companion factory for [[PipeClassificationSvm]]. */
object PipeClassificationSvm {

  /** Creates an SVM classification element trained for `numIterations` iterations. */
  def apply(numIterations: Int = 100): PipeClassificationSvm =
    new PipeClassificationSvm(numIterations)
}
Example 42
Source File: PipePrintHeadFalseNegatives.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.rdd.RDD

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

// Prints the first `count` false negatives (default 10); the shared printing logic
// lives in AbstractPipePrintFalseTuples.
class PipePrintHeadFalseNegatives(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])])
  extends AbstractPipePrintFalseTuples(count) {

  // NOTE(review): the body is missing here, so how the false tuples are derived from the
  // gold standard and the input cannot be read from this file.
  def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = {

  // NOTE(review): the body is missing here; given the class name, presumably the first
  // `count` elements are taken — TODO confirm.
  def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = {
  // Message logged ahead of the printed table.
  // NOTE(review): "duplicate pairs which are no duplicates" is the usual definition of
  // false POSITIVES; check this text against selectFalseTuples and the sibling classes.
  def logMessage(count: Int): String = {
    "Printing " + count + " first false negatives. (duplicate pairs which are no duplicates)"


/** Companion factory for [[PipePrintHeadFalseNegatives]]. */
object PipePrintHeadFalseNegatives {

  /**
   * Creates the printer.
   *
   * @param count                number of tuples to print (default 10)
   * @param featureIdNameMapping implicit feature id/name mapping forwarded to the constructor
   * @param featureMeasures      implicit per-feature string metrics forwarded to the constructor
   */
  def apply(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])]): PipePrintHeadFalseNegatives =
    new PipePrintHeadFalseNegatives(count)
}

Example 43
Source File: PipePrintSampleFalseNegatives.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.rdd.RDD

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

// Prints a random sample of `count` false negatives (default 10); the shared printing
// logic lives in AbstractPipePrintFalseTuples.
class PipePrintSampleFalseNegatives(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])])
  extends AbstractPipePrintFalseTuples(count) {

  // NOTE(review): the body is missing here, so how the false tuples are derived from the
  // gold standard and the input cannot be read from this file.
  def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = {

  // Draws `count` elements at random, without replacement, and returns them as an array.
  def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = {
    falseTuplesWithSimilarity.takeSample(false, count)

  // Message logged ahead of the printed table.
  // NOTE(review): "duplicate pairs which are no duplicates" is the usual definition of
  // false POSITIVES; check this text against selectFalseTuples and the sibling classes.
  def logMessage(count: Int): String = {
    "Sampling " + count + " false negatives. (duplicate pairs which are no duplicates)"


/** Companion factory for [[PipePrintSampleFalseNegatives]]. */
object PipePrintSampleFalseNegatives {

  /**
   * Creates the printer.
   *
   * @param count                number of tuples to sample (default 10)
   * @param featureIdNameMapping implicit feature id/name mapping forwarded to the constructor
   * @param featureMeasures      implicit per-feature string metrics forwarded to the constructor
   */
  def apply(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])]): PipePrintSampleFalseNegatives =
    new PipePrintSampleFalseNegatives(count)
}

Example 44
Source File: PipeAnalyseClassificationTraining.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.ResultContext
import de.unihamburg.vsis.sddf.visualisation.model.TrainingSetModel

/**
 * Pass-through pipe element that records the labeled training set in a
 * [[TrainingSetModel]] and registers that model on the pipe context.
 */
class PipeAnalyseClassificationTraining
  extends PipeElementPassthrough[(SymPairSim, RDD[LabeledPoint])] {

  override val _analysable: TrainingSetModel = new TrainingSetModel

  /**
   * Stores the labeled points (second component of `input`) in `_analysable` and
   * publishes the model as the context's `trainingSetModel`.
   *
   * The labeled set is stored before the context type is checked, so it is assigned
   * even when the check fails.
   *
   * @param input       similarity pairs together with the labeled training set
   * @param pipeContext must mix in `ResultContext`
   * @throws Exception if `pipeContext` is not a `ResultContext`
   */
  def substep(
      input: (SymPairSim, RDD[LabeledPoint]))(
      implicit pipeContext: AbstractPipeContext): Unit = {
    _analysable.trainingsSetLabeled = input._2
    pipeContext match {
      case pc: ResultContext =>
        pc.trainingSetModel = Some(_analysable)
      case _ =>
        throw new Exception("Wrong AbstractPipeContext type.")
    }
  }
}


/** Companion factory for [[PipeAnalyseClassificationTraining]]. */
object PipeAnalyseClassificationTraining {

  /** Creates a fresh [[PipeAnalyseClassificationTraining]]. */
  def apply(): PipeAnalyseClassificationTraining = new PipeAnalyseClassificationTraining
}

Example 45
Source File: PipePrintSampleFalsePositives.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.rdd.RDD

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

// Prints a random sample of `count` false positives (default 10); the shared printing
// logic lives in AbstractPipePrintFalseTuples.
class PipePrintSampleFalsePositives(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])])
  extends AbstractPipePrintFalseTuples(count) {
  // NOTE(review): the body is missing here, so how the false tuples are derived from the
  // gold standard and the input cannot be read from this file.
  def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = {

  // Draws `count` elements at random, without replacement, and returns them as an array.
  def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = {
    falseTuplesWithSimilarity.takeSample(false, count)
  // Message logged ahead of the printed table.
  // NOTE(review): "duplicate pairs which were not found" is the usual definition of
  // false NEGATIVES; check this text against selectFalseTuples and the sibling classes.
  def logMessage(count: Int): String = {
    "Sampling " + count + " false positives. (duplicate pairs which were not found)"


/** Companion factory for [[PipePrintSampleFalsePositives]]. */
object PipePrintSampleFalsePositives {

  /**
   * Creates the printer.
   *
   * @param count                number of tuples to sample (default 10)
   * @param featureIdNameMapping implicit feature id/name mapping forwarded to the constructor
   * @param featureMeasures      implicit per-feature string metrics forwarded to the constructor
   */
  def apply(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])]): PipePrintSampleFalsePositives =
    new PipePrintSampleFalsePositives(count)
}

Example 46
Source File: AbstractPipeClassification.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

// Base class for classification pipe elements: takes similarity pairs plus a labeled
// training set and is declared to return similarity pairs.
abstract class AbstractPipeClassification()
  extends PipeElement[(SymPairSim, RDD[LabeledPoint]), SymPairSim]
  with Parameterized {

  // Analysis model for this algorithm.
  // NOTE(review): `this` is handed to the model while the instance is still being
  // constructed, so subclass fields may not be initialised yet at this point.
  override val _analysable = new AlgoAnalysable
  _analysable.algo = this

  // Implemented by subclasses: train a model on `trainingData` and return every pair of
  // `symPairSim` as (pair, similarity vector, predicted label).
  def trainModelAndClassify(
    trainingData: RDD[LabeledPoint],
    symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)]

  // Runs the classifier and keeps the pairs whose predicted label equals `Duplicate`.
  // Requires a context that is both a CorpusContext and a GoldstandardContext.
  // NOTE(review): no result expression follows `duplicatePairs` and the closing braces
  // are absent; presumably `duplicatePairs` is the result — TODO confirm.
  def step(input: (SymPairSim, RDD[LabeledPoint]))(implicit pipeContext: AbstractPipeContext): SymPairSim = {
    pipeContext match {
      case pc: CorpusContext with GoldstandardContext => {

        val symPairSim = input._1
        val trainingsSet = input._2

        val prediction = trainModelAndClassify(trainingsSet, symPairSim)

        // Drop the label again: keep only (pair, similarity vector).
        val duplicatePairs = prediction.filter(_._3 == Duplicate).map(tri => (tri._1, tri._2))

      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")

Example 47
Source File: AbstractPipePrintFalseTuples.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.SparkContext.rddToPairRDDFunctions
import org.apache.spark.rdd.RDD

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.PipeSampler
import de.unihamburg.vsis.sddf.visualisation.Table
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

// Base class for pipe elements that select wrongly classified pairs, attach their
// similarity vectors and render them as a table.
abstract class AbstractPipePrintFalseTuples(
  count: Int)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])])
  extends PipeElementPassthrough[RDD[(SymPair[Tuple], Array[Double])]]
  with PipeSampler {

  // Implemented by subclasses: choose the wrongly classified pairs given the gold
  // standard and the classified input pairs.
  def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]): RDD[SymPair[Tuple]]

  // Implemented by subclasses: reduce the false tuples to the array that gets printed.
  def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]): Array[(SymPair[Tuple], Array[Double])]

  // Implemented by subclasses: message describing the output.
  def logMessage(count: Int): String

  // Requires a GoldstandardContext.
  // NOTE(review): several statements below are syntactically incomplete (a truncated
  // call argument list, two assignments without a right-hand side, an empty else branch,
  // no fallback case and no closing braces). Restore the missing code before compiling.
  def substep(input: RDD[(SymPair[Tuple], Array[Double])])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: GoldstandardContext => {

        val falseTuples = selectFalseTuples(pc.goldstandard,

        if (falseTuples.count > 0) {
          // Pair every false tuple with a dummy Int so it can be joined by key.
          val dummyValue: RDD[(SymPair[Tuple], Int)] =, 1))
          // Left outer join keeps every false tuple, with its similarity vector if present.
          val join: RDD[(SymPair[Tuple], (Int, Option[Array[Double]]))] = dummyValue.leftOuterJoin(input)
          // Pairs without a similarity vector get an empty array.
          val falsePositivesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])] = => {
            (pair._1, pair._2._2.getOrElse(Array()))

          val falseTuplesSample = filterFalseTuplesForOutput(falsePositivesWithSimilarity)

          val table = createSymPairSimVectorTable(falseTuplesSample)

        } else {


Example 48
Source File: ExactDuplicateFilter.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up


import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Id
import de.unihamburg.vsis.sddf.reading.corpus.PipeStoreInContextCorpus
import de.unihamburg.vsis.sddf.reading.corpus.PipePrintSampleCorpus
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.reading.corpus.PipeReaderTupleCsv
import de.unihamburg.vsis.sddf.writing.TupleWriterFile

// Command-line entry point: expects exactly one argument naming an existing file and
// otherwise prints a usage hint.
// NOTE(review): `File` is used below but no import for it is visible in this file, and
// the two `count` lines are fused with string fragments and do not parse. Restore the
// missing code before compiling.
object ExactDuplicateFilter extends App with Logging {

  if (args.size == 1 && (new File(args(0))).exists()) {
    val conf = new SparkConf().setAppName("ExactDuplicateFilter")
    val sc = new SparkContext(conf)

    implicit val pipeContext = new SddfPipeContext
    // Single feature: id 0, named "content".
    val Content: (Int, String) = (0, "content")

    val featureMapping: Map[Int, String] = Map(Content)

    implicit val featureIdNameMapper = new FeatureIdNameMapping(featureMapping)

    val inputFileKey = "musicbrainz"

    // Parse Tuples
    val allFields: Seq[Int] = Seq(Content._1)
    val allFieldsWithId: Seq[Int] = Id +: allFields

    val parserPipe = new PipeTupleParserCsvIdContent(allFieldsWithId)
    // Parser followed by storing the corpus in the context and printing a sample.
    // NOTE(review): no statement that runs `pipe` is visible before the output is read.
    val pipe = parserPipe.append(PipeStoreInContextCorpus()).append(PipePrintSampleCorpus())
    val result: RDD[Tuple] = parserPipe.output.get
    val resultCount = result.count"Lines parsed: " + resultCount)
    val distinct = result.distinct()
    val distinctCount = distinct.count"Distinct Lines Count: " + distinctCount)"Lines removed: " + (resultCount - distinctCount))
    // Writer targeting "<input path>.distinct".
    // NOTE(review): no call that uses `tupleWriter` is visible here.
    val tupleWriter = new TupleWriterFile(new File(args(0) + ".distinct"))

  } else {
    println("Please provide a valid file path.")


/**
 * CSV tuple reader that splits every line into exactly two values: the first parsed
 * field and the remaining fields re-joined with commas.
 *
 * @param featureIds feature ids handed to [[PipeReaderTupleCsv]]
 */
class PipeTupleParserCsvIdContent(featureIds: Seq[Int]) extends PipeReaderTupleCsv(featureIds) {

  /**
   * Parses `line` with the inherited `parser` and collapses everything after the first
   * field into a single comma-separated value.
   *
   * NOTE(review): `head` fails if the parser returns no fields for a line — confirm that
   * blank lines cannot reach this method.
   */
  override def extractValues(line: String): Seq[String] = {
    val fields = parser.parseLine(line)
    Seq(fields.head, fields.tail.mkString(","))
  }
}
Example 49
Source File: PipeGoldstandardReaderClusterTest.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.test.reading.goldstandard

import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite

import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.reading.goldstandard.PipeReaderGoldstandardIdToTuple
import de.unihamburg.vsis.sddf.reading.goldstandard.PipeReaderGoldstandardIdsCluster
import de.unihamburg.vsis.sddf.test.util.FixtureHelper
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext
import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext

// Tests for reading a gold standard given in "clusterId,tupleId" cluster format.
// NOTE(review): in every test below `input` is created but never handed to the pipe
// before `output.get` is read, and the test bodies are not closed. The statements that
// run the pipes appear to be missing — restore them before compiling.
class PipeReaderGoldstandardClusterTest
  extends FunSuite
  with LocalSparkContext
  with TestSddfPipeContext
  with FixtureHelper {

  test("test goldstandard tuple reading in cluster format") {
    // format clusterId, tupleId
    // Cluster 1 holds one tuple and cluster 2 holds two; one result is expected.
    val input: RDD[String] = sc.parallelize(Seq("1,1", "2,2", "2,3"))
    val gsReaderPipe = PipeReaderGoldstandardIdsCluster()
    val gsIds = gsReaderPipe.output.get
    assert(gsIds.count() === 1)

    // Resolve the ids against a corpus built by initializeTuples(1, 3).
    val tuples: Seq[Tuple] = initializeTuples(1, 3)
    pc.corpus = sc.parallelize(tuples)
    val gsconverterPipe = new PipeReaderGoldstandardIdToTuple
    val gsTuple = gsconverterPipe.output.get
    assert(gsTuple.count() === 1)

  test("test goldstandard id reading in cluster format") {
    // format clusterId, tupleId
    val input: RDD[String] = sc.parallelize(Seq("1,1", "2,2", "2,3"))
    val gsReaderPipe = PipeReaderGoldstandardIdsCluster()
    val result = gsReaderPipe.output.get
    assert(result.count() === 1)

  test("test goldstandard cluster reader from file") {
    // Reads the fixture file from the test resources; 13 results are expected.
    val input = sc.textFile("src/test/resources/musicbrainz-1000.csv.dup")
    val gsReaderPipe = PipeReaderGoldstandardIdsCluster()
    val result = gsReaderPipe.output.get
    assert(result.collect().size === 13)

Example 50
Source File: StrongestPathClusteringTest.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.test.clustering

import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite

import de.unihamburg.vsis.sddf.clustering.PipeClusteringStrongestPath
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.test.util.FixtureHelper
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext
import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext

/**
 * Test for `PipeClusteringStrongestPath` on four weighted tuple pairs.
 *
 * NOTE(review): the visible code never passes `pairs` to the clusterer and the
 * closing braces are absent; the listing appears truncated.
 */
class StrongestPathClusteringTest
  extends FunSuite
  with LocalSparkContext
  with TestSddfPipeContext
  with FixtureHelper {

  test("simple cluster test") {
    // Each entry couples a symmetric tuple pair with an array of two doubles.
    // presumably these are similarity scores — confirm against the clusterer.
    val pair1 = (createTuplePair(1, 2), Array(0.4, 0.6))
    val pair2 = (createTuplePair(2, 4), Array(0.1, 0.2))
    val pair3 = (createTuplePair(4, 3), Array(0.6, 0.8))
    val pair4 = (createTuplePair(3, 1), Array(0.0, 0.2))

    val pairs: RDD[(SymPair[Tuple], Array[Double])] = sc.parallelize(Seq(pair1, pair2, pair3, pair4))
    val clusterer = new PipeClusteringStrongestPath
    val clusterResult: Array[Set[Tuple]] = clusterer.output.get.collect()

    // Expected: one cluster from the members of pair1 and one from the members of pair3.
    val expectedResult = Array(Set(pair1._1._1, pair1._1._2), Set(pair3._1._1, pair3._1._2))
    assert(clusterResult === expectedResult)

Example 51
Source File: ClusterAnalyserTest.scala (from the sddf project, GNU General Public License v3.0; 5 votes)
package de.unihamburg.vsis.sddf.test.evaluation

import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite

import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.test.util.FixtureHelper
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext
import de.unihamburg.vsis.sddf.visualisation.model.ClusterModel

/**
 * Checks the precision and recall reported by `ClusterModel` for a fixed set of
 * clusters and a fixed gold standard.
 *
 * NOTE(review): closing braces of the test, the helpers and the class are
 * absent in this listing; it appears truncated.
 */
class ClusterAnalyserTest extends FunSuite with LocalSparkContext with FixtureHelper {
  test("Precission and recall test") {

    val analyser = new ClusterModel
    analyser.clusters = buildClusters()
    analyser.goldstandard = buildGoldstandard()

    // Assuming initializeTuples(a, b) yields ids a..b inclusive (TODO confirm), the
    // clusters contain 3 + 1 + 3 = 7 pairs, of which (0,1) and (6,7) are in the
    // gold standard: precision 2/7, recall 2/3.
    assert(analyser.precision === 0.2857142857142857) // should be 2/7
    assert(analyser.recall === 0.6666666666666666) // should be 2/3


  /** Builds an RDD holding three clusters created from the ranges (0, 2), (3, 4) and (5, 7). */
  def buildClusters(): RDD[Set[Tuple]] = {
    val cluster1 = initializeTuples(0, 2).toSet
    val cluster2 = initializeTuples(3, 4).toSet
    val cluster3 = initializeTuples(5, 7).toSet

    sc.parallelize(Seq(cluster1, cluster2, cluster3))

  /** Builds an RDD holding the gold-standard pairs (0, 1), (4, 7) and (6, 7). */
  def buildGoldstandard(): RDD[SymPair[Tuple]] = {
    val pair1 = createTuplePair(0, 1)
    val pair2 = createTuplePair(4, 7)
    val pair3 = createTuplePair(6, 7)

    sc.parallelize(Seq(pair1, pair2, pair3))
Example 52
Source File: SparkApiTest.scala (from the sddf project, GNU General Public License v3.0; 5 votes)
package de.unihamburg.vsis.sddf.test

import org.apache.spark.rdd.RDD
import org.scalatest.Finders
import org.scalatest.FunSuite
import de.unihamburg.vsis.sddf.SddfContext.pairToInt
import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorRemoveRegex
import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorTrim
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Id
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Ignore
import de.unihamburg.vsis.sddf.reading.corpus.PipeReaderTupleCsv
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext
import de.unihamburg.vsis.sddf.test.util.MusicbrainzSchema

/**
 * Sanity test of RDD subtraction: parsing the same file twice and subtracting
 * one result from the other must leave nothing.
 *
 * NOTE(review): closing braces are absent in this listing; it appears truncated.
 */
class SparkApiTest extends FunSuite with LocalSparkContext with MusicbrainzSchema {

  test("test rdd substraction") {

    // Both RDDs are read from the same resource file.
    val file1 = sc.textFile("src/test/resources/musicbrainz-10.csv.dup")
    val file2 = sc.textFile("src/test/resources/musicbrainz-10.csv.dup")
    val data1 = parseTuples(file1)
    assert(data1.count() === 10)
    val data2 = parseTuples(file2)
    assert(data2.count() === 10)
    // Identical inputs, so the difference is expected to be empty.
    val substraction = data1.subtract(data2)
    assert(substraction.count() === 0)

Example 53
Source File: PipeDecisionTest.scala (from the sddf project, GNU General Public License v3.0; 5 votes)
package de.unihamburg.vsis.sddf.test.classification

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.scalatest.BeforeAndAfterAll
import org.scalatest.FunSuite

import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.classification.PipeClassificationDecisionTree
import de.unihamburg.vsis.sddf.classification.PipeClassificationNaiveBayes
import de.unihamburg.vsis.sddf.classification.PipeClassificationSvm
import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext

/**
 * Tests three classification pipes (naive Bayes, SVM, decision tree) against a
 * shared fixture of two candidate pairs and nine labelled training points.
 *
 * NOTE(review): this listing is heavily garbled. `Tuple(...) = 1` is not valid
 * Scala as written, several `val result =` lines have no right-hand side, the
 * body of `afterAll` is missing, and closing parentheses/braces are absent.
 * Restore from the upstream project before compiling.
 */
class PipeClassificationTest extends FunSuite with LocalSparkContext with BeforeAndAfterAll{
  // Fixture shared by all tests; assigned at the end of beforeAll().
  var input: (SymPairSim, RDD[LabeledPoint]) = _
  override def beforeAll() {
    // NOTE(review): the trailing `= n` presumably comes from a garbled id assignment — confirm.
    val tuple1 = Tuple("test1","test1","test1") = 1
    val tuple2 = Tuple("test2","test2","test2") = 2
    val tuple3 = Tuple("hans","franz","wurst") = 3
    // Two candidate pairs, each with a three-element double array.
    val symPairSim: SymPairSim = sc.parallelize(Seq(
      (new SymPair(tuple1, tuple2), Array(1D,1D,0D))
      ,(new SymPair(tuple2, tuple3), Array(0D,0D,1D))
    // Training set: five points labelled Duplicate and four labelled NoDuplicate.
    val trainingData: RDD[LabeledPoint] = sc.parallelize(Seq(
      LabeledPoint(label = Duplicate, features = Vectors.dense(Array(0.99,1.0,0.0)))
      ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.0)))
      ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.875,0.0)))
      ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.1)))
      ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.89,0.0)))
      ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.1,0.0,1.0)))
      ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.0,0.2,1.0)))
      ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.06,0.0,0.89)))
      ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.21,0.19,0.91)))
    input = (symPairSim, trainingData)

  // NOTE(review): the body of afterAll is missing in this listing.
  override def afterAll() {
	test("naive bayes classification test") {
    val classificationPipe = new PipeClassificationNaiveBayes()
    implicit val pipeContext = new SddfPipeContext()
    // NOTE(review): the expression producing `result` is missing.
    val result =
    assert(result.count === 1)
  test("svm classification test") {
    val classificationPipe = new PipeClassificationSvm()
    implicit val pipeContext = new SddfPipeContext()
    // NOTE(review): the expression producing `result` is missing.
    val result =
    assert(result.count === 1)

  test("decision tree classification test") {
    val classificationPipe = new PipeClassificationDecisionTree()
    implicit val pipeContext = new SddfPipeContext()
    // NOTE(review): the expression producing `result` is missing.
    val result =
    assert(result.count === 1)

Example 54
Source File: MusicbrainzSchema.scala (from the sddf project, GNU General Public License v3.0; 5 votes)
package de.unihamburg.vsis.sddf.test.util

import org.apache.spark.rdd.RDD
import org.scalatest.Suite

import de.unihamburg.vsis.sddf.SddfContext.pairToInt
import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorRemoveRegex
import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorTrim
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Id
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Ignore
import de.unihamburg.vsis.sddf.reading.corpus.PipeReaderTupleCsv

/**
 * Test mixin describing the columns of the Musicbrainz sample files and a
 * helper that assembles a CSV-reading/preprocessing pipe for them.
 *
 * The self-type requires the trait to be mixed into a ScalaTest `Suite`.
 */
trait MusicbrainzSchema extends TestSddfPipeContext { self: Suite =>

  // Feature descriptors as (feature id, feature name) pairs.
  val Number = (0, "number")
  val Title = (1, "title")
  val Length = (2, "length")
  val Artist = (3, "artist")
  val Album = (4, "album")
  val Year = (5, "year")
  val Language = (6, "language")

  // Lookup from feature id to feature name.
  val featureIdNameMapping = Map(Number, Title, Length, Artist, Album, Year, Language)

  implicit val featureIdNameMapper = new FeatureIdNameMapping(featureIdNameMapping)

  /**
   * Builds the parsing pipe for the given raw CSV lines.
   *
   * NOTE(review): the listing ends after the pipe is assembled; the code that
   * runs it on `input` and returns the tuples is not visible here.
   */
  def parseTuples(input: RDD[String]) = {
    // Parse Tuples
    // The (id, name) pairs are used where Int is expected; presumably via the imported
    // `pairToInt` implicit conversion — confirm.
    val allFields: Seq[Int] = Seq(Number, Title, Length, Artist, Album, Year, Language)
    // Column layout handed to the CSV reader: Ignore, Id, Ignore, then the seven features.
    val allFieldsWithId: Seq[Int] = Ignore +: Id +: Ignore +: allFields

    // Read CSV, trim all features, then apply the regex "[^0-9]" remover to
    // Number, Year and Length.
    val pipe = PipeReaderTupleCsv(allFieldsWithId)
      .append(PipePreprocessorTrim(allFields: _*))
      .append(PipePreprocessorRemoveRegex("[^0-9]", Number, Year, Length))


Example 55
Source File: SortedNeighbourhoodBlockerTest.scala (from the sddf project, GNU General Public License v3.0; 5 votes)
package de.unihamburg.vsis.sddf.test.blocking

import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite
import org.scalatest.Matchers
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilderBasic
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.reading.TupleArray
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext
import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext
import de.unihamburg.vsis.sddf.indexing.PipeIndexerSortedNeighborhood
import de.unihamburg.vsis.sddf.indexing.PipeIndexerSortedNeighborhood

/**
 * End-to-end test of `PipeIndexerSortedNeighborhood` with a window size of 3
 * over five tuples.
 *
 * NOTE(review): the listing is garbled. `tupleN.addFeature(...) = n` is not
 * valid Scala as written, the right-hand side of `blockingResult` is missing,
 * and closing parentheses/braces are absent.
 */
class SortedNeighborhoodIndexingTest
  extends FunSuite
  with LocalSparkContext
  with TestSddfPipeContext
  with Matchers {

  test("testing whole Sorted Neighborhood Indexer") {
    val featureId = 1
    implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 6))

    // Five tuples that all carry the same value for feature 0.
    // NOTE(review): the trailing `= n` presumably comes from a garbled id assignment — confirm.
    val tuple1: Tuple = new TupleArray(1)
    tuple1.addFeature(0, "blubluba") = 1
    val tuple2: Tuple = new TupleArray(1)
    tuple2.addFeature(0, "blubluba") = 2
    val tuple3: Tuple = new TupleArray(1)
    tuple3.addFeature(0, "blubluba") = 3
    val tuple4: Tuple = new TupleArray(1)
    tuple4.addFeature(0, "blubluba") = 4
    val tuple5: Tuple = new TupleArray(1)
    tuple5.addFeature(0, "blubluba") = 5
    val tuples = sc.parallelize(Seq(tuple1, tuple2, tuple3, tuple4, tuple5))

    val indexer = PipeIndexerSortedNeighborhood(windowSize = 3)
    // NOTE(review): the expression that runs the indexer on `tuples` is missing.
    val blockingResult: RDD[SymPair[Tuple]] =
    assert(blockingResult.count === 7)

    val resultArray = blockingResult.collect()
    // The seven expected pairs are exactly those whose numbering differs by 1 or 2.
    val expectedResult = Seq(
      new SymPair(tuple1, tuple2), new SymPair(tuple1, tuple3), new SymPair(tuple2, tuple3), new SymPair(tuple2, tuple4), new SymPair(tuple3, tuple4), new SymPair(tuple3, tuple5), new SymPair(tuple4, tuple5)

    // Order-insensitive comparison.
    resultArray should contain theSameElementsAs expectedResult

Example 56
Source File: SuffixArrayBlockingTest.scala (from the sddf project, GNU General Public License v3.0; 5 votes)
package de.unihamburg.vsis.sddf.test.blocking

import org.apache.spark.rdd.RDD
import org.scalatest.Finders
import org.scalatest.FunSuite

import de.unihamburg.vsis.sddf.indexing.PipeIndexerSuffixArray
import de.unihamburg.vsis.sddf.indexing.blocking.PipeBlockerSuffixArray
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilderBasic
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.reading.TupleArray
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext
import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext

/**
 * Tests for suffix-array blocking: suffix generation, block-size filtering and
 * the complete indexer.
 *
 * NOTE(review): the listing is garbled. `tupleN.addFeature(...) = n` is not
 * valid Scala as written, the right-hand side of `blockingResult` is missing,
 * and closing braces are absent.
 */
class SuffixArrayIndexingTest extends FunSuite with LocalSparkContext with TestSddfPipeContext {

  // "blockingkeyvalue" has 16 characters; with a minimum suffix length of 4 the
  // test expects 13 suffix/tuple pairs (16 - 4 + 1).
  test("testing suffix calculation") {
    val featureId = 0
    implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 2))

    val tuple1: Tuple = new TupleArray(1)
    tuple1.addFeature(0, "blockingkeyvalue") = 1
    val tuples: RDD[Tuple] = sc.parallelize(Seq(tuple1))

    val sab = PipeBlockerSuffixArray(minimumSuffixLength = 4, maximumBlockSize = 12)

    val suffixTuplePairs: Seq[(String, Tuple)] = sab.calcSuffixes(("blockingkeyvalue", tuple1))

    //    println("\n"))

    assert(suffixTuplePairs.length === 13)


  // With maximumBlockSize = 4 the assertions expect blocks of 2 or 4 tuples to be
  // kept and blocks of 1 or 5 tuples to be rejected.
  test("testing filter blocks") {
    val featureId = 0
    implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 2))

    val tuple1: Tuple = new TupleArray(1)
    tuple1.addFeature(0, "blockingkeyvalue") = 1
    val tuples = sc.parallelize(Seq(tuple1))

    val sab = new PipeBlockerSuffixArray(minimumSuffixLength = 4, maximumBlockSize = 4)

    // Five tuples: rejected.
    val suffixTuplePair = ("bla", Seq(tuple1, tuple1, tuple1, tuple1, tuple1))
    assert(sab.filterBlocks(suffixTuplePair) === false)

    // Four tuples: kept.
    val suffixTuplePair2 = ("bla", Seq(tuple1, tuple1, tuple1, tuple1))
    assert(sab.filterBlocks(suffixTuplePair2) === true)

    // One tuple: rejected.
    val suffixTuplePair3 = ("bla", Seq(tuple1))
    assert(sab.filterBlocks(suffixTuplePair3) === false)

    // Two tuples: kept.
    val suffixTuplePair4 = ("bla", Seq(tuple1, tuple1))
    assert(sab.filterBlocks(suffixTuplePair4) === true)

  // Three tuples sharing one feature value are expected to produce three pairs.
  test("testing whole SAB") {
    val featureId = 0
    implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 6))

    val tuple1: Tuple = new TupleArray(1)
    tuple1.addFeature(0, "blubluba") = 1
    val tuple2: Tuple = new TupleArray(1)
    tuple2.addFeature(0, "blubluba") = 2
    val tuple3: Tuple = new TupleArray(1)
    tuple3.addFeature(0, "blubluba") = 3
    val tuples = sc.parallelize(Seq(tuple1, tuple2, tuple3))

    val sab = PipeIndexerSuffixArray(minimumSuffixLength = 4, maximumBlockSize = 12)
    // NOTE(review): the expression that runs the indexer on `tuples` is missing.
    val blockingResult: RDD[SymPair[Tuple]] =
    // print(blockingResult.collect().map(symPair => (,"\n"))
    assert(blockingResult.count === 3)

Example 57
Source File: BisectingKMeansModel.scala (from the bisecting-kmeans project, Apache License 2.0; 5 votes)
package org.apache.spark.mllib.bisectingkmeans

import breeze.linalg.{Vector => BV, norm => breezeNorm}

import org.apache.spark.Logging
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD

  /**
   * Starts converting `this.node.toLinkageMatrix` into nested
   * `java.util.ArrayList`s of boxed doubles, creating one inner list per element.
   *
   * NOTE(review): the enclosing class and the rest of this method (filling each
   * row, adding it to `javaList`, returning the result) are not visible in this
   * listing.
   */
  def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = {
    val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]()
    this.node.toLinkageMatrix.foreach {x =>
      val row = new java.util.ArrayList[java.lang.Double]()
Example 58
Source File: TestFFM.scala (from the spark-ffm project, Apache License 2.0; 5 votes)
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification._
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.rdd.RDD

/**
 * Command-line driver that trains a field-aware factorization machine on a
 * text file, scores a held-out split and prints accuracy, AUROC and AUPRC.
 *
 * Arguments as read by the code: args(0) training file, args(1) third component
 * of `dim`, args(2) iteration count, args(3) eta, args(4)/args(5) the two
 * `regParam` components, args(6)/args(7) the two boolean `dim` components.
 *
 * NOTE(review): the object extends `App` and also overrides `main`; confirm that
 * this is intended. Several closing braces/parentheses and the receiver of the
 * scoring lambda are missing in this listing.
 */
object TestFFM extends App {

  override def main(args: Array[String]): Unit = {

    val sc = new SparkContext(new SparkConf().setAppName("TESTFFM").setMaster("local[4]"))

    // NOTE(review): eight arguments are required but the usage text names only seven
    // placeholders; nothing visible stops execution after printing the message.
    if (args.length != 8) {
      println("testFFM <train_file> <k> <n_iters> <eta> <lambda> " + "<normal> <random>")

    // Each line: a label followed by whitespace-separated "a:b:c" triples parsed as
    // (Int, Int, Double); labels greater than 0 map to 1.0, all others to -1.0.
    val data= sc.textFile(args(0)).map(_.split("\\s")).map(x => {
      val y = if(x(0).toInt > 0 ) 1.0 else -1.0
      val nodeArray: Array[(Int, Int, Double)] = x.drop(1).map(_.split(":")).map(x => {
        (x(0).toInt, x(1).toInt, x(2).toDouble)
      (y, nodeArray)
    // Random 70/30 split into training and testing data.
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (training: RDD[(Double, Array[(Int, Int, Double)])], testing) = (splits(0), splits(1))

    //sometimes the max feature/field number would be different in training/testing dataset,
    // so use the whole dataset to get the max feature/field number
    // Both maxima are computed on the driver after collecting every value.
    val m = data.flatMap(x=>x._2).map(_._1).collect.reduceLeft(_ max _) //+ 1
    val n = data.flatMap(x=>x._2).map(_._2).collect.reduceLeft(_ max _) //+ 1

    val ffm: FFMModel = FFMWithAdag.train(training, m, n, dim = (args(6).toBoolean, args(7).toBoolean, args(1).toInt), n_iters = args(2).toInt,
      eta = args(3).toDouble, regParam = (args(4).toDouble, args(5).toDouble), normalization = false, false, "adagrad")

    // Pairs of (thresholded prediction, label); predictions >= 0.5 become 1.0, others -1.0.
    // NOTE(review): the RDD this lambda is mapped over is missing from the listing.
    val scores: RDD[(Double, Double)] = => {
      val p = ffm.predict(x._2)
      val ret = if (p >= 0.5) 1.0 else -1.0
      (ret, x._1)

    val metrics = new BinaryClassificationMetrics(scores)
    val auROC = metrics.areaUnderROC
    val auPRC = metrics.areaUnderPR
    // Fraction of pairs whose two components are equal.
    val accuracy = scores.filter(x => x._1 == x._2).count().toDouble / scores.count()
    println(s"accuracy = $accuracy, Area under ROC = $auROC, Area under precision-recall curve = $auPRC")
Example 59
Source File: InferSchema.scala (from the Linkis project, Apache License 2.0; 5 votes)
package com.webank.wedatasphere.spark.excel

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._

/**
 * Helpers for merging Spark SQL `DataType`s while inferring a schema.
 *
 * NOTE(review): the listing is truncated. The body of `stringType`, the
 * `NullType` branch of `inferField`, the result of the numeric-promotion case
 * and all closing braces are missing.
 */
private[excel] object InferSchema {

  type CellType = Int

  /**
   * Combines the type inferred so far with the type of a newly seen field.
   *
   * In the visible `match`, a `NullType` accumulator adopts the new type; equal
   * Double/Boolean/Timestamp pairs are kept; every other combination falls back
   * to `stringType()`.
   */
  private[excel] def inferField(typeSoFar: DataType, field: DataType): DataType = {
    // Defining a function to return the StringType constant is necessary in order to work around
    // a Scala compiler issue which leads to runtime incompatibilities with certain Spark versions;
    // see issue #128 for more details.
    def stringType(): DataType = {

    // NOTE(review): the result for a NullType field is missing in this listing.
    if (field == NullType) {
    } else {
      (typeSoFar, field) match {
        case (NullType, ct) => ct
        case (DoubleType, DoubleType) => DoubleType
        case (BooleanType, BooleanType) => BooleanType
        case (TimestampType, TimestampType) => TimestampType
        case (StringType, _) => stringType()
        case (_, _) => stringType()

  // Types ordered from lowest to highest precedence; used by findTightestCommonType.
  private val numericPrecedence: IndexedSeq[DataType] =
    IndexedSeq[DataType](ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType, TimestampType)

  /**
   * Finds a common type for two types, if any: equal types stay as they are,
   * `NullType` yields the other type, `StringType` on either side wins, and any
   * otherwise unmatched pair yields `None`.
   */
  val findTightestCommonType: (DataType, DataType) => Option[DataType] = {
    case (t1, t2) if t1 == t2 => Some(t1)
    case (NullType, t1) => Some(t1)
    case (t1, NullType) => Some(t1)
    case (StringType, t2) => Some(StringType)
    case (t1, StringType) => Some(StringType)

    // Promote numeric types to the highest of the two and all numeric types to unlimited decimal
    // NOTE(review): `index` is computed but the value returned for this case is missing.
    case (t1, t2) if Seq(t1, t2).forall(numericPrecedence.contains) =>
      val index = numericPrecedence.lastIndexWhere(t => t == t1 || t == t2)

    case _ => None
Example 60
Source File: DatabaseInteraction.scala (from the reactive-machine-learning-systems project, MIT License; 5 votes)
package com.reactivemachinelearning

import com.couchbase.spark._
import com.reactivemachinelearning.FeatureGeneration.{IntFeature, BooleanFeature, Feature}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Example application that queries a Couchbase view through Spark and runs the
 * resulting documents through an extract step followed by a transform step.
 *
 * NOTE(review): the bodies of `extract` and `transform` and the closing braces
 * are missing in this listing; part of the configuration chain may be missing too.
 */
object DatabaseInteraction extends App {

  // Configure Spark
  // Sets the key "com.couchbase.bucket.default" to an empty string.
  val conf = new SparkConf()
    .set("com.couchbase.bucket.default", "")

  // Generate The Context
  val sc = new SparkContext(conf)

  // Documents returned by the view "by_squawk_id" of design document "squawks".
  val rawSquawks: RDD[JsonDocument] = sc.couchbaseView(
    ViewQuery.from("squawks", "by_squawk_id"))


  /** Turns raw documents into integer features (body not visible in this listing). */
  def extract(rawSquawks: RDD[JsonDocument]): RDD[IntFeature] = {

  /** Turns integer features into boolean features (body not visible in this listing). */
  def transform(inputFeatures: RDD[IntFeature]): RDD[BooleanFeature] = {

  // Extract first, then transform.
  val trainableFeatures = transform(extract(rawSquawks))
Example 61
Source File: TestableQueueInputDStream.scala (from the SparkUnitTestingExamples project, Apache License 2.0; 5 votes)
package org.apache.spark.streaming

import{ObjectInputStream, ObjectOutputStream}

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.dstream.InputDStream

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

/**
 * Input stream for tests whose batches are taken from an in-memory queue of RDDs.
 *
 * @param ssc        streaming context passed to `InputDStream`
 * @param queue      queue of RDDs to serve as batches
 * @param oneAtATime when true and the queue is non-empty, a single RDD is
 *                   dequeued per batch; otherwise the whole queue is appended
 *                   to the batch buffer
 * @param defaultRDD checked against null when the buffer is empty
 *
 * NOTE(review): several branch bodies and the closing braces are missing in
 * this listing, so the values actually returned by `compute` are not visible.
 */
class TestableQueueInputDStream[T: ClassTag](
                                              ssc: StreamingContext,
                                              val queue: Queue[RDD[T]],
                                              oneAtATime: Boolean,
                                              defaultRDD: RDD[T]
                                              ) extends InputDStream[T](ssc) {

  // Empty implementation.
  override def start() { }

  // Empty implementation.
  override def stop() { }

  // Logs a warning that checkpointing is unsupported.
  private def readObject(in: ObjectInputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")

  // Logs a warning that checkpointing is unsupported.
  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")

  /** Produces the RDD for the batch at `validTime` from the queued RDDs. */
  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    // The queue is read while holding its monitor.
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        buffer ++= queue
    if (buffer.nonEmpty) {
      // NOTE(review): the result of the oneAtATime branch is missing in this listing.
      if (oneAtATime) {
      } else {
        // NOTE(review): the first constructor argument is missing in this listing.
        Some(new UnionRDD(, buffer.toSeq))
    } else if (defaultRDD != null) {
    } else {

Example 62
Source File: StreamingUnitTest.scala (from the SparkUnitTestingExamples project, Apache License 2.0; 5 votes)

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}

import scala.collection.mutable.Queue

/**
 * Streaming word-count test driven by four hand-built batches, checking the
 * running total for the word "a" after each batch.
 *
 * NOTE(review): the listing is truncated. Nothing visible adds the batch RDDs
 * to `queue`, starts or stops the streaming context, or shuts down in
 * `afterAll`; the receiver of the `batchTotals` mapping and several branch
 * results and closing braces are also missing.
 */
class StreamingUnitTest extends FunSuite with
BeforeAndAfterEach with BeforeAndAfterAll{

  @transient var sc: SparkContext = null
  @transient var ssc: StreamingContext = null

  // Creates a local two-thread Spark context and a streaming context with 200 ms batches.
  override def beforeAll(): Unit = {

    // Not referenced again in the visible code.
    val envMap = Map[String,String](("Xmx", "512m"))

    val sparkConfig = new SparkConf()
    sparkConfig.set("spark.broadcast.compress", "false")
    sparkConfig.set("spark.shuffle.compress", "false")
    sparkConfig.set("spark.shuffle.spill.compress", "false")
    // NOTE(review): the configuration key is an empty string here; presumably garbled — confirm.
    sparkConfig.set("", "lzf")
    sc = new SparkContext("local[2]", "unit test", sparkConfig)
    ssc = new StreamingContext(sc, Milliseconds(200))

  // NOTE(review): body missing in this listing.
  override def afterAll(): Unit = {

  test("Streaming word count") {

    // "a" occurs in the first, second and fourth batch but not in the third.
    val firstBatchRDD = sc.parallelize(Seq("a", "b", "c"))
    val secondBatchRDD = sc.parallelize(Seq("a", "e"))
    val thirdBatchRDD = sc.parallelize(Seq("b", "c", "e", "f"))
    val forthBatchRDD = sc.parallelize(Seq("a", "e"))

    val queue = new Queue[RDD[String]]



    val startTime = System.currentTimeMillis()

    // oneAtATime = true; the default RDD is an empty single-partition RDD.
    val dstream = new TestableQueueInputDStream(ssc, queue, true, sc.makeRDD(Seq[String](), 1))


    // Per-batch counts: each record becomes (word, 1) and counts are summed per key.
    val batchTotals:DStream[(String, Int)] = => (r, 1)).reduceByKey(_ + _)

    // Running totals: when new counts arrive, their sum is added to any previous state.
    val streamTotals = batchTotals.updateStateByKey(
      (seq:Seq[Int], opt:Option[Int]) => {
        if (!seq.isEmpty) {
          val totalCountForNew = seq.reduce(_ + _)
          if (opt.isEmpty) {
          } else {
            Option(opt.get + totalCountForNew)
        } else {

    streamTotals.foreachRDD(rdd => {



    val endTime = System.currentTimeMillis()

    // State RDDs generated between the two timestamps.
    val rddList = streamTotals.slice(new Time(startTime), new Time(endTime))

    // Running total for "a" after each of the four batches: 1, 2, 2, 3.
    assert(rddList(0).collect().filter(r => r._1.equals("a"))(0)._2 == 1)
    assert(rddList(1).collect().filter(r => r._1.equals("a"))(0)._2  == 2)
    assert(rddList(2).collect().filter(r => r._1.equals("a"))(0)._2  == 2)
    assert(rddList(3).collect().filter(r => r._1.equals("a"))(0)._2  == 3)
Example 63
Source File: SparkCassRDDFunctions.scala (from the Spark2Cassandra project, Apache License 2.0; 5 votes)
package com.github.jparkie.spark.cassandra.rdd

import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.mapper.ColumnMapper
import com.datastax.spark.connector.writer.{ DefaultRowWriter, RowWriterFactory }
import com.datastax.spark.connector.{ AllColumns, ColumnSelector }
import com.github.jparkie.spark.cassandra.SparkCassBulkWriter
import com.github.jparkie.spark.cassandra.conf.{ SparkCassServerConf, SparkCassWriteConf }
import org.apache.spark.rdd.RDD

import scala.reflect.runtime.universe._

  /**
   * Bulk-writes the wrapped RDD by running `sparkCassBulkWriter.write` as a
   * Spark job over `rdd`.
   *
   * @param keyspaceName        target keyspace name
   * @param tableName           target table name
   * @param columns             column selection; defaults to `AllColumns`
   * @param sparkCassWriteConf  write settings; by default derived from the Spark configuration
   * @param sparkCassServerConf server settings; by default derived from the Spark configuration
   * @param connector           Cassandra connector; by default built from the Spark configuration
   * @param rwf                 row writer factory; defaults to `DefaultRowWriter.factory[T]`
   *
   * NOTE(review): the listing is garbled. There is no separator between
   * `sparkCassServerConf` and `connector` (presumably a second, implicit
   * parameter list started there — confirm), the arguments of
   * `SparkCassBulkWriter(` are missing, and the enclosing class that defines
   * `rdd`, `T` and `internalSparkContext` is not visible.
   */
  def bulkLoadToCass(
    keyspaceName:        String,
    tableName:           String,
    columns:             ColumnSelector      = AllColumns,
    sparkCassWriteConf:  SparkCassWriteConf  = SparkCassWriteConf.fromSparkConf(internalSparkContext.getConf),
    sparkCassServerConf: SparkCassServerConf = SparkCassServerConf.fromSparkConf(internalSparkContext.getConf)
    connector: CassandraConnector = CassandraConnector(internalSparkContext.getConf),
    rwf: RowWriterFactory[T] = DefaultRowWriter.factory[T]): Unit = {
    val sparkCassBulkWriter = SparkCassBulkWriter(

    // Runs the writer's `write` function as a Spark job over `rdd`.
    internalSparkContext.runJob(rdd, sparkCassBulkWriter.write _)
Example 64
Source File: PointCloudRelation.scala (from the geotrellis-pointcloud project, Apache License 2.0; 5 votes)
package geotrellis.pointcloud.spark.datasource

import{Options => HadoopOptions}
import geotrellis.pointcloud.util.Filesystem
import geotrellis.proj4.CRS
import geotrellis.vector.Extent

import cats.implicits._
import io.pdal._
import io.circe.syntax._
import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SQLContext}


import scala.collection.JavaConverters._

// This class has to be serializable since it is shipped over the network.
/**
 * Spark SQL relation over point-cloud files at `path`, read through
 * `HadoopPointCloudRDD`.
 *
 * @param sqlContext SQL context; its Spark context is exposed as the implicit `sc`
 * @param path       location of the point-cloud data; paths starting with "s3"
 *                   or "hdfs" are copied to a local temporary directory while
 *                   the schema is derived
 * @param options    options forwarded to `HadoopPointCloudRDD`
 *
 * NOTE(review): the listing is truncated. The right-hand sides of
 * `localPipeline`, `pointCloud` and `md`, the value returned from `schema`, part
 * of the `buildScan` expression, and closing braces are missing.
 */
class PointCloudRelation(
  val sqlContext: SQLContext,
  path: String,
  options: HadoopOptions
) extends BaseRelation with TableScan with Serializable {

  // Marked @transient and lazy: excluded from serialization and resolved on first access.
  @transient implicit lazy val sc: SparkContext = sqlContext.sparkContext

  // TODO: switch between HadoopPointCloudRDD and S3PointCloudRDD
  lazy val isS3: Boolean = path.startsWith("s3")

  /** Derives the schema of the relation by inspecting the data at `path`. */
  override def schema: StructType = {
    // `local` records whether a temporary local copy was made; `fixedPath` is the
    // path to read from (the local copy, or `path` itself).
    lazy val (local, fixedPath) =
      if(path.startsWith("s3") || path.startsWith("hdfs")) {
        val tmpDir = Filesystem.createDirectory()
        val remotePath = new Path(path)
        // copy remote file into local tmp dir
        val localPath = new File(tmpDir, remotePath.getName)
        HdfsUtils.copyPath(remotePath, new Path(s"file:///${localPath.getAbsolutePath}"), sc.hadoopConfiguration)
        (true, localPath.toString)
      } else (false, path)

    // Pipeline JSON whose "filename" field is replaced by `fixedPath`.
    // NOTE(review): the expression this chain starts from is missing.
    val localPipeline =
        .downField("filename").withFocus(_ => fixedPath.asJson)

    val pl = Pipeline(localPipeline.noSpaces)
    // The pipeline is executed only when validation succeeds.
    if (pl.validate()) pl.execute()
    val pointCloud = try {
    } finally {
      // Deletes the temporary local copy and prints whether the deletion succeeded.
      if(local) println(new File(fixedPath).delete)

    val rdd = HadoopPointCloudRDD(new Path(path), options)

    // Reduces per-file values into one (extent, CRS) pair: extents are combined,
    // the CRS of the left operand is kept.
    val md: (Option[Extent], Option[CRS]) =
        .map { case (header, _) => (, }
        .reduce { case ((e1, c), (e2, _)) => ((e1, e2).mapN(_ combine _), c) }

    // The combined pair is stored as a JSON string under the key "metadata".
    val metadata = new MetadataBuilder().putString("metadata", md.asJson.noSpaces).build


  /** Scans all point clouds under `path`, building `Row`s from their contents. */
  override def buildScan(): RDD[Row] = {
    val rdd = HadoopPointCloudRDD(new Path(path), options)
    rdd.flatMap { _._2.flatMap { pc => { k => Row(k: _*) } } }
Example 65
Source File: PointCloudToDem.scala (from the geotrellis-pointcloud project, Apache License 2.0; 5 votes)
package geotrellis.pointcloud.spark.dem

import io.pdal._
import geotrellis.layer._
import geotrellis.raster._
import geotrellis.spark._
import geotrellis.util._
import geotrellis.vector._

import org.apache.spark.rdd.RDD

object PointCloudToDem {
  /**
   * Rasterizes a keyed point-cloud layer into tiles with fixed pixel dimensions.
   *
   * Delegates to the curried overload, building each key's `RasterExtent` from
   * the key's extent and the requested number of columns and rows.
   *
   * @param rdd            point clouds keyed by `SpatialKey`, with layout metadata
   * @param tileDimensions (columns, rows) of every output tile
   * @param options        options forwarded to `PointToGrid`
   * @return one tile per key, with the layer's `LayoutDefinition` as metadata
   */
  def apply[M: GetComponent[*, LayoutDefinition]](
    rdd: RDD[(SpatialKey, PointCloud)] with Metadata[M],
    tileDimensions: (Int, Int),
    options: PointToGrid.Options
  ): RDD[(SpatialKey, Tile)] with Metadata[LayoutDefinition] = {
    val (cols, rows) = tileDimensions
    apply[M](rdd, options) { extent => RasterExtent(extent, cols, rows) }
  }

  /**
   * Rasterizes a keyed point-cloud layer into tiles with a fixed cell size.
   *
   * Delegates to the curried overload, building each key's `RasterExtent` from
   * the key's extent and `cellSize`.
   *
   * @param rdd      point clouds keyed by `SpatialKey`, with layout metadata
   * @param cellSize cell size used for every output raster
   * @param options  options forwarded to `PointToGrid`
   * @return one tile per key, with the layer's `LayoutDefinition` as metadata
   */
  def apply[M: GetComponent[*, LayoutDefinition]](
    rdd: RDD[(SpatialKey, PointCloud)] with Metadata[M],
    cellSize: CellSize,
    options: PointToGrid.Options
  ): RDD[(SpatialKey, Tile)] with Metadata[LayoutDefinition] = {
    val toRasterExtent: Extent => RasterExtent = extent => RasterExtent(extent, cellSize)
    apply[M](rdd, options)(toRasterExtent)
  }

  /**
   * Rasterizes a keyed point-cloud layer, using `createRE` to choose the
   * `RasterExtent` for each key's extent.
   *
   * @param rdd      point clouds keyed by `SpatialKey`, with layout metadata
   * @param options  options forwarded to `PointToGrid.createRaster`
   * @param createRE builds the target `RasterExtent` from a key's extent
   * @return one tile per key, wrapped with the layer's `LayoutDefinition`
   *
   * NOTE(review): the listing is truncated. The expression that `.mapPartitions`
   * is called on, the first argument of `createRaster`, and closing braces are
   * missing.
   */
  def apply[M: GetComponent[*, LayoutDefinition]](rdd: RDD[(SpatialKey, PointCloud)] with Metadata[M], options: PointToGrid.Options)(createRE: Extent => RasterExtent): RDD[(SpatialKey, Tile)] with Metadata[LayoutDefinition] = {
    val layoutDefinition = rdd.metadata.getComponent[LayoutDefinition]
    val mapTransform = layoutDefinition.mapTransform

    val result =
        .mapPartitions({ partition =>
 { case (key, neighbors) =>
            // Extent covered by this key according to the layout.
            val extent = mapTransform(key)
            val raster =
              PointToGrid.createRaster(, createRE(extent), options)
            (key, raster.tile)
        }, preservesPartitioning = true)

    ContextRDD(result, layoutDefinition)
Example 66
Source File: BufferUnionable.scala (from the geotrellis-pointcloud project, Apache License 2.0; 5 votes)
package geotrellis.pointcloud.spark.buffer

import geotrellis.layer._

import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

/**
 * Combines each keyed value with the values of its eight spatial neighbours by
 * calling `union` on them.
 *
 * NOTE(review): the listing is truncated. The RDD the chain starts from, the
 * `else` of the centre test, the step that groups records per target key, the
 * collection `resultValue` is mapped over, and closing braces are missing.
 */
object BufferUnionable {

  /**
   * @tparam K key type with a spatial component
   * @tparam X structural type providing `union`
   * @tparam V value type viewable as `X`
   * @param rdd keyed values to buffer
   * @return for each key that had a centre record, the union of the values gathered for it
   */
  def apply[
    K: SpatialComponent,
    X <: { def union(other: Any): V },
    V: (? => X) : ClassTag
  ](rdd: RDD[(K, V)]): RDD[(K, V)] = {
      .flatMap({ case (key, data) =>
        val SpatialKey(col, row) = key

        // Emit the record once for each cell of the 3x3 neighbourhood around it; the
        // Boolean marks the copy addressed to the record's own cell.
        for (deltaX <- -1 to +1; deltaY <- -1 to +1) yield {
          if (deltaX == 0 && deltaY == 0)
            (SpatialKey(col + deltaX, row + deltaY), (key, data, true))
            (SpatialKey(col + deltaX, row + deltaY), (key, data, false))
      // Keep only target cells that received a centre record.
      .filter({ case (_, seq) => seq.exists { case (_, _, center) => center } })
      .map({ case (sortKey, seq) =>
        // The output key is the original key of the centre record.
        val resultKey = seq.filter({ case (_, _, center) => center }).head._1
        val resultValue ={ case (_, data, _) => data }).reduce(_ union _)

        (resultKey, resultValue)

Example 67
Source File: HadoopPointCloudRDD.scala (from the geotrellis-pointcloud project, Apache License 2.0; 5 votes)

import geotrellis.vector.Extent

import io.circe.Json
import io.pdal._
import io.pdal.pipeline._
import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

  /**
   * Creates an RDD of point clouds, paired with their headers, from the files
   * under `path`.
   *
   * @param path    input directory
   * @param options reader options: file extensions, optional temporary
   *                directory, optional dimension types, pipeline and optional
   *                filter extent
   * @param sc      Spark context supplying the Hadoop configuration
   *
   * NOTE(review): the listing is truncated. The enclosing object, the RDD
   * construction in both branches, the filter predicate and closing braces are
   * missing.
   */
  def apply(path: Path, options: Options = Options.DEFAULT)(implicit sc: SparkContext): RDD[(HadoopPointCloudHeader, List[PointCloud])] = {
    val conf = sc.hadoopConfiguration.withInputDirectory(path, options.filesExtensions)

    // Optional settings are applied only when present; the pipeline is always set.
    options.tmpDir.foreach(PointCloudInputFormat.setTmpDir(conf, _))
    options.dimTypes.foreach(PointCloudInputFormat.setDimTypes(conf, _))
    PointCloudInputFormat.setPipeline(conf, options.pipeline)

    options.filterExtent match {
      case Some(filterExtent) =>
        // The filter extent is stored in the configuration and the resulting
        // records are additionally filtered by header.
        PointCloudInputFormat.setFilterExtent(conf, filterExtent)

        ).filter { case (header, _) =>

      case None =>
Example 68
Source File: S3PointCloudRDD.scala (from the geotrellis-pointcloud project, Apache License 2.0; 5 votes)

import geotrellis.vector.Extent
import io.circe._
import io.pdal._
import io.pdal.pipeline._
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

  /**
   * Creates an RDD of point clouds, paired with their headers, from the S3
   * objects under `bucket`/`prefix`.
   *
   * @param bucket  S3 bucket name
   * @param prefix  key prefix within the bucket
   * @param options reader options: file extensions, S3 client factory, optional
   *                partition count and partition size, optional temporary
   *                directory, optional dimension types, pipeline and optional
   *                filter extent
   * @param sc      Spark context supplying the Hadoop configuration
   *
   * NOTE(review): the listing is truncated. The enclosing object, the RDD
   * construction in both branches and closing braces are missing.
   */
  def apply(bucket: String, prefix: String, options: Options = Options.DEFAULT)(implicit sc: SparkContext): RDD[(S3PointCloudHeader, List[PointCloud])] = {
    // The setters below are applied directly to `sc.hadoopConfiguration`.
    val conf = sc.hadoopConfiguration

    S3InputFormat.setBucket(conf, bucket)
    S3InputFormat.setPrefix(conf, prefix)
    S3InputFormat.setExtensions(conf, options.filesExtensions)
    S3InputFormat.setCreateS3Client(conf, options.getClient)
    // Partitioning hints are applied only when present.
    options.numPartitions.foreach(S3InputFormat.setPartitionCount(conf, _))
    options.partitionBytes.foreach(S3InputFormat.setPartitionBytes(conf, _))

    options.tmpDir.foreach(PointCloudInputFormat.setTmpDir(conf, _))
    options.dimTypes.foreach(PointCloudInputFormat.setDimTypes(conf, _))
    PointCloudInputFormat.setPipeline(conf, options.pipeline)

    options.filterExtent match {
      case Some(filterExtent) =>
        PointCloudInputFormat.setFilterExtent(conf, filterExtent)

        // Keep only records whose header has a 3D extent intersecting the filter extent.
        ).filter { case (header, _) => header.extent3D.exists(_.toExtent.intersects(filterExtent)) }
      case None =>
Example 69
Source File: MlLibOnKudu.scala (from the Taxi360 project, Apache License 2.0; 5 votes)
package com.hadooparchitecturebook.taxi360.etl.machinelearning.kudu

import com.hadooparchitecturebook.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Command-line job that runs MLlib KMeans, a Pearson correlation matrix and column statistics
 * over an `RDD[Vector]` derived from the `ny_taxi_trip_tmp` SQL table.
 *
 * NOTE(review): this listing is incomplete — closing braces, the Kudu DataFrame load that should
 * follow `kuduOptions`, and the row-to-vector conversion inside `vectorRDD` are missing.
 */
object MlLibOnKudu {
  /**
   * Entry point.
   *
   * @param args `<runLocal> <kuduMaster> <taxiTable> <numOfCenters> <numOfIterations>`
   */
  def main(args: Array[String]): Unit = {

    // NOTE(review): only the zero-argument case prints usage; with 1-4 arguments the
    // positional reads below would throw. Confirm whether a return/exit was lost here.
    if (args.length == 0) {
      println("Args: <runLocal> " +
        "<kuduMaster> " +
        "<taxiTable> " +
        "<numOfCenters> " +
        "<numOfIterations> ")

    // "l" (case-insensitive) selects the local-mode SparkContext below.
    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val taxiTable = args(2)
    val numOfCenters = args(3).toInt
    val numOfIterations = args(4).toInt

    val sc: SparkContext = if (runLocal) {
      // Local mode: broadcast, shuffle and spill compression are all switched off.
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)

    val sqlContext = new SQLContext(sc)

    // NOTE(review): the statement using these options is garbled; it appears to load the Kudu
    // table through the "org.apache.kudu.spark.kudu" data source — confirm against upstream.
    val kuduOptions = Map(
      "kudu.table" -> taxiTable,
      "kudu.master" -> kuduMaster)"org.apache.kudu.spark.kudu").load.

    // NOTE(review): the body converting each row to a Vector is missing from this listing.
    val vectorRDD:RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => {
      val taxiTrip =

    println("--Running KMeans")
    val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations)
    println(" > vector centers:")
    clusters.clusterCenters.foreach(v => println(" >> " + v))

    println("--Running corr")
    val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson")
    println(" > corr: " + correlMatrix.toString)

    println("--Running colStats")
    val colStats = Statistics.colStats(vectorRDD)
    println(" > max: " + colStats.max)
    println(" > count: " + colStats.count)
    println(" > mean: " + colStats.mean)
    println(" > min: " + colStats.min)
    println(" > normL1: " + colStats.normL1)
    println(" > normL2: " + colStats.normL2)
    println(" > numNonZeros: " + colStats.numNonzeros)
    println(" > variance: " + colStats.variance)

    //Labeled Points
Example 70
Source File: SolRSupport.scala    From Taxi360   with Apache License 2.0 5 votes vote down vote up
package com.hadooparchitecturebook.taxi360.streaming.ingestion.solr

import{ConnectException, SocketException}
import java.util

import org.apache.solr.client.solrj.impl.CloudSolrServer
import org.apache.solr.client.solrj.request.UpdateRequest
import org.apache.solr.common.{SolrException, SolrInputDocument}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream

/**
 * Helpers for indexing `SolrInputDocument`s from Spark RDDs / DStreams into Solr in batches.
 *
 * NOTE(review): this listing is incomplete — `collection` and `batchSize` are used but their
 * parameter declarations are missing, as are most statement bodies and all closing braces.
 */
object SolRSupport {
  /** Indexes every RDD of the stream by delegating to `indexDoc`. */
  def indexDStreamOfDocs(zkHost:String,
                         docDStream:DStream[SolrInputDocument]): Unit ={
    docDStream.foreachRDD(docRdd => {
      indexDoc(zkHost, collection, batchSize, docRdd)

  /**
   * Indexes an RDD partition by partition: documents are accumulated in `batch`, which is sent
   * when it reaches `batchSize` and once more at the end if anything is left.
   */
  def indexDoc(zkHost:String,
               docRdd:RDD[SolrInputDocument]): Unit = {
    docRdd.foreachPartition(it => {
      // NOTE(review): the expression creating the Solr client is missing from this listing.
      val solrServer =

      val batch = new util.ArrayList[SolrInputDocument]()

      while (it.hasNext) {
        // NOTE(review): reading the next document and adding it to `batch` is missing here.
        val inputDoc =
        if (batch.size() >= batchSize)
          sendBatchToSolr(solrServer, collection, batch)
      // Flush the final, partially filled batch.
      if (!batch.isEmpty())
        sendBatchToSolr(solrServer, collection, batch)

  /**
   * Sends one batch as an `UpdateRequest` addressed to `collection`.
   *
   * On failure, `shouldRetry` decides between a retry path and rethrowing; checked exceptions
   * are wrapped in `RuntimeException`, runtime exceptions are rethrown as they are.
   * NOTE(review): the request submission, the retry wait and the `finally` body are missing.
   */
  def sendBatchToSolr( solrServer: CloudSolrServer,
                       batch:util.Collection[SolrInputDocument]) {
    val req = new UpdateRequest()
    req.setParam("collection", collection)

    try {
    } catch  {
      case e:Exception => {
        if (shouldRetry(e)) {
          try {
          } catch {
            case e1: InterruptedException => {

          try {
          } catch {
            case e1: Exception => {

              if (e1.isInstanceOf[RuntimeException]) {
                throw e1.asInstanceOf[RuntimeException]
              } else {
                throw new RuntimeException(e1)
        } else {
          if (e.isInstanceOf[RuntimeException]) {
            throw e.asInstanceOf[RuntimeException]
          } else {
            throw new RuntimeException(e)
    } finally {

  /**
   * Decides from the root cause of `exc` whether the send should be retried.
   * NOTE(review): the condition is cut off after the first `||`; only the `ConnectException`
   * check is visible.
   */
  def shouldRetry( exc:Exception): Boolean = {
    val rootCause = SolrException.getRootCause(exc)
    rootCause.isInstanceOf[ConnectException] ||
Example 71
Source File: HBaseSQLTableScan.scala    From Backup-Repo   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hbase.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical.RangePartitioning
import org.apache.spark.sql.execution.LeafNode
import org.apache.spark.sql.hbase._

/**
 * Physical leaf operator that exposes an already-built RDD of rows for an HBase relation.
 *
 * @param relation the HBase relation being scanned; supplies the partition keys and partitions
 * @param output   the attributes produced by this operator
 * @param result   the rows returned unchanged by `doExecute`
 */
case class HBaseSQLTableScan(
                              relation: HBaseRelation,
                              output: Seq[Attribute],
                              result: RDD[Row]) extends LeafNode {
  /**
   * Range partitioning over the relation's partition keys, each in ascending order, with one
   * range per partition of the relation.
   */
  override def outputPartitioning = {
    // One ascending sort order per partition key, in key order.
    val ordering = relation.partitionKeys.map(key => SortOrder(key, Ascending)).toList
    RangePartitioning(ordering, relation.partitions.size)
  }

  /** Returns the precomputed `result` RDD as is. */
  override protected def doExecute(): RDD[Row] = result
}
Example 72
Source File: HBaseShuffledRDD.scala    From Backup-Repo   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hbase

import org.apache.spark._
import org.apache.spark.rdd.{RDD, ShuffledRDD, ShuffledRDDPartition}

/**
 * A `ShuffledRDD` over raw HBase key/value rows that can optionally be given explicit HBase
 * partitions.
 *
 * NOTE(review): this listing is incomplete — the `else` body of `getPartitions`, the `if` body
 * and the rest of the `else` body of `getPreferredLocations`, and the closing braces are missing.
 *
 * @param prevRdd      the parent RDD being shuffled
 * @param part         the partitioner used for the shuffle
 * @param hbPartitions optional explicit partitions; empty or null selects the default layout
 */
class HBaseShuffledRDD (
    prevRdd: RDD[(HBaseRawType, Array[HBaseRawType])],
    part: Partitioner,
    @transient hbPartitions: Seq[HBasePartition] = Nil) extends ShuffledRDD(prevRdd, part){

  /**
   * Without explicit HBase partitions, creates one `ShuffledRDDPartition` per partition of the
   * partitioner.
   */
  override def getPartitions: Array[Partition] = {
    if (hbPartitions==null || hbPartitions.isEmpty) {
      Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
    } else {
      // only to be invoked by clients

  /** Preferred locations of `split`; the explicit-partition branch casts it to `HBasePartition`. */
  override def getPreferredLocations(split: Partition): Seq[String] = {
    if (hbPartitions==null || hbPartitions.isEmpty) {
    } else {
      split.asInstanceOf[HBasePartition] {
Example 73
Source File: RDFS11.scala    From SparkSRE   with Apache License 2.0 5 votes vote down vote up
package com.hj.examples

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Spark job that reads whitespace-separated triples, keeps the `rdfs:subClassOf` statements and
 * prints the transitive closure of that relation.
 */
object RDFS11 {
  /**
   * Computes the transitive closure of the binary relation stored in `rdd`.
   *
   * Each round joins the original edges `(a, b)` with the pairs `(b, c)` found so far to derive
   * `(a, c)`, and stops once a round no longer increases the number of distinct pairs.
   *
   * @param rdd pairs `(subject, object)` of the relation
   * @return the input pairs plus every pair derived by transitivity
   */
  def transitive(rdd: RDD[(String, String)]): RDD[(String, String)] = {
    // Key the original edges by their target so they join with pairs that start there.
    val reverseTuple = rdd.map(x => (x._2, x._1))

    var rddTuple = rdd
    var pre = rddTuple.count
    var flag = true
    while (flag) {
      val joined = reverseTuple.join(rddTuple)
      val res = joined.map(x => x._2)
      rddTuple = rddTuple.union(res).distinct
      val cur = rddTuple.count
      // A fixed point is reached when a round adds no new pair.
      flag = pre != cur
      pre = cur
    }
    rddTuple
  }

  /**
   * Entry point.
   *
   * @param args `<input_path> <output_path>`
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      System.out.println("Arguments are invalid! \nExample: <input_path> <output_path>")
      // Stop here; continuing would read missing positional arguments.
      sys.exit(1)
    }
    val inputPath = args(0)
    // NOTE(review): the output path is required but no save step is visible; confirm upstream.
    val outputPath = args(1)

    val conf = new SparkConf().setAppName("RDFS11").setMaster("local[2]")
    val sc = new SparkContext(conf)

    val lines = sc.textFile(inputPath)

    // Each line is expected to hold "subject predicate object" separated by single spaces.
    val triples = lines.map(x => {
      val arr = x.split(" ")
      (arr(0), arr(1), arr(2))
    })

    val subClass = transitive(
      triples.filter(x => x._2.equals("rdfs:subClassOf")).map(x => (x._1, x._3)))

    subClass.foreach(x => println(x))
  }
}
Example 74
Source File: RDFS5.scala    From SparkSRE   with Apache License 2.0 5 votes vote down vote up
package com.hj.examples

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Spark job that reads whitespace-separated triples, keeps the `rdfs:subPropertyOf` statements
 * and prints the transitive closure of that relation.
 */
object RDFS5 {
  /**
   * Computes the transitive closure of the binary relation stored in `rdd`.
   *
   * Each round joins the original edges `(a, b)` with the pairs `(b, c)` found so far to derive
   * `(a, c)`, and stops once a round no longer increases the number of distinct pairs.
   *
   * @param rdd pairs `(subject, object)` of the relation
   * @return the input pairs plus every pair derived by transitivity
   */
  def transitive(rdd: RDD[(String, String)]): RDD[(String, String)] = {
    // Key the original edges by their target so they join with pairs that start there.
    val reverseTuple = rdd.map(x => (x._2, x._1))

    var rddTuple = rdd
    var pre = rddTuple.count
    var flag = true
    while (flag) {
      val joined = reverseTuple.join(rddTuple)
      val res = joined.map(x => x._2)
      rddTuple = rddTuple.union(res).distinct
      val cur = rddTuple.count
      // A fixed point is reached when a round adds no new pair.
      flag = pre != cur
      pre = cur
    }
    rddTuple
  }

  /**
   * Entry point.
   *
   * @param args `<input_path> <output_path>`
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      System.out.println("Arguments are invalid! \nExample: <input_path> <output_path>")
      // Stop here; continuing would read missing positional arguments.
      sys.exit(1)
    }
    val inputPath = args(0)
    // NOTE(review): the output path is required but no save step is visible; confirm upstream.
    val outputPath = args(1)

    val conf = new SparkConf().setAppName("RDFS5").setMaster("local[2]")
    val sc = new SparkContext(conf)

    val lines = sc.textFile(inputPath)

    // Each line is expected to hold "subject predicate object" separated by single spaces.
    val triples = lines.map(x => {
      val arr = x.split(" ")
      (arr(0), arr(1), arr(2))
    })

    val subProp = transitive(
      triples.filter(x => x._2.equals("rdfs:subPropertyOf")).map(x => (x._1, x._3)))

    subProp.foreach(x => println(x))
  }
}
Example 75
Source File: DFConverter.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql

import com.twosigma.flint.rdd.OrderedRDD
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.StructType

/**
 * DataFrame construction helpers. The object is declared in package `org.apache.spark.sql`,
 * presumably so it can reach the package-private members it calls (`logicalPlan`,
 * `internalCreateDataFrame`).
 */
object DFConverter {

  /**
   * Builds a new DataFrame from the session, logical plan and schema of `df`.
   *
   * @param df the DataFrame to rebuild
   * @return a DataFrame with a row encoder created from `df.schema`
   */
  def newDataFrame(df: DataFrame): DataFrame = {
    new DataFrame(df.sparkSession, df.logicalPlan, RowEncoder(df.schema))
  }

  /**
   * Creates a DataFrame from the values of an ordered RDD; the `Long` keys are dropped.
   *
   * @param rdd    ordered RDD whose values are internal rows
   * @param schema schema describing the internal rows
   */
  def toDataFrame(rdd: OrderedRDD[Long, InternalRow], schema: StructType): DataFrame = {
    val spark = SparkSession.builder().getOrCreate()
    val internalRows = rdd.values
    spark.internalCreateDataFrame(internalRows, schema)
  }

  /**
   * Creates a DataFrame directly from an RDD of internal rows.
   *
   * @param rdd    RDD of internal rows
   * @param schema schema describing the internal rows
   */
  def toDataFrame(rdd: RDD[InternalRow], schema: StructType): DataFrame = {
    val spark = SparkSession.builder().getOrCreate()
    spark.internalCreateDataFrame(rdd, schema)
  }
}

Example 76
Source File: WeightedLabeledPoint.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.math.stats.regression

import breeze.linalg.DenseVector
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext

/**
 * A regression sample: a label, the weight of the sample and its feature vector.
 *
 * NOTE(review): this listing is incomplete — `generateSampleData` probably belongs to a
 * companion object whose header is missing, the call that maps over the random vectors and the
 * first term of the label expression are stripped, and the closing braces are absent.
 */
case class WeightedLabeledPoint(label: Double, weight: Double, features: DenseVector[Double]) {
  /**
   * Generates random sample points from normally distributed vectors.
   *
   * @param sc            Spark context used to create the random RDD
   * @param weights       coefficient vector; its length fixes the number of features
   * @param intercept     constant added to each label
   * @param numRows       number of points to generate
   * @param numPartitions number of partitions of the generated RDD
   * @param errorScalar   multiplier applied to the noise entry added to the label
   * @param seed          seed passed to the random generator
   */
  def generateSampleData(sc: SparkContext, weights: DenseVector[Double], intercept: Double,
    numRows: Long = 100L, numPartitions: Int = 4, errorScalar: Double = 1.0,
    seed: Long = 1L): RDD[WeightedLabeledPoint] = {
    // Two extra columns are drawn per row in addition to the features.
    val len = weights.length + 2
    // The last entry will serve as the weight of point and the second last entry will serve
    // as noisy of the label.
    val data = RandomRDDs.normalVectorRDD(sc, numRows, len, numPartitions, seed) { d =>
      val fw = d.toArray
      // Features are all entries except the trailing noise and weight columns.
      val x = new DenseVector(fw.dropRight(2))
      // The point's weight is |last entry| + 0.5, so it is always at least 0.5.
      WeightedLabeledPoint( + intercept + errorScalar * fw(len - 2),
        Math.abs(fw(len - 1)) + 0.5, x
Example 77
Source File: OLSMultipleLinearRegression.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.math.stats.regression

import org.apache.spark.rdd.RDD
import breeze.linalg.{ DenseMatrix, DenseVector }

/**
 * Weighted multiple linear regression computed with a single `treeAggregate` pass over the input.
 *
 * NOTE(review): this listing is incomplete — both `else` branches, the `)(` separating the zero
 * value from `seqOp`, and several closing braces/parentheses are missing.
 */
object OLSMultipleLinearRegression {

  /**
   * Aggregates the sufficient statistics of `input` and builds a `LinearRegressionModel`.
   *
   * @param input     weighted, labelled points
   * @param intercept whether a constant 1.0 column is prepended to the features
   */
  def regression(input: RDD[WeightedLabeledPoint], intercept: Boolean = true): LinearRegressionModel = {
    // Try to get the number of columns
    val nCols = if (intercept) {
      input.first.features.length + 1
    } else {

    val (xx, xy, swx, srwsl, ssrw, wsl, sw, n, lw) = input.treeAggregate((
      new DenseMatrix[Double](nCols, nCols), // 1. Calculate a k-by-k matrix X^TX.
      new DenseVector[Double](nCols), // 2. Calculate a k-dimension vector X^Ty.
      new DenseVector[Double](nCols), // 3. Calculate a k-dimension vector of weighted sum of X.
      0.0, // 4. Calculate the square root weighted sum of labels.
      0.0, // 5. Calculate the sum of square root of weights.
      0.0, // 6. Calculate the weighted sum of labels.
      0.0, // 7. Calculate the sum of weights.
      0: Long, // 8. Calculate the length of input.
      0.0 // 9. Calculate sum of log weights
      // U is a pair of matrix and vector and v is a WeightedLabeledPoint.
      seqOp = (U, v) => {
      // Append 1.0 at the head for calculating intercept.
      val x = if (intercept) {
        DenseVector.vertcat(DenseVector(1.0), v.features)
      } else {
      val wx = x * v.weight
      val sqrtW = Math sqrt v.weight
      // Unfortunately, breeze.linalg.DenseVector does not support tensor product.
      // The matrix and vector accumulators are updated in place with `+=`; the scalars are
      // recomputed with `+`.
      (U._1 += wx.asDenseMatrix.t * x.asDenseMatrix,
        U._2 += wx * v.label,
        U._3 += wx,
        U._4 + v.label * sqrtW,
        U._5 + sqrtW,
        U._6 + v.label * v.weight,
        U._7 + v.weight,
        U._8 + 1,
        U._9 + math.log(v.weight))
    }, combOp = (U1, U2) => (
      // Merge two partial results component by component, reusing U1's matrix and vectors.
      U1._1 += U2._1,
      U1._2 += U2._2,
      U1._3 += U2._3,
      U1._4 + U2._4,
      U1._5 + U2._5,
      U1._6 + U2._6,
      U1._7 + U2._7,
      U1._8 + U2._8,
      U1._9 + U2._9
    // (xx + xx.t) :/ 2.0 averages the matrix with its transpose, which makes it symmetric.
    LinearRegressionModel(input, intercept, n, (xx + xx.t) :/ 2.0, xy, swx, srwsl, ssrw, wsl, sw, lw)
Example 78
Source File: PartitionsIterator.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.rdd

import grizzled.slf4j.Logger

import org.apache.spark.rdd.RDD
import org.apache.spark.{ Partition, TaskContext }

/**
 * Factory for `PartitionsIterator` instances.
 *
 * NOTE(review): this listing is incomplete — the object's closing brace and the header of the
 * `PartitionsIterator` class are missing, so `headPartitionIndex` below (which reads `curPart`)
 * presumably belongs to that class rather than to this object.
 */
protected[flint] object PartitionsIterator {
  val logger = Logger(PartitionsIterator.getClass)

  /**
   * Creates an iterator over the given partitions of `rdd`.
   *
   * @param rdd                         the RDD whose partitions are iterated
   * @param partitions                  the partitions to iterate over
   * @param context                     the task context passed to the iterator
   * @param preservesPartitionsOrdering passed through to the iterator (flagged as a band-aid)
   */
  def apply[T](
    rdd: RDD[T],
    partitions: Seq[Partition],
    context: TaskContext,
    preservesPartitionsOrdering: Boolean = false // FIXME: This is a band-aid which should be fixed.
  ): PartitionsIterator[T] = new PartitionsIterator(rdd, partitions, context, preservesPartitionsOrdering)

  /** Index of the partition currently held in `curPart`. */
  def headPartitionIndex: Int = curPart.index
Example 79
Source File: TreeReduce.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.rdd.function.summarize

import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

/** Tree-shaped `reduce` for RDDs, built on top of `TreeAggregate`. */
object TreeReduce {

  /**
   * Reduces the elements of `rdd` with `f`, combining per-partition results with
   * `TreeAggregate` at the given depth.
   *
   * @param rdd   the RDD to reduce
   * @param f     binary operator used to combine two elements
   * @param depth depth passed to `TreeAggregate`; must be at least 1
   * @return the reduced value
   * @throws IllegalArgumentException if `depth` is less than 1
   * @throws RuntimeException         (via `sys.error`) if `rdd` has no elements
   */
  def apply[T: ClassTag](
    rdd: RDD[T]
  )(
    f: (T, T) => T,
    depth: Int = 2
  ): T = {
    require(depth >= 1, s"Depth must be greater than or equal to 1 but got $depth.")

    // Reduce one partition; an empty partition contributes None.
    val reducePartition: Iterator[T] => Option[T] = iter =>
      if (iter.hasNext) Some(iter.reduceLeft(f)) else None

    val partiallyReduced = rdd.mapPartitions(it => Iterator(reducePartition(it)))

    // Combine two optional partial results, keeping whichever side is defined.
    val op: (Option[T], Option[T]) => Option[T] = {
      case (Some(c), Some(x)) => Some(f(c, x))
      case (c, x)             => c.orElse(x)
    }

    TreeAggregate(partiallyReduced)(Option.empty[T], op, op, depth).getOrElse(
      sys.error("Empty collection.")
    )
  }
}

Example 80
Source File: PythonUtils.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.rdd

import com.twosigma.flint.timeseries.{ TimeSeriesRDD, TimeSeriesRDDImpl }
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types._
import org.apache.spark.sql.{ CatalystTypeConvertersWrapper, Row }

/**
 * Describes one schema column.
 *
 * @param idx      integer index of the column (presumably its position in the schema)
 * @param clazz    a class of `Ordered` values associated with the column
 * @param dataType the column's Spark SQL data type
 */
private[rdd] case class SchemaColumnInfo(
  idx: Int,
  clazz: Class[_ <: Ordered[_]],
  dataType: DataType
)

/**
 * Pairs a time-series RDD with a schema.
 *
 * @param rdd    the time-series RDD implementation
 * @param schema the accompanying Spark SQL schema
 */
case class TimeSeriesRDDWithSchema(
  rdd: TimeSeriesRDDImpl,
  schema: StructType
)

/**
 * Helpers for building ordered / time-series RDDs from RDDs of Spark SQL rows.
 *
 * NOTE(review): `formatRDD`, called by `fromUnsortedRDD`, is not defined in the visible part of
 * this listing; it must be supplied elsewhere for this object to compile.
 */
object PythonUtils {
  /**
   * Builds a time-series RDD from rows that are not sorted by key.
   *
   * @param sc        Spark context (not used by the visible code)
   * @param rdd       the input rows
   * @param schema    schema of the rows
   * @param keyColumn name of the column holding the `Long` key
   */
  def fromUnsortedRDD(
    sc: SparkContext,
    rdd: RDD[Row],
    schema: StructType,
    keyColumn: String
  ): TimeSeriesRDDImpl = {
    val orderedRdd = OrderedRDD.fromRDD(formatRDD[Long](rdd, schema, keyColumn), KeyPartitioningType.UnSorted)
    TimeSeriesRDD.fromOrderedRDD(orderedRdd, schema).asInstanceOf[TimeSeriesRDDImpl]
  }

  /**
   * Builds an ordered RDD keyed by the `Long` value of `keyColumn`, with each row converted to
   * its Catalyst internal representation.
   *
   * @param rdd       the input rows
   * @param schema    schema of the rows
   * @param keyColumn name of the column holding the `Long` key
   * @param ranges    key ranges passed to `OrderedRDD.fromRDD`
   */
  def toOrderedRDD(
    rdd: RDD[Row],
    schema: StructType,
    keyColumn: String,
    ranges: Seq[CloseOpen[Long]]
  ): OrderedRDD[Long, InternalRow] = {
    val keyIdx = schema.fieldIndex(keyColumn)
    val converter = CatalystTypeConvertersWrapper.toCatalystRowConverter(schema)
    // Pair every row's key with its converted row.
    val keyedRows = rdd.map(row => (row.getAs[Long](keyIdx), converter(row)))
    OrderedRDD.fromRDD(keyedRows, ranges)
  }
}
Example 81
Source File: TimeSeriesRDDConversionSpec.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.timeseries

import java.util.concurrent.TimeUnit

import com.twosigma.flint.timeseries.row.Schema
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{ SQLContext, DataFrame, Row }
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.expressions.{ GenericRowWithSchema => ExternalRow }
import org.scalatest.tagobjects.Slow

/**
 * Checks that `TimeSeriesRDD.fromDF` keeps every row for the different combinations of input
 * ordering and the `isSorted` flag.
 *
 * NOTE(review): this listing is incomplete — closing braces, the `)(` of the last `fromDF` call,
 * and probably the headers of the later test cases are missing, so the four loops below appear
 * to sit in a single test.
 */
class TimeSeriesRDDConversionSpec extends TimeSeriesSuite {

  // The largest prime < 100
  override val defaultPartitionParallelism = 97

  // The 10000-th prime.
  private val defaultNumRows = 104729

  /**
   * Builds a DataFrame of `defaultNumRows` rows, each holding a `Long` derived from the row
   * index and the index itself as a `Double`.
   *
   * @param isSorted selects which of the two formulas is used for the `Long` column
   */
  private def createDataFrame(isSorted: Boolean = true)(implicit sqlContext: SQLContext): DataFrame = {
    val n = defaultNumRows
    // NOTE(review): rows carry two values but only "value" is declared here; presumably `Schema`
    // adds a time column itself — confirm.
    val schema = Schema("value" -> DoubleType)
    val rdd: RDD[Row] = sqlContext.sparkContext.parallelize(1 to n, defaultPartitionParallelism).map { i =>
      val data: Array[Any] = if (isSorted) {
        Array((i / 100).toLong, i.toDouble)
      } else {
        Array(((i + 1 - n) / 100).toLong, i.toDouble)
      new ExternalRow(data, schema)
    sqlContext.createDataFrame(rdd, schema)

  "TimeSeriesRDD" should "convert from a sorted DataFrame correctly" taggedAs (Slow) in {
    implicit val _sqlContext = sqlContext
    // Sorted data, declared sorted.
    (1 to 10).foreach {
      i =>
        val tsRdd = TimeSeriesRDD.fromDF(createDataFrame(isSorted = true))(isSorted = true, TimeUnit.NANOSECONDS)
        assert(tsRdd.count() == defaultNumRows)
    // Sorted data, declared unsorted.
    (1 to 10).foreach {
      i =>
        val tsRdd = TimeSeriesRDD.fromDF(createDataFrame(isSorted = true))(isSorted = false, TimeUnit.NANOSECONDS)
        assert(tsRdd.count() == defaultNumRows)
    // Data from the `isSorted = false` generator, declared unsorted.
    (1 to 10).foreach {
      i =>
        val tsRdd = TimeSeriesRDD.fromDF(createDataFrame(isSorted = false))(isSorted = false, TimeUnit.NANOSECONDS)
        assert(tsRdd.count() == defaultNumRows)
    // Data from the `isSorted = false` generator, sorted by "time" first and declared sorted.
    (1 to 10).foreach {
      i =>
        val tsRdd = TimeSeriesRDD.fromDF(
          createDataFrame(isSorted = false).sort("time")
            isSorted = true, TimeUnit.NANOSECONDS
        assert(tsRdd.count() == defaultNumRows)
Example 82
Source File: ParallelCollectionRDD.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.rdd

import org.apache.spark.rdd.RDD
import org.apache.spark.{ Partition, SparkContext, TaskContext }

import scala.reflect.ClassTag

/**
 * A partition that carries its own elements.
 *
 * @param index  the partition index
 * @param values the elements held by this partition
 */
case class ParallelCollectionRDDPartition[T: ClassTag](override val index: Int, values: Seq[T])
  extends Partition

/**
 * An RDD with no parent dependencies (`Nil`) built from in-memory sequences.
 *
 * NOTE(review): this listing is incomplete — the body of `compute`, the expression that the
 * `case (d, index)` clause is applied to, and the closing braces are missing. The clause builds
 * one `ParallelCollectionRDDPartition` per `(d, index)` pair, so presumably each inner sequence
 * of `data` becomes one partition — confirm.
 *
 * @param sc   the Spark context
 * @param data the sequences backing the RDD (marked `@transient`)
 */
class ParallelCollectionRDD[T: ClassTag](
  sc: SparkContext,
  @transient data: Seq[Seq[T]]
) extends RDD[T](sc, Nil) {
  /** Returns the elements of `split`; the body is missing from this listing. */
  override def compute(split: Partition, context: TaskContext): Iterator[T] =

  /** Builds the partitions, pairing each sequence `d` with its `index`. */
  override protected def getPartitions: Array[Partition] = {
      case (d, index) =>
        ParallelCollectionRDDPartition(index, d)
Example 83
Source File: OverlappedOrderedRDDSpec.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.rdd

import com.twosigma.flint.SharedSparkContext
import org.apache.spark.rdd.RDD
import org.scalatest.FlatSpec

/**
 * Tests for `OverlappedOrderedRDD` built from an `OrderedRDD` of the keys 1 to 12 spread over
 * three slices of four keys each.
 *
 * NOTE(review): this listing is incomplete — closing braces and the receiver of the partial
 * function in the last test are missing; a `super.beforeAll()` call may have been lost as well.
 */
class OverlappedOrderedRDDSpec extends FlatSpec with SharedSparkContext {

  val numSlices: Int = 3

  val sliceLength: Int = 4

  // Fixtures, assigned in `beforeAll`.
  var rdd: RDD[(Int, Int)] = _

  var orderedRdd: OrderedRDD[Int, Int] = _

  var overlappedOrderedRdd: OverlappedOrderedRDD[Int, Int] = _

  /** Window for key `t`, as the pair `(t - 2, t)`. */
  private def window(t: Int): (Int, Int) = (t - 2, t)

  override def beforeAll() {
    val s = sliceLength
    // Slice i produces the keys i * s + 1 .. i * s + s, each paired with itself as its value.
    rdd = sc.parallelize(0 until numSlices, numSlices).flatMap {
      i => (1 to s).map { j => i * s + j }
    }.map { x => (x, x) }
    orderedRdd = OrderedRDD.fromRDD(rdd, KeyPartitioningType.Sorted)
    overlappedOrderedRdd = OverlappedOrderedRDD(orderedRdd, window)

  "The OverlappedOrderedRDD" should "be constructed from `OrderedRDD` correctly" in {
    assert(overlappedOrderedRdd.rangeSplits.deep == orderedRdd.rangeSplits.deep)
    // Expected contents: keys 4, 5 and 8, 9 appear twice because of the overlap.
    val benchmark = Array(1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 8, 9, 10, 11, 12).map { x => (x, x) }
    assert(overlappedOrderedRdd.collect().deep == benchmark.deep)

  it should "be able to remove overlapped rows to get an `OrderedRDD` correctly" in {
    assert(overlappedOrderedRdd.rangeSplits.deep == orderedRdd.rangeSplits.deep)
    assert(overlappedOrderedRdd.nonOverlapped().collect().deep == orderedRdd.collect().deep)

  it should "`mapPartitionsWithIndexOverlapped` correctly" in {
    // Doubles every value while keeping the keys.
    val mapped = overlappedOrderedRdd.mapPartitionsWithIndexOverlapped(
      (index, iterator) => { case (k, v) => (k, v * 2) }
    val benchmark = Array(1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 8, 9, 10, 11, 12).map { x => (x, 2 * x) }
    assert(mapped.collect().deep == benchmark.deep)
Example 84
Source File: RDDKafkaWriter.scala    From spark-kafka-writer   with Apache License 2.0 5 votes vote down vote up
package com.github.benfradet.spark.kafka.writer

import org.apache.kafka.clients.producer.{Callback, ProducerRecord}
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

  /**
   * Writes every element of the RDD to Kafka.
   *
   * For each partition a producer is obtained from `KafkaProducerCache` and each element is
   * converted with `transformFunc` and sent.
   *
   * @param producerConfig configuration used to obtain the producer
   * @param transformFunc  converts an element into the record to send
   * @param callback       optional callback passed to each `send`; `null` is passed when absent
   */
  override def writeToKafka[K, V](
    producerConfig: Map[String, Object],
    transformFunc: T => ProducerRecord[K, V],
    callback: Option[Callback] = None
  ): Unit =
    rdd.foreachPartition { partition =>
      val producer = KafkaProducerCache.getProducer[K, V](producerConfig)
      partition.map(transformFunc)
        .foreach(record => producer.send(record, callback.orNull))
    }
Example 85
Source File: DStreamKafkaWriterSpec.scala    From spark-kafka-writer   with Apache License 2.0 5 votes vote down vote up
package com.github.benfradet.spark.kafka.writer

import org.apache.kafka.clients.producer._
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream

import scala.collection.mutable
import scala.concurrent.duration._

/**
 * Tests that writing a DStream to Kafka delivers its messages and triggers a supplied callback.
 *
 * NOTE(review): this listing is incomplete — the write call that takes the `ProducerRecord`
 * function, the callback body, the stream start, the rest of `createDStream` and the closing
 * braces are missing.
 */
class DStreamKafkaWriterSpec extends SKRSpec {

  "a DStreamKafkaWriter" when {
    "given a dstream" should {
      "write its content to Kafka" in {
        val localTopic = topic
        // Ten messages "1" .. "10".
        val msgs = (1 to 10).map(_.toString)
        val stream = createDStream(msgs)
          s => new ProducerRecord[String, String](localTopic, s)

        val results = collect(ssc, localTopic)

        // Poll once per second for up to 30 seconds until the collected results match.
        eventually(timeout(30.seconds), interval(1.second)) {
          results shouldBe msgs

      "trigger a given callback for every write to Kafka" in {
        val localTopic = topic
        val msgs = (1 to 10).map(_.toString)
        val stream = createDStream(msgs)
          s => new ProducerRecord[String, String](localTopic, s),
          // The callback is also Serializable.
          Some(new Callback with Serializable {
            override def onCompletion(metadata: RecordMetadata, exception: Exception): Unit = {

        // The callback counter is expected to reach one trigger per message.
        eventually(timeout(30.seconds), interval(1.second)) {
          SKRSpec.callbackTriggerCount.get() shouldBe msgs.size

  /** Creates a DStream for `seq`; only the creation of the backing queue is visible here. */
  private def createDStream(seq: Seq[String]): DStream[String] = {
    val q = mutable.Queue.empty[RDD[String]]
Example 86
Source File: StreamingExample.scala    From reactiveinflux-spark   with Apache License 2.0 5 votes vote down vote up
package com.pygmalios.reactiveinflux.spark.examples

import com.pygmalios.reactiveinflux._
import com.pygmalios.reactiveinflux.spark._
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.joda.time.DateTime

import scala.concurrent.duration._

/**
 * Example application: builds one Influx point and a queue-backed DStream of points with a
 * one-second batch interval.
 *
 * NOTE(review): this listing is incomplete — the value of `time`, the closing parenthesis of
 * `Point(...)`, and the statements under the last three comments (enqueueing the RDD, saving the
 * stream, starting the streaming context) are missing.
 */
object StreamingExample extends App {
  val conf = new SparkConf()
  val ssc = new StreamingContext(conf, Seconds(1))

  val point1 = Point(
    time        =,
    measurement = "measurement1",
    tags        = Map(
      "tagKey1" -> "tagValue1",
      "tagKey2" -> "tagValue2"),
    // Field values of different types (a String and a Double) in one map.
    fields      = Map(
      "fieldKey1" -> "fieldValue1",
      "fieldKey2" -> 10.7)

  // Provide settings for reactiveinflux
  implicit val params = ReactiveInfluxDbName("example")
  implicit val awaitAtMost = 1.second

  // Create DStream of Influx points
  val queue = new scala.collection.mutable.Queue[RDD[Point]]
  val queueStream: DStream[Point] = ssc.queueStream(queue)

  // Add single RDD with a single Influx point to the DStream

  // Save DStream to Influx

  // Start Spark streaming
Example 87
Source File: Example.scala    From reactiveinflux-spark   with Apache License 2.0 5 votes vote down vote up
package com.pygmalios.reactiveinflux.spark.examples

import com.pygmalios.reactiveinflux._
import com.pygmalios.reactiveinflux.spark._
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.joda.time.DateTime

import scala.concurrent.duration._

/**
 * Example application: builds one Influx point and an RDD containing it.
 *
 * NOTE(review): this listing is incomplete — the value of `time`, the closing parenthesis of
 * `Point(...)`, and the statements under the last two comments (saving the RDD, stopping the
 * Spark context) are missing.
 */
object Example extends App {
  val conf = new SparkConf()
  val sc = new SparkContext(conf)

  val point1 = Point(
    time        =,
    measurement = "measurement1",
    tags        = Map(
      "tagKey1" -> "tagValue1",
      "tagKey2" -> "tagValue2"),
    // Field values of different types (a String and a Double) in one map.
    fields      = Map(
      "fieldKey1" -> "fieldValue1",
      "fieldKey2" -> 10.7)

  // Provide settings for reactiveinflux
  implicit val params = ReactiveInfluxDbName("example")
  implicit val awaitAtMost = 1.second

  // Create RDD with Influx point
  val rdd: RDD[Point] = sc.parallelize(Seq(point1))

  // Save RDD to Influx

  // Stop Spark context
Example 88
Source File: PointRDDExtensions.scala    From reactiveinflux-spark   with Apache License 2.0 5 votes vote down vote up
package com.pygmalios.reactiveinflux.spark.extensions

import com.pygmalios.reactiveinflux.spark.config.ReactiveInfluxSparkConfig
import com.pygmalios.reactiveinflux.spark.{RDDExtensions, _}
import com.pygmalios.reactiveinflux.{PointNoTime, ReactiveInfluxDbName}
import org.apache.spark.rdd.RDD
import org.slf4j.LoggerFactory

import scala.concurrent.duration.Duration

/**
 * Adds `saveToInflux` to an RDD of Influx points.
 *
 * NOTE(review): this listing is incomplete — the call that writes a batch, several closing
 * braces and the start of the final log statement are missing.
 *
 * @param rdd the RDD of points to save
 */
private[spark] class PointRDDExtensions[+T <: PointNoTime](rdd: RDD[T]) extends RDDExtensions[T] {
  import PointRDDExtensions._

  /**
   * Writes the RDD to Influx partition by partition, in batches of the configured size.
   *
   * @param reactiveInfluxDbName name of the target database
   * @param awaitAtMost          maximum wait duration (its use is not visible in this listing)
   */
  override def saveToInflux()(implicit reactiveInfluxDbName: ReactiveInfluxDbName,
                              awaitAtMost: Duration): Unit = {
    // Process each partition separately
    // The totals are vars on the companion object and are reset on every call.
    // NOTE(review): they are updated inside the `foreachPartition` closure, so presumably they
    // are only reliable when tasks run in the driver's JVM — confirm.
    totalBatchCount = 0
    totalPointCount = 0
    rdd.foreachPartition { partition =>
      withInflux { db =>
        val batchSize = ReactiveInfluxSparkConfig(db.config).sparkBatchSize

        // Write points in batches
        var batchCount = 0
        var pointCount = 0
        // Window size and step are both `batchSize`, so the batches do not overlap.
        partition.sliding(batchSize, batchSize).foreach { batch =>
          // Write single batch

          // Statistics for logging
          batchCount += 1
          pointCount += batch.size

        totalBatchCount += batchCount
        totalPointCount += pointCount

        log.debug(s"Partition with $pointCount points written to Influx in $batchCount batches.")
    }"RDD with ${rdd.partitions.size} partitions and $totalPointCount points written to Influx in $totalBatchCount batches.")

/** Companion holding the logger and the write counters used by the `PointRDDExtensions` class. */
object PointRDDExtensions {
  private val log = LoggerFactory.getLogger(classOf[PointRDDExtensions[_]])

  // This makes sense for testing purposes only
  // Counters for batches and points written; visible within the `reactiveinflux` package.
  private[reactiveinflux] var totalBatchCount = 0
  private[reactiveinflux] var totalPointCount = 0
}
Example 89
package com.github.maxpumperla.ml_spark.streaming

import org.apache.spark.mllib.fpm.PrefixSpan
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Example application: sets up a `PrefixSpan` miner over the MSNBC sequence file, then reports
 * for each sequence arriving on a socket stream whether it is among the frequent sequences.
 *
 * NOTE(review): this listing is incomplete — the right-hand sides of `trainSequences`, `psModel`
 * and `freqSequences`, the rest of the `sequences` pipeline, the `foreachRDD` call, the start of
 * the streaming context and the closing braces are missing.
 */
object MSNBCStreamingExample extends App {

    val conf = new SparkConf()
      .setAppName("MSNBC data initial streaming example")
    val sc = new SparkContext(conf)
    // Streaming batches of 10 seconds.
    val ssc = new StreamingContext(sc, batchDuration = Seconds(10))

    // Each line of the file is parsed as space-separated integers.
    val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line =>
      line.split(" ").map(_.toInt)
    val trainSequences: RDD[Array[Array[Int]]] =
    val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15)
    val psModel =
    val freqSequences =

    val rawSequences: DStream[String] = ssc.socketTextStream("localhost", 9999)

    val sequences: DStream[Array[Array[Int]]] = rawSequences
      .map(line => line.split(" ").map(_.toInt))

    print(">>> Analysing new batch of data")
      rdd => rdd.foreach(
        array => {
          println(">>> Sequence: ")
          // NOTE(review): the expression being printed is garbled in this listing.
          println("[", ", ", "]")).mkString("[", ", ", "]"))
          // `.deep` compares the nested arrays element by element.
          freqSequences.count(_.deep == array.deep) match {
            case count if count > 0 => println("is frequent!")
            case _ => println("is not frequent.")
    print(">>> done")


Example 90
package com.github.maxpumperla.ml_spark.streaming

import org.apache.spark.mllib.fpm.{FPGrowth, PrefixSpan}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Example application: mines frequent itemsets and association rules with `FPGrowth` and
 * sequential patterns with `PrefixSpan` on the MSNBC sequence file.
 *
 * NOTE(review): this listing is incomplete — several right-hand sides (`uniqueTransactions`,
 * `model`, `sequences`, `psModel`), the receivers of the method chains near the end and the
 * closing braces are missing or garbled.
 */
object MSNBCPatternMining extends App {

    val conf = new SparkConf()
      .setAppName(" data pattern mining")
    val sc = new SparkContext(conf)

    // Small hand-made data set and miner; their use is not visible in this listing.
    val transactionTest = sc.parallelize(Array(Array("A", "B", "C"), Array("B", "C", "A")))
    val fp = new FPGrowth().setMinSupport(0.8).setNumPartitions(5)

    // Each line of the file is parsed as space-separated integers.
    val transactions: RDD[Array[Int]] = sc.textFile("./msnbc990928.seq") map { line =>
      line.split(" ").map(_.toInt)

    // NOTE: Caching data is recommended
    val uniqueTransactions: RDD[Array[Int]] =

    val fpGrowth = new FPGrowth().setMinSupport(0.01)
    val model =
    val count = uniqueTransactions.count()

    // Print itemsets of at least three items with their frequency relative to `count`.
    model.freqItemsets.collect().foreach { itemset =>
      if (itemset.items.length >= 3)
        println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq / count.toDouble )

    val rules = model.generateAssociationRules(confidence = 0.4)
    // Print each rule with its confidence rounded to two decimals.
    rules.collect().foreach { rule =>
      println("[" + rule.antecedent.mkString(",") + "=>"
        + rule.consequent.mkString(",") + "]," + (100 * rule.confidence).round / 100.0)

    // Rules whose first consequent item is 1.
    val frontPageConseqRules = rules.filter(_.consequent.head == 1)

    val sequences: RDD[Array[Array[Int]]] =

    val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15)
    // NOTE(review): the next statement is garbled; what remains counts entries per sequence
    // length and prints one "length: count" line per length.
    val psModel = => (fs.sequence.length, 1))
      .reduceByKey(_ + _)
      .foreach(fs => println(s"${fs._1}: ${fs._2}"))

      // Fragment: keys by sequence length and keeps the highest-frequency entry of each group.
      .map(fs => (fs.sequence.length, fs))
      .map(group => group._2.reduce((f1, f2) => if (f1.freq > f2.freq) f1 else f2))
      .map("[", ", ", "]")).mkString("[", ", ", "]"))

      // Fragment: counts entries per formatted string and keeps the pair with the largest count.
      .map(fs => ("[", ", ", "]")).mkString("[", ", ", "]"), 1))
      .reduceByKey(_ + _)
      .reduce( (f1, f2) => if (f1._2 > f2._2) f1 else f2 )

    // Entry with the highest frequency; the result is not stored.
    psModel.freqSequences.reduce( (f1, f2) => if (f1.freq > f2.freq) f1 else f2 )
    psModel.freqSequences.filter(_.sequence.length == 1).map(_.sequence.toString).collect.foreach(println)

    psModel.freqSequences.collect().foreach {
      freqSequence =>
"[", ", ", "]")).mkString("[", ", ", "]") + ", " + freqSequence.freq
Example 91
package com.github.maxpumperla.ml_spark.streaming

import org.apache.spark.mllib.fpm.PrefixSpan
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Example application: reads "id: value" events from a socket stream, keeps per-id counts, joins
 * each id's values over a sliding window and checks the resulting sequences against frequent
 * sequences.
 *
 * NOTE(review): this listing is incomplete — the right-hand sides of `trainSequences`, `psModel`
 * and `freqSequences`, the receivers of several `map` calls, the `foreachRDD` call, the start of
 * the streaming context and the closing braces are missing.
 */
object MSNBCStreamingAdvanced extends App {

    val conf = new SparkConf()
      .setAppName("MSNBC data initial streaming example")
    val sc = new SparkContext(conf)
    // Streaming batches of 10 seconds.
    val ssc = new StreamingContext(sc, batchDuration = Seconds(10))

    // Each line of the file is parsed as space-separated integers.
    val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line =>
      line.split(" ").map(_.toInt)
    val trainSequences: RDD[Array[Array[Int]]] =
    val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15)
    val psModel =
    val freqSequences =

    val rawEvents: DStream[String] = ssc.socketTextStream("localhost", 9999)

    // Events are split on ": " into an integer id and a string payload.
    val events: DStream[(Int, String)] = => line.split(": "))
        .map(kv => (kv(0).toInt, kv(1)))

    // Per-batch counts of events per id.
    val countIds = => (e._1, 1))
    val counts: DStream[(Int, Int)] = countIds.reduceByKey(_ + _)

    /** Adds the sum of the new values to the running count, starting from 0 when none exists. */
    def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
      Some(runningCount.getOrElse(0) + newValues.sum)
    val runningCounts = countIds.updateStateByKey[Int](updateFunction _)

    // 20-second window evaluated every 10 seconds.
    val duration = Seconds(20)
    val slide = Seconds(10)

    // Values of the same id within the window are joined with single spaces.
    val rawSequences: DStream[(Int, String)] = events
      .reduceByKeyAndWindow((v1: String, v2: String) => v1 + " " + v2, duration, slide)

    val sequences: DStream[Array[Array[Int]]] =
      .map(line => line.split(" ").map(_.toInt))

    print(">>> Analysing new batch of data")
      rdd => rdd.foreach(
        array => {
          println(">>> Sequence: ")
          // NOTE(review): the expression being printed is garbled in this listing.
          println("[", ", ", "]")).mkString("[", ", ", "]"))
          // `.deep` compares the nested arrays element by element.
          freqSequences.count(_.deep == array.deep) match {
            case count if count > 0 => println("is frequent!")
            case _ => println("is not frequent.")
    print(">>> done")

Example 92
Source File: GraphFromRdd.scala    From Mastering-Machine-Learning-with-Spark-2.x   with MIT License 5 votes vote down vote up
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

// GraphX example: builds a five-person graph from vertex and edge RDDs, then runs
// edge mapping, an in-degree aggregation and PageRank on it.
// NOTE(review): the listing is truncated — closing braces/parentheses are missing and
// one line below fuses two statements, so it does not compile as shown.
object GraphFromRdd extends App {

     val conf = new SparkConf()
       .setAppName("RDD graph")
     val sc = new SparkContext(conf)

     // Vertices: (id, name).
     val vertices: RDD[(VertexId, String)] = sc.parallelize(
       Array((1L, "Anne"),
         (2L, "Bernie"),
         (3L, "Chris"),
         (4L, "Don"),
         (5L, "Edgar")))

     // Directed edges labelled with the relationship from source to destination.
     val edges: RDD[Edge[String]] = sc.parallelize(
       Array(Edge(1L, 2L, "likes"),
         Edge(2L, 3L, "trusts"),
         Edge(3L, 4L, "believes"),
         Edge(4L, 5L, "worships"),
         Edge(1L, 3L, "loves"),
         Edge(4L, 1L, "dislikes")))

     val friendGraph: Graph[String, String] = Graph(vertices, edges)
     // NOTE(review): the tail of this line (`e => e.srcId > e.dstId ).count()`) belongs to a
     // separate statement whose beginning was lost — restore it from the original source.
     friendGraph.vertices.collect.foreach(println) e => e.srcId > e.dstId ).count()

     // Replaces each edge attribute with whether the source id is greater than the destination id.
     val mappedEdgeGraph: Graph[String, Boolean] = friendGraph.mapEdges( e => e.srcId > e.dstId )

     // Sends 1 along every edge to its destination and sums the messages per vertex;
     // the assertion below compares the result with `friendGraph.inDegrees`.
     val inDegVertexRdd: VertexRDD[Int] = friendGraph.aggregateMessages[Int](
       sendMsg = ec => ec.sendToDst(1),
       mergeMsg = (msg1, msg2) => msg1+msg2
     assert(inDegVertexRdd.collect.deep == friendGraph.inDegrees.collect.deep)

     friendGraph.staticPageRank(numIter = 10).vertices.collect.foreach(println)
     // NOTE(review): the result of this call is discarded in the visible code.
     friendGraph.pageRank(tol = 0.0001, resetProb = 0.15)

Example 93
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
//import org.graphframes._

// Builds the same five-person GraphX graph as a starting point for a GraphFrames query.
// The GraphFrames part (import and query) is commented out in this listing.
// NOTE(review): the closing brace of the object is missing from the listing.
object GraphFramesExample extends App {

    val conf = new SparkConf()
      .setAppName("RDD graph")
    val sc = new SparkContext(conf)

    // Vertices: (id, name).
    val vertices: RDD[(VertexId, String)] = sc.parallelize(
      Array((1L, "Anne"),
        (2L, "Bernie"),
        (3L, "Chris"),
        (4L, "Don"),
        (5L, "Edgar")))

    // Directed edges labelled with the relationship from source to destination.
    val edges: RDD[Edge[String]] = sc.parallelize(
      Array(Edge(1L, 2L, "likes"),
        Edge(2L, 3L, "trusts"),
        Edge(3L, 4L, "believes"),
        Edge(4L, 5L, "worships"),
        Edge(1L, 3L, "loves"),
        Edge(4L, 1L, "dislikes")))

    val friendGraph: Graph[String, String] = Graph(vertices, edges)

    // Disabled: converts the graph to a GraphFrame and searches for two-hop paths
    // v1 -> v2 -> v3 where the first edge is 'trusts' or the last vertex is 'Chris'.
//    val friendGraphFrame = GraphFrame.fromGraphX(friendGraph)
//    friendGraphFrame.find("(v1)-[e1]->(v2); (v2)-[e2]->(v3)").filter(
//      "e1.attr = 'trusts' OR v3.attr = 'Chris'"
//    ).collect.foreach(println)

Example 94
Source File: GephiApp.scala    From Mastering-Machine-Learning-with-Spark-2.x   with MIT License 5 votes vote down vote up
package com.github.maxpumperla.ml_spark.graphs


import com.github.maxpumperla.ml_spark.utils.Gephi.toGexf
import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

// Builds a five-person GraphX graph and opens a writer for "./graph.gexf".
// NOTE(review): the listing stops after the writer is created — the code that writes the
// graph (presumably via the imported `toGexf`) and closes the writer is missing, and no
// import of `PrintWriter` is visible in this file. Confirm against the original source.
object GephiApp extends App {

    val conf = new SparkConf()
      .setAppName("Gephi Test Writer")
    val sc = new SparkContext(conf)

    // Vertices: (id, name).
    val vertices: RDD[(VertexId, String)] = sc.parallelize(
      Array((1L, "Anne"),
        (2L, "Bernie"),
        (3L, "Chris"),
        (4L, "Don"),
        (5L, "Edgar")))

    // Directed edges labelled with the relationship from source to destination.
    val edges: RDD[Edge[String]] = sc.parallelize(
      Array(Edge(1L, 2L, "likes"),
        Edge(2L, 3L, "trusts"),
        Edge(3L, 4L, "believes"),
        Edge(4L, 5L, "worships"),
        Edge(1L, 3L, "loves"),
        Edge(4L, 1L, "dislikes")))

    val graph: Graph[String, String] = Graph(vertices, edges)

    val pw = new PrintWriter("./graph.gexf")
Example 95
Source File: DCollectionGenProperties.scala    From kontextfrei   with Apache License 2.0 5 votes vote down vote up
package com.danielwestheide.kontextfrei.scalatest

import org.apache.spark.rdd.RDD
import org.scalatest.PropSpecLike
import org.scalatest.prop.GeneratorDrivenPropertyChecks

// Property suite, generic in the collection type DColl, checking that arbitrary
// DColl[String] values can be generated: for each generated collection, `ops.count`
// is compared with the length of `ops.collectAsArray`.
// NOTE(review): closing braces are missing from this listing.
trait DCollectionGenProperties[DColl[_]]
    extends PropSpecLike
    with GeneratorDrivenPropertyChecks
    with DCollectionGen
    with KontextfreiSpec[DColl] {

  property("Can get arbitrary DCollections") {
    forAll { xs: DColl[String] =>
      // NOTE(review): `===` alone only produces a Boolean; confirm the original wraps it in an assertion.
      ops.count(xs) === ops.collectAsArray(xs).length


// Runs the DCollectionGenProperties suite with Stream as the collection type.
class DCollectionGenStreamSpec
    extends DCollectionGenProperties[Stream]
    with StreamSpec
// Runs the DCollectionGenProperties suite with RDD as the collection type.
class DCollectionGenRDDSpec extends DCollectionGenProperties[RDD] with RDDSpec 
Example 96
Source File: CollectingInstancesProperties.scala    From kontextfrei   with Apache License 2.0 5 votes vote down vote up
package com.danielwestheide.kontextfrei.scalatest

import org.apache.spark.rdd.RDD
import org.scalatest.enablers.Collecting
import org.scalatest.{Inspectors, PropSpec, PropSpecLike}
import org.scalatest.prop.GeneratorDrivenPropertyChecks

// Property suite, generic in the collection type DColl, for the ScalaTest `Collecting`
// instance of DCollections: usable with `Inspectors.forAll`, and `sizeOf` /
// `loneElementOf` results for inputs of various sizes.
// NOTE(review): the listing is truncated — only the first property keeps its
// `property(` opener, the assertion wrappers around the `implicitly[...]` expressions
// are cut, and closing braces are missing.
trait CollectingInstancesProperties[DColl[_]]
    extends PropSpecLike
    with GeneratorDrivenPropertyChecks
    with KontextfreiSpec[DColl]
    with CollectingInstances {

  property("There is a Collecting instance for DCollection") {
    forAll { (xs: List[String]) =>
      val dcoll = ops.unit(xs)
      Inspectors.forAll(dcoll) { x =>

    // sizeOf on the DCollection is compared with the size of the source list.
    "Collecting nature of DCollection returns the original size of the input sequence") {
    forAll { (xs: List[String]) =>
      val dcoll = ops.unit(xs)
        implicitly[Collecting[String, DColl[String]]]
          .sizeOf(dcoll) === xs.size)

    // For a single-element input, loneElementOf is compared with Some(element).
    "Collecting nature of DCollection returns the Some loneElement if input sequence has exactly one element") {
    forAll { (x: String) =>
      val dcoll = ops.unit(List(x))
        implicitly[Collecting[String, DColl[String]]]
          .loneElementOf(dcoll) === Some(x))

    // Only inputs with more than one element are considered (`whenever` guard).
    "Collecting nature of DCollection returns the None as loneElement if input sequence as more than one element") {
    forAll { (xs: List[String]) =>
      whenever(xs.size > 1) {
        val dcoll = ops.unit(xs)
          implicitly[Collecting[String, DColl[String]]]

    // Empty-input case.
    "Collecting nature of DCollection returns the None as loneElement if input sequence is empty") {
    val dcoll = ops.unit(List.empty[String])
      implicitly[Collecting[String, DColl[String]]]


// Runs the CollectingInstancesProperties suite with Stream as the collection type.
class CollectionInstancesStreamSpec
    extends CollectingInstancesProperties[Stream]
    with StreamSpec

// Runs the CollectingInstancesProperties suite with RDD as the collection type.
class CollectionInstancesRDDSpec
    extends CollectingInstancesProperties[RDD]
    with RDDSpec 
Example 97
Source File: RDDPairFunctions.scala    From kontextfrei   with Apache License 2.0 5 votes vote down vote up
package com.danielwestheide.kontextfrei.rdd

import com.danielwestheide.kontextfrei.DCollectionPairFunctions
import org.apache.spark.Partitioner
import org.apache.spark.rdd.RDD

import scala.collection.Map
import scala.reflect.ClassTag

// RDD-backed implementation of DCollectionPairFunctions. Every operation wraps its work
// in `withSite(...)`, passing the input RDD as the argument.
// NOTE(review): the listing is truncated — only `aggregateByKey` and `combineByKey` keep
// their body line; the other bodies and all closing braces are missing. The bodies
// presumably delegate to the RDD method of the same name — confirm against the original.
private[kontextfrei] trait RDDPairFunctions
    extends DCollectionPairFunctions[RDD] { this: RDDBase =>

  override final def cogroup[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Iterable[B], Iterable[C]))] = withSite(x) {

  override final def values[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[B] = withSite(x) {

  override final def keys[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[A] = withSite(x) {

  override final def leftOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (B, Option[C]))] = withSite(x) {

  override final def rightOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], C))] = withSite(x) {

  override final def fullOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], Option[C]))] = withSite(x) {

  override final def mapValues[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(f: B => C): RDD[(A, C)] = withSite(x) {

  override final def flatMapValues[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(f: B => TraversableOnce[C]): RDD[(A, C)] = withSite(x) {

  override final def reduceByKey[A: ClassTag, B: ClassTag](xs: RDD[(A, B)])(
      f: (B, B) => B): RDD[(A, B)] = withSite(xs) {

  override final def foldByKey[A: ClassTag, B: ClassTag](
      xs: RDD[(A, B)])(zeroValue: B, f: (B, B) => B): RDD[(A, B)] = withSite(xs) {

  // Delegates to RDD.aggregateByKey with the given zero value and the two combine functions.
  override final def aggregateByKey[A: ClassTag, B: ClassTag, C: ClassTag](
      xs: RDD[(A, B)])(zeroValue: C)(seqOp: (C, B) => C,
                                     combOp: (C, C) => C): RDD[(A, C)] = withSite(xs) {
    _.aggregateByKey(zeroValue)(seqOp, combOp)

  // Delegates to RDD.combineByKey with the three combiner functions.
  override final def combineByKey[A: ClassTag, B: ClassTag, C: ClassTag](
      xs: RDD[(A, B)])(createCombiner: B => C)(
      mergeValue: (C, B) => C,
      mergeCombiners: (C, C) => C): RDD[(A, C)] = withSite(xs) {
    _.combineByKey(createCombiner, mergeValue, mergeCombiners)

  override final def countByKey[A: ClassTag, B: ClassTag](
      xs: RDD[(A, B)]): Map[A, Long] = withSite(xs) {

  override final def collectAsMap[A: ClassTag, B: ClassTag](
      xs: RDD[(A, B)]): Map[A, B] = withSite(xs) {

  override final def partitionBy[A: ClassTag, B: ClassTag](
      xs: RDD[(A, B)])(partitioner: Partitioner): RDD[(A, B)] = withSite(xs) {
Example 98
Source File: RDDOrderedFunctions.scala    From kontextfrei   with Apache License 2.0 5 votes vote down vote up
package com.danielwestheide.kontextfrei.rdd
import com.danielwestheide.kontextfrei.DCollectionOrderedFunctions
import org.apache.spark.Partitioner
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

// RDD-backed implementation of DCollectionOrderedFunctions. Every operation wraps its
// work in `withSite(x)` and requires an Ordering for the key type.
// NOTE(review): the listing is truncated — the bodies of `sortByKey` and
// `repartitionAndSortWithinPartitions` and all closing braces are missing.
private[kontextfrei] trait RDDOrderedFunctions
    extends DCollectionOrderedFunctions[RDD] { this: RDDBase =>

  override final def sortByKey[A: ClassTag: Ordering, B: ClassTag](
      x: RDD[(A, B)])(ascending: Boolean): RDD[(A, B)] = withSite(x) {

  // Delegates to RDD.sortByKey with an explicit partition count.
  override final def sortByKeyWithNumPartitions[A: ClassTag: Ordering,
                                                B: ClassTag](
      x: RDD[(A, B)])(ascending: Boolean, numPartitions: Int): RDD[(A, B)] = withSite(x) {
    _.sortByKey(ascending, numPartitions)

  // Delegates to RDD.filterByRange with the given bounds.
  override final def filterByRange[A: ClassTag: Ordering, B: ClassTag](
      x: RDD[(A, B)])(lower: A, upper: A): RDD[(A, B)] = withSite(x) {
    _.filterByRange(lower, upper)

  // NOTE(review): unlike its siblings this override is not marked `final` — confirm that is intended.
  override def repartitionAndSortWithinPartitions[
      A: ClassTag: Ordering,
      B: ClassTag](
      x: RDD[(A, B)])(
      partitioner: Partitioner)
    : RDD[(A, B)] = withSite(x) {
Example 99
Source File: RDDCollectionOpsSpec.scala    From kontextfrei   with Apache License 2.0 5 votes vote down vote up
package com.danielwestheide.kontextfrei

import com.danielwestheide.kontextfrei.rdd.RDDOpsSupport
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.scalatest.BeforeAndAfterAll

// Runs the DCollectionOpsProperties suite against RDDs, using a SparkContext with
// master "local[2]" and application name "dcollection-spec".
// NOTE(review): the listing is truncated — the value assigned to `ops` and the body of
// `afterAll` are missing (the latter presumably stops the SparkContext — confirm).
class RDDCollectionOpsSpec
    extends DCollectionOpsProperties[RDD]
    with BeforeAndAfterAll {
  implicit val sparkContext = new SparkContext("local[2]", "dcollection-spec")
  override implicit val ops: DCollectionOps[RDD] =
  override protected def afterAll(): Unit = {
Example 100
Source File: TSNEHelper.scala    From spark-tsne   with Apache License 2.0 5 votes vote down vote up
package com.github.saurfang.spark.tsne

import breeze.linalg._
import breeze.stats._
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
import org.apache.spark.rdd.RDD

// Helper routines for t-SNE: building the P matrix entries and applying one update of
// the embedding Y.
// NOTE(review): the listing is truncated — closing braces/parentheses, the receiver of
// the `.flatMap` in `computeP`, and an `else` in `update` are missing.
object TSNEHelper {
  // p_ij = (p_{i|j} + p_{j|i}) / 2n
  // Emits every entry under both (i, j) and (j, i), sums the values per index pair, then
  // divides by 2n with a floor of 1e-12.
  // NOTE(review): the declared return type groups values per row index, but no grouping
  // step is visible here (presumably a trailing `.groupByKey()` was lost — confirm).
  def computeP(p_ji: CoordinateMatrix, n: Int): RDD[(Int, Iterable[(Int, Double)])] = {
      .flatMap(e => Seq(
      ((e.i.toInt, e.j.toInt), e.value),
      ((e.j.toInt, e.i.toInt), e.value)
      .reduceByKey(_ + _) // p + p'
      .map{case ((i, j), v) => (i, (j, math.max(v / 2 / n, 1e-12))) } // p / 2n

  // Applies one update step in place to `gains`, `iY` and `Y`, using the gradient `dY`
  // and the settings imported from `param`.
  def update(Y: DenseMatrix[Double],
             dY: DenseMatrix[Double],
             iY: DenseMatrix[Double],
             gains: DenseMatrix[Double],
             iteration: Int,
             param: TSNEParam): DenseMatrix[Double] = {
    import param._
    // Uses the initial momentum up to and including iteration `t_momentum`, the final one afterwards.
    val momentum = if (iteration <= t_momentum) initial_momentum else final_momentum
    gains.foreachPair {
      case ((i, j), old_gain) =>
        // New gain is floored at `min_gain`; the condition tests whether dY and iY differ in sign.
        // NOTE(review): the `else` between the two branch expressions below and the closing
        // parenthesis of `math.max` are missing from this listing.
        val new_gain = math.max(min_gain,
          if ((dY(i, j) > 0.0) != (iY(i, j) > 0.0))
            old_gain + 0.2
            old_gain * 0.8
        gains.update(i, j, new_gain)

        val new_iY = momentum * iY(i, j) - eta * new_gain * dY(i, j)
        iY.update(i, j, new_iY)

        Y.update(i, j, Y(i, j) + new_iY) // Y += iY
    // NOTE(review): presumably re-centres Y by subtracting the per-column mean from every
    // row — confirm against Breeze broadcasting semantics.
    val t_Y: DenseVector[Double] = mean(Y(::, *)).t
    val y_sub = Y(*, ::)
    Y := y_sub - t_Y
Example 101
Source File: LocalRunner.scala    From spark-betweenness   with Apache License 2.0 5 votes vote down vote up
package com.centrality.kBC

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.VertexId
import org.apache.spark.rdd.RDD

// Local runner: builds a four-user sample graph on a local-mode SparkContext.
// NOTE(review): the listing is truncated — the opening braces after `object MainRunner`
// and `main(...)` are missing, and the `kBCGraph` expression is cut down to `, 3)`
// (the call it belonged to is not visible).
object MainRunner 
  def main(args: Array[String])
    // Create spark context
    val appName="kBC"
    val sparkMode="local"
    val conf = new SparkConf().setAppName(appName).setMaster(sparkMode);
    val sc = new SparkContext(conf);
    // Create sample graph
    // Create an RDD for vertices
    val users: RDD[(VertexId, (String, String))] =
    sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
                         (5L, ("franklin", "prof")), (2L, ("istoica", "prof"))))
    // Create an RDD for edges
    val relationships: RDD[Edge[String]] =
      sc.parallelize(Array(Edge(3L, 7L, "collab"),    Edge(5L, 3L, "advisor"),
                           Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi")))
    // Define a default user in case there are relationship with missing user
    val defaultUser = ("John Doe", "Missing")
    // Build the initial Graph
    val graph = Graph(users, relationships, defaultUser)
    val kBCGraph =, 3)
Example 102
Source File: TiRDD.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.tispark

import com.pingcap.tikv._
import com.pingcap.tikv.exception.TiInternalException
import com.pingcap.tikv.meta.TiDAGRequest
import com.pingcap.tikv.types.Converter
import com.pingcap.tikv.util.RangeSplitter
import com.pingcap.tikv.util.RangeSplitter.RegionTask
import com.pingcap.tispark.{TiPartition, TiTableReference}
import org.apache.spark.Partition
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow

import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.collection.mutable.ListBuffer

// Base RDD of InternalRow for reading a TiDB table (or one physical partition of it,
// identified by `physicalId`) according to a TiDAGRequest. It has no parent RDDs (`Nil`).
// NOTE(review): closing braces are missing from this listing, and `getPartitions` has no
// visible return expression (presumably `result.toArray` — confirm).
abstract class TiRDD(
    val dagRequest: TiDAGRequest,
    val physicalId: Long,
    val tiConf: TiConfiguration,
    val tableRef: TiTableReference,
    @transient private val session: TiSession,
    @transient private val sparkSession: SparkSession)
    extends RDD[InternalRow](sparkSession.sparkContext, Nil) {

  // Size threshold at which the region tasks collected for a host are emitted as one partition.
  private lazy val partitionPerSplit = tiConf.getPartitionPerSplit

  // Throws TiInternalException when the time zone in `tiConf` differs from
  // `Converter.getLocalTimezone` in the JVM where this runs.
  protected def checkTimezone(): Unit = {
    if (!tiConf.getLocalTimeZone.equals(Converter.getLocalTimezone)) {
      throw new TiInternalException(
        "timezone are different! driver: " + tiConf.getLocalTimeZone + " executor:" + Converter.getLocalTimezone +
          " please set user.timezone in spark.driver.extraJavaOptions and spark.executor.extraJavaOptions")

  // Splits the key ranges of `physicalId` into region tasks and groups them by host.
  override protected def getPartitions: Array[Partition] = {
    val keyWithRegionTasks = RangeSplitter
      .splitRangeByRegion(dagRequest.getRangesByPhysicalId(physicalId), dagRequest.getStoreType)

    // host -> set of region tasks collected for that host so far
    val hostTasksMap = new mutable.HashMap[String, mutable.Set[RegionTask]]
      with mutable.MultiMap[String, RegionTask]

    var index = 0
    val result = new ListBuffer[TiPartition]
    for (task <- keyWithRegionTasks) {
      hostTasksMap.addBinding(task.getHost, task)
      val tasks = hostTasksMap(task.getHost)
      // Once a host has accumulated `partitionPerSplit` tasks, emit them as one partition.
      // NOTE(review): the host's entry is presumably removed from the map afterwards
      // (that line is not visible here) — confirm, otherwise tasks would be emitted twice.
      if (tasks.size >= partitionPerSplit) {
        result.append(new TiPartition(index, tasks.toSeq, sparkContext.applicationId))
        index += 1

    // add rest
    for (tasks <- hostTasksMap.values) {
      result.append(new TiPartition(index, tasks.toSeq, sparkContext.applicationId))
      index += 1

  // Prefers the host of the first task in the partition.
  override protected def getPreferredLocations(split: Partition): Seq[String] =
    split.asInstanceOf[TiPartition].tasks.head.getHost :: Nil
Example 103
Source File: BasicDataSourceSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.datasource

import com.pingcap.tikv.exception.TiBatchWriteException
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Data source tests on table "test_datasource_basic": select, append-write and
// overwrite-write (which is expected to fail with TiBatchWriteException).
// NOTE(review): the listing is truncated — the bodies of the `if (!supportBatchWrite)`
// guards, the statements the `.option(...)` chains belong to, part of the final
// assertion, the `try`/`finally` bodies and all closing braces are missing.
class BasicDataSourceSuite extends BaseDataSourceTest("test_datasource_basic") {
  // row1 and row2 match the rows inserted by `beforeAll`; row3 and row4 are written by the tests.
  private val row1 = Row(null, "Hello")
  private val row2 = Row(2, "TiDB")
  private val row3 = Row(3, "Spark")
  private val row4 = Row(4, null)

  private val schema = StructType(
    List(StructField("i", IntegerType), StructField("s", StringType)))

  // Creates the table and inserts the two initial rows.
  override def beforeAll(): Unit = {

    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(null, 'Hello'), (2, 'TiDB')")

  test("Test Select") {
    if (!supportBatchWrite) {

    testTiDBSelect(Seq(row1, row2))

  test("Test Write Append") {
    if (!supportBatchWrite) {

    val data: RDD[Row] = sc.makeRDD(List(row3, row4))
    val df = sqlContext.createDataFrame(data, schema)

      .option("database", database)
      .option("table", table)

    // After the append, the table is expected to hold the initial rows plus row3 and row4.
    testTiDBSelect(Seq(row1, row2, row3, row4))

  test("Test Write Overwrite") {
    if (!supportBatchWrite) {

    val data: RDD[Row] = sc.makeRDD(List(row3, row4))
    val df = sqlContext.createDataFrame(data, schema)

    val caught = intercept[TiBatchWriteException] {
        .option("database", database)
        .option("table", table)

        .equals("SaveMode: Overwrite is not supported. TiSpark only support SaveMode.Append."))

  override def afterAll(): Unit =
    try {
    } finally {
Example 104
Source File: UpperCaseColumnNameSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

// Data source test for writing to a table whose column names are upper case
// (O_ORDERKEY, O_CUSTKEY).
// NOTE(review): the listing is truncated — the call wrapping the CREATE TABLE text, the
// body of the `if (!supportBatchWrite)` guard, the statement the `.option(...)` chain
// belongs to, the `try`/`finally` bodies and all closing braces are missing.
class UpperCaseColumnNameSuite
    extends BaseDataSourceTest("test_datasource_uppser_case_column_name") {

  private val row1 = Row(1, 2)

  private val schema = StructType(
    List(StructField("O_ORDERKEY", IntegerType), StructField("O_CUSTKEY", IntegerType)))

  override def beforeAll(): Unit = {

                  |CREATE TABLE $dbtable (O_ORDERKEY INTEGER NOT NULL,
                  |                       O_CUSTKEY INTEGER NOT NULL);

  test("Test insert upper case column name") {
    if (!supportBatchWrite) {

    val data: RDD[Row] = sc.makeRDD(List(row1))
    val df = sqlContext.createDataFrame(data, schema)
      .option("database", database)
      .option("table", table)

  override def afterAll(): Unit =
    try {
    } finally {
Example 105
Source File: MissingParameterSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Data source test: a write that sets only the "table" option is expected to fail with
// IllegalArgumentException mentioning the missing 'database' option.
// NOTE(review): the listing is truncated — the body of the `if (!supportBatchWrite)`
// guard, the statement the `.option(...)` chain belongs to, the start of the final
// assertion, the `try`/`finally` bodies and all closing braces are missing.
class MissingParameterSuite extends BaseDataSourceTest("test_datasource_missing_parameter") {
  private val row1 = Row(null, "Hello")

  private val schema = StructType(
    List(StructField("i", IntegerType), StructField("s", StringType)))

  test("Missing parameter: database") {
    if (!supportBatchWrite) {

    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")

    val caught = intercept[IllegalArgumentException] {
      val rows = row1 :: Nil
      val data: RDD[Row] = sc.makeRDD(rows)
      val df = sqlContext.createDataFrame(data, schema)
        .option("table", table)
        .equals("requirement failed: Option 'database' is required."))

  override def afterAll(): Unit =
    try {
    } finally {
Example 106
Source File: OnlyOnePkSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

// Data source test: appends two rows to a table whose only column is an integer
// primary key, then checks that both rows can be selected.
// NOTE(review): the listing is truncated — the body of the `if (!supportBatchWrite)`
// guard, the statement the `.option(...)` chain belongs to, the `try`/`finally` bodies
// and all closing braces are missing.
class OnlyOnePkSuite extends BaseDataSourceTest("test_datasource_only_one_pk") {
  private val row3 = Row(3)
  private val row4 = Row(4)

  private val schema = StructType(List(StructField("i", IntegerType)))

  override def beforeAll(): Unit = {

    jdbcUpdate(s"create table $dbtable(i int primary key)")

  test("Test Write Append") {
    if (!supportBatchWrite) {

    val data: RDD[Row] = sc.makeRDD(List(row3, row4))
    val df = sqlContext.createDataFrame(data, schema)

      .option("database", database)
      .option("table", table)

    testTiDBSelect(Seq(row3, row4))

  override def afterAll(): Unit =
    try {
    } finally {
Example 107
Source File: WriteDDLConflictSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.concurrency

import com.pingcap.tikv.exception.TiBatchWriteException
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

// Concurrency tests for a DDL statement issued while a batch write is in progress:
// with the table lock the DDL is expected to fail with SQLException; with the table lock
// disabled the write is expected to fail with TiBatchWriteException.
// NOTE(review): the listing is truncated — guard bodies, the statements the
// `.option(...)` chains belong to, the start of the first assertion, the thread start
// and all closing braces are missing. `row1`..`row3`, `schema` and `sleepBeforeQuery`
// are not defined here (presumably inherited from ConcurrencyTest — confirm).
class WriteDDLConflictSuite extends ConcurrencyTest {
  test("write ddl conflict using TableLock") {
    if (!supportBatchWrite) {

    if (!isEnableTableLock) {

    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(4, 'null')")

    // Starts the batch write in the background with the table lock enabled.
    doBatchWriteInBackground(Map("useTableLock" -> "true"))


    // The ALTER TABLE is expected to be rejected while the table is locked.
    val caught = intercept[java.sql.SQLException] {
      jdbcUpdate(s"alter table $dbtable ADD Email varchar(255)")
        .startsWith("Table 'test_concurrency_write_read' was locked in WRITE LOCAL by server"))

  test("write ddl conflict using SchemaVersionCheck") {
    if (!supportBatchWrite) {

    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(4, 'null')")

    // Issues the DDL from a separate thread.
    new Thread(new Runnable {
      override def run(): Unit = {
        jdbcUpdate(s"alter table $dbtable ADD Email varchar(255)")

    val caught = intercept[TiBatchWriteException] {
      val data: RDD[Row] = sc.makeRDD(List(row1, row2, row3))
      val df = sqlContext.createDataFrame(data, schema)
        .option("database", database)
        .option("table", table)
        .option("sleepAfterPrewriteSecondaryKey", sleepBeforeQuery * 2)
        .option("useTableLock", "false")

    assert(caught.getMessage.equals("schema has changed during prewrite!"))
Example 108
Source File: WriteDDLNotConflictSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.concurrency

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

// Concurrency tests in which a DDL statement is run alongside a batch write that sets
// the "sleepAfterGetCommitTS" option; each test passes a different ALTER TABLE statement
// to `doTest`.
// NOTE(review): the listing is truncated — the guard body, the body of `run()`, the
// statement the `.option(...)` chain belongs to and all closing braces are missing.
// `row1`..`row3`, `schema` and `sleepBeforeQuery` are not defined here (presumably
// inherited from ConcurrencyTest — confirm).
class WriteDDLNotConflictSuite extends ConcurrencyTest {
  test("ddl after GetCommitTS: add column") {
    doTest(s"alter table $dbtable ADD Email varchar(255)")

  test("ddl after GetCommitTS: delete column") {
    doTest(s"alter table $dbtable drop column s")

  test("ddl after GetCommitTS: rename column") {
    doTest(s"alter table $dbtable CHANGE s s2 varchar(128)")

  test("ddl after GetCommitTS: change column type") {
    doTest(s"alter table $dbtable CHANGE i i BIGINT")

  // Shared scenario: create and seed the table, start a background thread, then write
  // three rows with the table lock disabled.
  // NOTE(review): `ddl` is not used in the visible lines (presumably executed inside
  // `run()` — confirm).
  private def doTest(ddl: String): Unit = {
    if (!supportBatchWrite) {

    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(4, 'null')")

    new Thread(new Runnable {
      override def run(): Unit = {

    val data: RDD[Row] = sc.makeRDD(List(row1, row2, row3))
    val df = sqlContext.createDataFrame(data, schema)
      .option("database", database)
      .option("table", table)
      .option("sleepAfterGetCommitTS", sleepBeforeQuery * 2)
      .option("useTableLock", "false")

Source File: WriteWriteConflictSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.concurrency

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

// Concurrency tests for a second write issued while a background batch write holds the
// table lock: once through JDBC and once through the Spark data source. Both are
// expected to fail with SQLException.
// NOTE(review): the listing is truncated — guard bodies, the statement the
// `.option(...)` chain belongs to, the starts of the assertions and all closing braces
// are missing. `row5` and `schema` are not defined here (presumably inherited from
// ConcurrencyTest — confirm).
class WriteWriteConflictSuite extends ConcurrencyTest {
  test("write write conflict using TableLock & jdbc") {
    if (!supportBatchWrite) {

    if (!isEnableTableLock) {

    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(4, 'null')")

    // Starts the batch write in the background with the table lock enabled.
    doBatchWriteInBackground(Map("useTableLock" -> "true"))


    // A JDBC insert is expected to be rejected while the table is locked.
    val caught = intercept[java.sql.SQLException] {
      jdbcUpdate(s"insert into $dbtable values(5, 'test')")
        .startsWith("Table 'test_concurrency_write_read' was locked in WRITE LOCAL by server"))

  test("write write conflict using TableLock & tispark") {
    if (!supportBatchWrite) {

    if (!isEnableTableLock) {

    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(4, 'null')")

    // Starts the batch write in the background with the table lock enabled.
    doBatchWriteInBackground(Map("useTableLock" -> "true"))


    // A second data source write with the table lock is expected to be rejected as well.
    val caught = intercept[java.sql.SQLException] {
      val data: RDD[Row] = sc.makeRDD(List(row5))
      val df = sqlContext.createDataFrame(data, schema)
        .option("database", database)
        .option("table", table)
        .option("useTableLock", "true")
        .startsWith("Table 'test_concurrency_write_read' was locked in WRITE LOCAL by server"))
Source File: LockTimeoutSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.ttl

import com.pingcap.tikv.TTLManager
import com.pingcap.tikv.exception.GrpcException
import com.pingcap.tispark.datasource.BaseDataSourceTest
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Test for lock TTL expiry: a write that sets the "sleepAfterPrewritePrimaryKey" option
// to more than TTLManager.MANAGED_LOCK_TTL is expected to fail with GrpcException while
// another thread queries the table.
// NOTE(review): the listing is truncated — the guard body, the rest of `run()`, the
// statement the `.option(...)` chain belongs to, the start of the last assertion, the
// `try`/`finally` bodies and all closing braces are missing.
class LockTimeoutSuite extends BaseDataSourceTest("test_lock_timeout") {
  private val row1 = Row(1, "Hello")

  private val schema = StructType(
    List(StructField("i", IntegerType), StructField("s", StringType)))

  override def beforeAll(): Unit = {
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")

  test("Test Lock TTL Timeout") {
    if (!supportTTLUpdate) {

    // NOTE(review): `seconds` is presumably the number of milliseconds per second, which
    // would make both sleeps exceed the managed lock TTL by 10 and 15 seconds — confirm
    // the unit of MANAGED_LOCK_TTL.
    val seconds = 1000
    val sleep1 = TTLManager.MANAGED_LOCK_TTL + 10 * seconds
    val sleep2 = TTLManager.MANAGED_LOCK_TTL + 15 * seconds

    val data: RDD[Row] = sc.makeRDD(List(row1))
    val df = sqlContext.createDataFrame(data, schema)

    // Queries the table from a separate thread.
    // NOTE(review): `sleep1` is not used in the visible lines (presumably a sleep before
    // the query was lost — confirm).
    new Thread(new Runnable {
      override def run(): Unit = {
        queryTiDBViaJDBC(s"select * from $dbtable")

    val grpcException = intercept[GrpcException] {
        .option("database", database)
        .option("table", table)
        .option("sleepAfterPrewritePrimaryKey", sleep2)

    assert(grpcException.getMessage.equals("retry is exhausted."))
    assert(grpcException.getCause.getMessage.startsWith("Txn commit primary key failed"))
        "Key exception occurred and the reason is retryable: \"Txn(Mvcc(TxnLockNotFound"))

  override def afterAll(): Unit =
    try {
    } finally {
Source File: EmployeeRelationship.scala    From spark-dev   with GNU General Public License v3.0 5 votes vote down vote up
package examples.graphx

import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.rdd.RDD
import org.apache.spark.graphx.{ Edge, Graph }

/**
 * GraphX example: builds a small employee graph and prints the results of a few
 * queries over it.
 *
 * Vertices are `(id, (name, designation))` and edges carry a relationship label.
 * Every query collects its result to the driver before printing, so the output
 * appears on the driver's stdout rather than in executor logs.
 */
object EmployeeRelationship {
	/**
	 * Entry point. Creates the SparkContext, runs the queries and always stops the
	 * context afterwards.
	 *
	 * @param args command-line arguments (unused)
	 */
	def main(args: Array[String]): Unit = {
		// vertex format: vertex_id, data
		val vertexArray = Array(
			(1L, ("John", "Software Developer")),
			(2L, ("Robert", "Technical Leader")),
			(3L, ("Charlie", "Software Architect")),
			(4L, ("David", "Software Developer")),
			(5L, ("Edward", "Software Development Manager")),
			(6L, ("Francesca", "Software Development Manager")))

		// edge format: from_vertex_id, to_vertex_id, data
		val edgeArray = Array(
			Edge(2L, 1L, "Technical Mentor"),
			Edge(2L, 4L, "Technical Mentor"),
			Edge(3L, 2L, "Collaborator"),
			Edge(6L, 3L, "Team Member"),
			Edge(4L, 1L, "Peers"),
			Edge(5L, 2L, "Team Member"),
			Edge(5L, 3L, "Team Member"),
			Edge(5L, 6L, "Peers"))

		val sc = new SparkContext(new SparkConf().setAppName("EmployeeRelationshipJob"))

		try {
			val vertexRDD: RDD[(Long, (String, String))] = sc.parallelize(vertexArray)

			val edgeRDD: RDD[Edge[String]] = sc.parallelize(edgeArray)

			val graph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD)

			// Vanilla query
			println(">>> Showing the names of people who are Software Developers")
			// collect() first: RDD.foreach would run println on the executors, so on a
			// cluster nothing would reach the driver console.
			graph.vertices.filter { case (_, (_, designation)) => designation == "Software Developer" }
				.collect()
				.foreach { case (_, (name, designation)) => println(s"... Name: $name, Designation: $designation") }

			// Connection analysis
			println(">>> People connected to Robert (Technical Leader) -> ")
			// Edges whose source is vertex 2; prints the destination vertex.
			graph.triplets.filter(_.srcId == 2).collect()
				.foreach { item => println("... " + item.dstAttr._1 + ", " + item.dstAttr._2) }

			println(">>> Robert (Technical Leader) connected to -> ")
			// Edges whose destination is vertex 2; prints the source vertex.
			graph.triplets.filter(_.dstId == 2).collect()
				.foreach { item => println("... " + item.srcAttr._1 + ", " + item.srcAttr._2) }

			println(">>> Technical Mentoring Analysis -> ")
			graph.triplets.filter(_.attr == "Technical Mentor").collect()
				.foreach { item => println("... " + item.srcAttr._1 + " mentoring " + item.dstAttr._1) }
		} finally {
			// Release cluster resources even if one of the queries fails.
			sc.stop()
		}
	}
}
Source File: PurchaseLogAnalysis.scala    From spark-dev   with GNU General Public License v3.0 5 votes vote down vote up
package examples

import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.rdd.RDD

/**
 * Scans a purchase log and reports how many lines fall into each problem category.
 *
 * Every line is counted in at most one of: blank line, line containing
 * "Bad data packet", line that does not split into exactly four tab-separated
 * fields, or line whose fourth field parses to zero. The tallies are kept in Spark
 * accumulators and printed on the driver once the scan has finished.
 */
object PurchaseLogAnalysis {
	/**
	 * Entry point. Creates the SparkContext, scans the log, prints the counters and
	 * always stops the context afterwards.
	 *
	 * @param args command-line arguments (unused)
	 */
	def main(args: Array[String]): Unit = {

		val ctx = new SparkContext(new SparkConf().setAppName("PurchaseAnalysisJob"))

		try {
			val badPkts = ctx.accumulator(0, "Bad Packets")
			val zeroValueSales = ctx.accumulator(0, "Zero Value Sales")
			val missingFields = ctx.accumulator(0, "Missing Fields")
			val blankLines = ctx.accumulator(0, "Blank Lines")

			ctx.textFile("file:/media/linux-1/spark-dev/data/purchases.log", 4)
				.foreach { line =>

					if (line.isEmpty) blankLines += 1
					else if (line.contains("Bad data packet")) badPkts += 1
					else {
						val fields = line.split("\t")

						if (fields.length != 4) missingFields += 1
						// NOTE(review): toFloat throws NumberFormatException when the fourth
						// field is not numeric; such a line currently fails the task.
						else if (fields(3).toFloat == 0) zeroValueSales += 1
					}
				}

			println("Purchase Log Analysis Counters:")
			println(s"\tBad Data Packets=${badPkts.value}")
			println(s"\tZero Value Sales=${zeroValueSales.value}")
			println(s"\tMissing Fields=${missingFields.value}")
			println(s"\tBlank Lines=${blankLines.value}")
		} finally {
			// Release cluster resources even if the scan fails.
			ctx.stop()
		}
	}
}
Source File: TestBroadcastVariables.scala    From spark-dev   with GNU General Public License v3.0 5 votes vote down vote up
package examples

import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

import scala.util.{ Try, Success, Failure }
import scala.collection.mutable.Map

	/**
	 * Loads a two-column CSV file of `country,capital` pairs into a mutable map.
	 *
	 * Each line is split on commas and both fields are trimmed. The file handle is
	 * always closed, whether or not reading succeeds.
	 *
	 * @param filename path of the CSV file to read
	 * @return `Some(map)` when every line was read and parsed; `None` when the file
	 *         cannot be read or a line does not split into exactly two fields
	 */
	def loadCSVFile(filename: String): Option[Map[String, String]] = {
		val countries = Map[String, String]()

		Try {
			// Fully qualified because no import of scala.io.Source is visible in this file.
			val bufferedSource = scala.io.Source.fromFile(filename)

			try {
				for (line <- bufferedSource.getLines()) {
					// A line with any other number of fields raises MatchError, which Try
					// turns into a Failure (and therefore None).
					val Array(country, capital) = line.split(",").map(_.trim)
					countries += country -> capital
				}
			} finally {
				// Release the file handle even when a line fails to parse.
				bufferedSource.close()
			}

			countries
		}.toOption
	}

Source File: TestAccumulators.scala    From spark-dev   with GNU General Public License v3.0 5 votes vote down vote up
package examples

import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.rdd.RDD

		// Classifies each line: every non-empty line bumps `totalLines`, and a line starting
		// with "error:", "info:" or "warn:" bumps the matching counter.
		// NOTE(review): the header of the enclosing method and the definitions of
		// `totalLines`, `errorLines`, `infoLines` and `warnLines` are missing from this
		// listing (presumably Spark accumulators, given the `.value` reads below — confirm);
		// the closing brace of the lambda is missing as well.
		rdd.foreach { line =>
			if (line.length() > 0) totalLines += 1
			if (line.startsWith("error:")) errorLines += 1
			else if (line.startsWith("info:")) infoLines += 1
			else if (line.startsWith("warn:")) warnLines += 1

		println(s">>> [Using Accumulators] Total: ${totalLines.value}, Error: ${errorLines.value}, Warnings: ${warnLines.value}, Info: ${infoLines.value}")

	/**
	 * Counts the lines starting with "error:", "info:" and "warn:" using one
	 * filter-and-count job per prefix, then prints the three totals.
	 *
	 * @param sc  Spark context; not used by this method
	 * @param rdd log lines to classify
	 */
	def usingRDDTransformations(sc: SparkContext, rdd: RDD[String]): Unit = {
		// One Spark job per call: keeps the lines with the given prefix and counts them.
		def countWithPrefix(prefix: String): Long = rdd.filter(_.startsWith(prefix)).count()

		val errorLines = countWithPrefix("error:")
		val infoLines = countWithPrefix("info:")
		val warnLines = countWithPrefix("warn:")

		println(s">>> [Using RDD Transformations] Error: $errorLines, Warnings: $warnLines, Info: $infoLines")
	}
Source File: TestJoins.scala    From spark-dev   with GNU General Public License v3.0 5 votes vote down vote up
package examples

import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner }
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import scala.Iterator

object TestJoins {
	/**
	 * Builds two small pair RDDs and defines their inner, left-outer and
	 * right-outer joins.
	 *
	 * @param args command-line arguments (not read in the lines visible here)
	 */
	def main(args: Array[String]): Unit = {
		val sc = new SparkContext(new SparkConf().setAppName("TestJoinJob"))

		// Both inputs are partitioned with a HashPartitioner(2) and marked for caching.
		val x = sc.parallelize(List((1, 2), (1, 3), (2, 3), (2, 4))).partitionBy(new HashPartitioner(2)).cache
		val y = sc.parallelize(List((2, 5), (2, 6))).partitionBy(new HashPartitioner(2)).cache


		// The joins below are transformations: they only define new RDDs.
		// NOTE(review): no action on joinRDD/leftJoin/rightJoin is visible in this excerpt.
		println(">>> joining x with y")
		val joinRDD = x.join(y).cache

		println(">>> left outer join of x with y")
		val leftJoin = x.leftOuterJoin(y).cache

		println(">>> right outer join of x with y")
		val rightJoin = x.rightOuterJoin(y).cache
	/**
	 * Prints the number of elements in each partition of `rdd`, then every element.
	 *
	 * @param rdd the RDD to inspect
	 * @tparam T element type; elements are printed with `println`
	 */
	def inspectRDD[T](rdd: RDD[T]): Unit = {
		println(">>> Partition length...")
		// One Int per partition: the size of that partition's iterator.
		// The second argument (`true`) is mapPartitions' preservesPartitioning flag.
		rdd.mapPartitions(f => Iterator(f.length), true).foreach(println)
		println(">>> Partition data...")
		rdd.foreachPartition(f => f.foreach(println))
Source File: RedisSourceRdd.scala    From spark-redis   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up

import com.redislabs.provider.redis.RedisConfig
import com.redislabs.provider.redis.util.ConnectionUtils.withConnection
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, SparkContext, TaskContext}

class RedisSourceRdd(sc: SparkContext, redisConfig: RedisConfig,
                     offsetRanges: Seq[RedisSourceOffsetRange], autoAck: Boolean = true)
  extends RDD[StreamEntry](sc, Nil) {

  override def compute(split: Partition, context: TaskContext): Iterator[StreamEntry] = {
    val partition = split.asInstanceOf[RedisSourceRddPartition]
    val offsetRange = partition.offsetRange
    val streamReader = new RedisStreamReader(redisConfig)

  /**
   * Creates one partition per entry of `offsetRanges`.
   *
   * Each offset range is paired with its position in the sequence, and that
   * position becomes the partition index.
   *
   * @return the partitions of this RDD, in the order of `offsetRanges`
   */
  override protected def getPartitions: Array[Partition] =
    offsetRanges.zipWithIndex
      .map { case (e, i) => RedisSourceRddPartition(i, e) }
      .toArray[Partition]

/**
 * Partition of a [[RedisSourceRdd]], carrying the offset range it covers.
 *
 * @param index       partition index
 * @param offsetRange offset range associated with this partition
 */
case class RedisSourceRddPartition(index: Int, offsetRange: RedisSourceOffsetRange)
  extends Partition 
Source File: ManyValueBenchmarkSuite.scala    From spark-redis   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.redislabs.provider.redis.df.benchmark

import com.redislabs.provider.redis.env.RedisClusterEnv
import com.redislabs.provider.redis.util.Person
import org.apache.spark.rdd.RDD

trait ManyValueBenchmarkSuite extends DataframeBenchmarkSuite with RedisClusterEnv {

  private def num = 1000000

  override def suiteTags: String = s"${super.suiteTags}, Many:$num"

  /**
   * Generates the benchmark input: `Person` records named "John-<i>", produced
   * in `partitionsNum` sections of `num / partitionsNum` ids each.
   *
   * NOTE(review): the receivers of `.parallelize` and of the first `.flatMap`
   * are not visible in this excerpt.
   *
   * @return an RDD of generated `Person` records
   */
  override def rdd(): RDD[Person] = {
    val partitionsNum = 8
    val sectionLength = num / partitionsNum
      .parallelize(0 until partitionsNum, partitionsNum)
      .mapPartitions {
          .flatMap { i =>
            val start = i * sectionLength
            // NOTE(review): Stream.range excludes `end`, so each section yields
            // sectionLength + 1 ids and its last id equals the next section's
            // first id — confirm the overlap is intended.
            val end = start + sectionLength + 1
            Stream.range(start, end)
          .map { i =>
            Person(s"John-$i", 30, "60 Wall Street", 150.5)
Source File: Dijkstra.scala    From graphx-algorithm   with GNU General Public License v2.0 5 votes vote down vote up
package org.apache.spark.graphx.iiot.shortestpath

import org.apache.spark.graphx.GraphLoaderPlus
import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

    if (args.length < 2) sys.error("Usage: inputFileName sourceId [outputFileDirectory]")

    val inputFile = args(0)
    val sourceId: VertexId = args(1).toInt

    val sc = new SparkContext(new SparkConf().setAppName("Dijkstra Algorithm"))

    val graph = GraphLoaderPlus.edgeListFile(sc, inputFile)

    // `mapEdges` sometimes may be needed such as
    // `g.mapEdges(e => (new scala.util.Random).nextInt(100))`
    val g = graph.mapVertices((id, _) =>
      if (id == sourceId) Array(0.0, id)
      else Array(Double.PositiveInfinity, id)

    // Runs Pregel over `g`. The initial message is Array(+Infinity, -1), and the
    // functions compare element 0 of each array.
    // The three functions are passed in the order vertex program, send message, merge messages.
    val sssp = g.pregel(Array(Double.PositiveInfinity, -1))(
      // Vertex program: keep whichever of the current value and the incoming
      // message has the smaller element 0 (the message wins ties).
      (id, dist, newDist) => {
        if (dist(0) < newDist(0)) dist
        else newDist
      // Send message: when going through the source vertex gives a smaller
      // element 0 than the destination currently has, send the destination
      // Array(new element 0, source vertex id).
      triplet => {
        if (triplet.srcAttr(0) + triplet.attr < triplet.dstAttr(0)) {
          Iterator((triplet.dstId, Array(triplet.srcAttr(0) + triplet.attr, triplet.srcId)))
        // NOTE(review): the body of this else branch is not visible in this excerpt.
        else {
      // Merge messages: keep the message with the smaller element 0 (`b` wins ties).
      (a, b) => {
        if (a(0) < b(0)) a
        else b

    // Render every vertex of the result graph as one line of text: element 0 of
    // its attribute is reported as the distance, element 1 as the previous vertex.
    val format_sssp: RDD[String] = sssp.vertices.map(vertex =>
      "Vertex " + vertex._1 + ": distance is " + vertex._2(0) + ", previous node is Vertex " + vertex._2(1).toInt)

    if (args.length > 2) {
      val outputFileDir = args(2)
Source File: ReplicatedVertexView.scala    From graphx-algorithm   with GNU General Public License v2.0 5 votes vote down vote up
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD

import org.apache.spark.graphx._

  def updateVertices(updates: VertexRDD[VD]): ReplicatedVertexView[VD, ED] = {
    val shippedVerts = updates.shipVertexAttributes(hasSrcId, hasDstId)
      .setName("ReplicatedVertexView.updateVertices - shippedVerts %s %s (broadcast)".format(
        hasSrcId, hasDstId))

    val newEdges = edges.withPartitionsRDD(edges.partitionsRDD.zipPartitions(shippedVerts) {
      (ePartIter, shippedVertsIter) => {
        case (pid, edgePartition) =>
          (pid, edgePartition.updateVertices(shippedVertsIter.flatMap(_._2.iterator)))
    new ReplicatedVertexView(newEdges, hasSrcId, hasDstId)
Example 120
Source File: EdgeRDDImpl.scala    From graphx-algorithm   with GNU General Public License v2.0 5 votes vote down vote up
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.{OneToOneDependency, HashPartitioner, TaskContext}
import org.apache.spark.rdd.RDD

import org.apache.spark.graphx._

class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] (
    @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])],
    val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
  extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {

  override def setName(_name: String): this.type = {
    if ( != null) {
      partitionsRDD.setName( + ", " + _name)
    } else {

  override def count(): Long = { + _)

  override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] =
    mapEdgePartitions((pid, part) =>

  override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse)

  def filter(
      epred: EdgeTriplet[VD, ED] => Boolean,
    mapEdgePartitions((pid, part) => part.filter(epred, vpred))

  override def innerJoin[ED2: ClassTag, ED3: ClassTag]
      (other: EdgeRDD[ED2])
      (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = {
    val ed2Tag = classTag[ED2]
    val ed3Tag = classTag[ED3]
    this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) {
      (thisIter, otherIter) =>
        val (pid, thisEPart) =
        val (_, otherEPart) =
        Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag)))

  def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag](
      f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = {
    this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter =>
      if (iter.hasNext) {
        val (pid, ep) =
        Iterator(Tuple2(pid, f(pid, ep)))
      } else {
    }, preservesPartitioning = true))

  private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag](
      partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = {
    new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel)

  override private[graphx] def withTargetStorageLevel(
      targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = {
    new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel)

Source File: RoutingTablePartition.scala    From graphx-algorithm   with GNU General Public License v2.0 5 votes vote down vote up
package org.apache.spark.graphx.impl

import scala.reflect.ClassTag

import org.apache.spark.Partitioner
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.util.collection.{BitSet, PrimitiveVector}

import org.apache.spark.graphx._
import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap

import org.apache.spark.graphx.impl.RoutingTablePartition.RoutingTableMessage

object RoutingTablePartition {
  def foreachWithinEdgePartition
      (pid: PartitionID, includeSrc: Boolean, includeDst: Boolean)
      (f: VertexId => Unit) {
    val (vidsCandidate, srcVids, dstVids) = routingTable(pid)
    val size = vidsCandidate.length
    if (includeSrc && includeDst) {
      // Avoid checks for performance
    } else if (!includeSrc && !includeDst) {
      // Do nothing
    } else {
      val relevantVids = if (includeSrc) srcVids else dstVids
      relevantVids.iterator.foreach { i => f(vidsCandidate(i)) }
Source File: SparkBatchAdapter.scala    From eventuate   with Apache License 2.0 5 votes vote down vote up
package com.rbmhtechnology.eventuate.adapter.spark

import akka.serialization.SerializationExtension

import com.datastax.spark.connector._
import com.datastax.spark.connector.types._
import com.rbmhtechnology.eventuate.DurableEvent
import com.rbmhtechnology.eventuate.log.cassandra.CassandraEventLogSettings
import com.typesafe.config._

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

  /**
   * Reads events from the Cassandra table of the given log as an RDD.
   *
   * The table name is `<tablePrefix>_<logId>` in the configured keyspace. Only the
   * `event` column is selected, and rows are restricted to
   * `sequence_nr >= fromSequenceNr`.
   *
   * @param logId          id of the event log; becomes the table-name suffix
   * @param fromSequenceNr lowest sequence number to include (defaults to 1)
   * @return the selected events
   */
  def eventBatch(logId: String, fromSequenceNr: Long = 1L): RDD[DurableEvent] = {
    context.cassandraTable(cassandraSettings.keyspace, s"${cassandraSettings.tablePrefix}_$logId")
      .select("event").where(s"sequence_nr >= $fromSequenceNr").as((event: DurableEvent) => event)

private class DurableEventConverter(config: Config) extends TypeConverter[DurableEvent] {
  import scala.reflect.runtime.universe._

  val converter = implicitly[TypeConverter[Array[Byte]]]

  // --------------------------------------
  //  FIXME: how to shutdown actor system?
  // --------------------------------------

  @transient lazy val system = ActorSystem("TypeConverter", config)
  @transient lazy val serial = SerializationExtension(system)

  def targetTypeTag = implicitly[TypeTag[DurableEvent]]
  def convertPF = {
    case obj => deserialize(converter.convert(obj))

  def deserialize(bytes: Array[Byte]): DurableEvent =
    serial.deserialize(bytes, classOf[DurableEvent]).get
Source File: GenerateVerticesExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch08

// scalastyle:off println
import org.apache.log4j.{Level, Logger}

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD

object GenerateVerticesExample {

  /**
   * Entry point: expects two integer arguments, the number of products and
   * the number of users.
   *
   * @param args `args(0)` = number of products, `args(1)` = number of users
   */
  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      // NOTE(review): the exception is constructed but not thrown in the lines
      // visible here — confirm how invalid arguments are actually rejected.
      new IllegalArgumentException("Invalid arguments")
    // Set the log level to WARN
    // NOTE(review): the statement that sets the level is not visible in this excerpt.

    // Create the SparkContext
    val conf = new SparkConf().setAppName("GenerateVerticesExample")
    val sc = new SparkContext(conf)

    // Read the settings from the arguments; toInt throws NumberFormatException
    // on non-numeric input.
    val (numProducts, numUsers): (Int, Int) = (args(0).toInt, args(1).toInt)
    implicit val recOpts: RecommendLogOptions = RecommendLogOptions(numProducts, numUsers)


  /**
   * Generates the product and user vertex lists as RDDs and prints the first
   * 20 entries of each.
   *
   * @param sc      Spark context used to parallelize the generated lists
   * @param recOpts generation options, supplied implicitly
   */
  def run(sc: SparkContext)
         (implicit recOpts: RecommendLogOptions)
  : Unit = {

    // Build RDDs from the generated product list and user list
    val products: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genProductList)
    val users: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genUserList)

    // Print 20 entries of the product list
    // NOTE(review): the `${}` interpolations after "id:" and "name:" are empty
    // in this excerpt — presumably they referenced fields of `x`; confirm.
    println("get top 20 products:")
    products.take(20).foreach(x => println(s"id: ${},\ttype: ${x.kind},\tname: ${}"))

    // Print 20 entries of the user list
    println("get top 20 users:")
    users.take(20).foreach(x => println(s"id: ${},\ttype: ${x.kind},\tname: ${}"))

Example 124
Source File: gihyo_6_3_Transform.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Transform {
  /**
   * Entry point: expects a host and a port, opens a socket text stream on
   * them with a 5-second batch interval, and passes it to `run` together with
   * a fixed black list.
   *
   * @param args `args(0)` = target host, `args(1)` = target port
   */
  def main(args: Array[String]) {
    if (args.length != 2) {
      // NOTE(review): the exception is constructed but not thrown in the lines
      // visible here — confirm how invalid arguments are actually rejected.
      new IllegalArgumentException("Invalid arguments")
    val targetHost = args(0)
    // toInt throws NumberFormatException on a non-numeric port.
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    // Pair RDD whose first tuple element is a user id; `run` joins against it.
    val blackList = sc.parallelize(Array(("user002", "rockLogin"), ("user003", "rockPayment")))
    run(lines, blackList)


  def run(stream: InputDStream[String], blackList: RDD[(String, String)]) {
    val userList = => (x, "action:Login")).transform(rdd => {
      val tmpUserList = rdd.leftOuterJoin(blackList)
      tmpUserList.filter(user => (user._2._2 == None))
Example 125
Source File: gihyo_6_3_JoinSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}
import scala.collection.mutable
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_JoinSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines1 = mutable.Queue[RDD[String]]()
    val ds1 = ssc.queueStream(lines1)
    val lines2 = mutable.Queue[RDD[String]]()
    val ds2 = ssc.queueStream(lines2)
    val clock = new StreamingContextWrapper(ssc).manualClock, ds2)

    lines1 += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
    lines2 += sc.makeRDD(Seq("key2", "key3", "key4")) // test data
Example 126
package jp.gihyo.spark.ch06

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}
import scala.collection.mutable
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper
import java.nio.file.Files

class gihyo_6_3_CountByValueAndWindowSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString
    ssc.checkpoint(checkpointDir), 2, 1)
    (1 to 3).foreach { case i =>
      lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
Example 127
Source File: gihyo_6_3_MapSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_MapSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    lines += sc.makeRDD(Seq("key1", "key2", "key3", "key1")) // test data
Example 128
Source File: gihyo_6_3_TwitterStreamSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
import java.nio.file.Files

import scala.collection.mutable

import twitter4j.{Status, TwitterObjectFactory}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

class gihyo_6_3_TwitterStreamSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[Status]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString

    (1 to 2).foreach { case i =>
      // test data
      lines += sc.makeRDD(Seq(

object MockTweetGenerator {
  // Creates a tweet status from a JSON file
  def createMockStatusFromJson(): Status = {
    val jsonFile = getClass.getResource("/streaming/test-tweet.json").getPath
Example 129
Source File: gihyo_6_3_FilterSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_FilterSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    lines += sc.makeRDD(Seq("lengthOver5", "les1", "les2")) // test data
Source File: gihyo_6_3_FlatMapSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_FlatMapSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    // test data
    lines += sc.makeRDD(Seq("Apache Spark is a fast and general-purpose cluster computing system."))
Source File: gihyo_6_3_CountSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_CountSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock, 2, 1)
    (1 to 2).foreach { case i =>
      lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
Source File: gihyo_6_3_UnionSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_UnionSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = (1 to 3).map(x => mutable.Queue[RDD[(String, String)]]())
    val dss = => ssc.queueStream(x))
    val clock = new StreamingContextWrapper(ssc).manualClock, dss)
    ssc.start() => x += sc.makeRDD(Seq(("", "key1"), ("", "key2"), ("", "key3")))) //test data
Source File: gihyo_6_3_ReduceByKeyAndWindowSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_ReduceByKeyAndWindowSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock, 2, 1)
    (1 to 3).foreach { case i =>
      lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
Source File: gihyo_6_3_ReduceByKeySuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_ReduceByKeySuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    lines += sc.makeRDD(Seq("key1", "key2", "key3", "key1")) // test data
Source File: gihyo_6_3_CountByWindowSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import java.nio.file.Files

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_CountByWindowSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString
    ssc.checkpoint(checkpointDir), 2, 1)
    (1 to 3).foreach { case i =>
      lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
Source File: gihyo_6_3_UpdateStateByKeySuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import java.nio.file.Files

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_UpdateStateByKeySuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString
    lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
Source File: gihyo_6_3_RepartitionSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_RepartitionSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
package jp.gihyo.spark.ch06

import java.nio.file.Files

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_ReduceByKeyAndWindowEfficientSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock, 2, 1)
    val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString
    (1 to 2).foreach { case i =>
      lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
Source File: gihyo_6_3_KafkaStreamSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable
import java.nio.file.Files

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_KafkaStreamSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[(String, String)]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock, Files.createTempDirectory("KafkaStreamSuite").toString, 2, 1)
    val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString
    (1 to 2).foreach { case i =>
      lines += sc.makeRDD(Seq(("", "userid:userid001,action:view,pageid:value1"),
        ("", "userid:userid002,action:click,pageid:value2"),
        ("", "userid:userid003,action:view,pageid:value3"),
        ("", "userid:userid001,action:view,pageid:value4"))) // test data
Source File: gihyo_6_3_WiindowSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}
import scala.collection.mutable
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_WindowSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock, 2, 1)
    (1 to 3).foreach {
      case i => {
        lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
Source File: gihyo_6_3_CogroupSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_CogroupSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val lines2 = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val ds2 = ssc.queueStream(lines2)
    val clock = new StreamingContextWrapper(ssc).manualClock
    lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
    lines2 += sc.makeRDD(Seq("key2", "key3", "key4")) // test data, ds2)
Source File: gihyo_6_2_1_SampleSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_2_1_SampleSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    lines += sc.makeRDD(Seq("word1 word2", "word3 word1", "word4 word2")) // test data
Source File: gihyo_6_3_TransformSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_TransformSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    val blackList = sc.parallelize(Array(("user002", "rockLogin"), ("user003", "rockPayment"))), blackList)
    lines += sc.makeRDD(Seq("user001", "user002", "user003")) // test data
Source File: gihyo_6_3_CountByValueSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_CountByValueSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    lines += sc.makeRDD(Seq("key1", "key2", "key3", "key1")) // test data
Source File: gihyo_6_3_ReduceSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_ReduceSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    lines += sc.makeRDD(Seq("gi", "jutsu", "hyoron", "sha")) // test data
Source File: gihyo_6_3_ReduceByWindowSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_ReduceByWindowSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock, 2, 1)
    (1 to 2).foreach {
      case i => {
        lines += sc.makeRDD(Seq("gi", "jutsu", "hyoron", "sha")) // test data
Source File: FileReader.scala    From bdd-spark   with MIT License 5 votes vote down vote up
import org.apache.spark.rdd.RDD

trait FileReader {
  def readLinesToRdd(filename : String) : RDD[String]
  def readText(filename : String) : String

object FileReader {
  class RealFileReader extends FileReader{
    override def readLinesToRdd(filename: String): RDD[String] = {

    override def readText(filename: String): String = {

  def apply() : FileReader = new RealFileReader
Source File: RecommendationModelReuse.scala    From Scala-Machine-Learning-Projects   with MIT License 5 votes vote down vote up
package com.packt.ScalaML.MovieRecommendation

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import scala.Tuple2
import org.apache.spark.rdd.RDD

object RecommendationModelReuse {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .config("spark.sql.warehouse.dir", "E:/Exp/").

    val ratigsFile = "data/ratings.csv"
    val ratingDF ="com.databricks.spark.csv").option("header", true).load(ratigsFile)
    val selectedRatingsDF ="userId"), ratingDF.col("movieId"), ratingDF.col("rating"), ratingDF.col("timestamp"))

    // Randomly split ratings RDD into training data RDD (75%) and test data RDD (25%)
    val splits = selectedRatingsDF.randomSplit(Array(0.75, 0.25), seed = 12345L)
    val testData = splits(1)

    val testRDD = => {
      val userId = row.getString(0)
      val movieId = row.getString(1)
      val ratings = row.getString(2)
      Rating(userId.toInt, movieId.toInt, ratings.toDouble)

    //Load the workflow back
    val same_model = MatrixFactorizationModel.load(spark.sparkContext, "model/MovieRecomModel/")

    // Making Predictions. Get the top 10 movie predictions for user 458
    println("Rating:(UserID, MovieID, Rating)")
    val topRecsForUser = same_model.recommendProducts(458, 10)
    for (rating <- topRecsForUser) {

    val rmseTest = MovieRecommendation.computeRmse(same_model, testRDD, true)
    println("Test RMSE: = " + rmseTest) //Less is better

    //Movie recommendation for a specific user. Get the top 10 movie predictions for user 458
    println("Recommendations: (MovieId => Rating)")
    val recommendationsUser = same_model.recommendProducts(458, 10) => (rating.product, rating.rating)).foreach(println)

Source File: MovieRecommendation.scala    From Scala-Machine-Learning-Projects   with MIT License 5 votes vote down vote up
package com.packt.ScalaML.MovieRecommendation

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SQLImplicits
import org.apache.spark.sql._
import org.apache.spark.sql.Dataset
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import scala.Tuple2
import org.apache.spark.rdd.RDD

/**
 * Movie-rating example: loads the ratings and movies CSV files, evaluates a
 * recommendation model on a test set and prints recommendations for one user.
 *
 * NOTE(review): this excerpt is incomplete -- several call receivers, the whole
 * model-training section of `main` (where `model` and `testRDD` would be
 * defined) and all closing braces are missing from the visible text.
 */
object MovieRecommendation {  
  // Compute the RMSE to evaluate the model. The lower the RMSE, the better the
  // model and its prediction capability.
  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], implicitPrefs: Boolean): Double = {
    // Predict a rating for (user, product) pairs.
    val predictions: RDD[Rating] = model.predict( => (x.user, x.product)))
    // Key both sides by (user, product) and join, keeping only the paired values.
    val predictionsAndRatings = { x => ((x.user, x.product), x.rating)
    }.join( => ((x.user, x.product), x.rating))).values
    if (implicitPrefs) {
      println("(Prediction, Rating)")
    // Square root of the mean squared difference between the paired values.
    math.sqrt( => (x._1 - x._2) * (x._1 - x._2)).mean())

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .config("spark.sql.warehouse.dir", "E:/Exp/").

    val ratigsFile = "data/ratings.csv"
    val df1 ="com.databricks.spark.csv").option("header", true).load(ratigsFile)

    val ratingsDF ="userId"), df1.col("movieId"), df1.col("rating"), df1.col("timestamp"))

    val moviesFile = "data/movies.csv"
    val df2 ="com.databricks.spark.csv").option("header", "true").load(moviesFile)

    val moviesDF ="movieId"), df2.col("title"), df2.col("genres"))



    var rmseTest = computeRmse(model, testRDD, true)
    println("Test RMSE: = " + rmseTest) // lower is better

    // Movie recommendation for a specific user: the top 6 recommendations for
    // user 668, printed as (product, rating) pairs
    println("Recommendations: (MovieId => Rating)")
    val recommendationsUser = model.recommendProducts(668, 6) => (rating.product, rating.rating)).foreach(println)

Source File: HbRddWriter.scala    From hbrdd   with Apache License 2.0 5 votes vote down vote up
package top.spoofer.hbrdd.hbsupport

import org.apache.hadoop.hbase.client.Put
import org.apache.spark.rdd.RDD
import top.spoofer.hbrdd.config.HbRddConfig
import top.spoofer.hbrdd.unit.HbRddFormatsWriter
import top.spoofer.hbrdd._
import HbRddWritPuter._

/**
 * Shared definitions for the RDD-to-HBase writers.
 *
 * NOTE(review): the rest of the trait body and its closing brace are missing
 * from this excerpt.
 */
trait HbRddWriter {
  // A value paired with its timestamp: (ts, A)
  type TsValue[A] = (Long, A) // (ts, A)
  // Sentinel timestamp, defined as Long.MaxValue
  val LATEST_TIMESTAMP = Long.MaxValue
/**
 * Writer for an RDD of (row id, column -> value) pairs that all belong to a
 * single column family.
 *
 * NOTE(review): the tail of `put2Hbase` (whatever consumes `job` and the
 * flat-mapped RDD) and the closing braces are missing from this excerpt.
 */
final class SingleFamilyRDDWriter[A](
    val rdd: RDD[(String, Map[String, A])],
    val put: HbRddPuter[A]
) extends HbRddWritCommon[A] with Serializable {
  // Wraps each row's data under the given family name before converting it
  // with `convert2Writable`.
  def put2Hbase(tableName: String, family: String)(implicit config: HbRddConfig) = {
    val job = createJob(tableName, config.getHbaseConfig)
    rdd.flatMap({ case (rowId, data) => convert2Writable(rowId, Map(family -> data), put) })
Source File: XmlReader.scala    From spark-xml   with Apache License 2.0 5 votes vote down vote up
package com.databricks.spark.xml

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SQLContext, SparkSession}
import org.apache.spark.sql.types.StructType
import com.databricks.spark.xml.util.XmlFile
import com.databricks.spark.xml.util.FailFastMode

  // Deprecated SQLContext-based entry point that builds an XmlRelation over the
  // XML file at `path`.
  // NOTE(review): `parameters` is defined outside this excerpt, and the closing
  // braces plus the remaining XmlRelation arguments are missing here.
  @deprecated("Use xmlFile(SparkSession, ...)", "0.5.0")
  def xmlFile(sqlContext: SQLContext, path: String): DataFrame = {
    // We need the `charset` and `rowTag` before creating the relation.
    val (charset, rowTag) = {
      val options = XmlOptions(parameters.toMap)
      (options.charset, options.rowTag)
    // The RDD is supplied as a thunk, so the file read is deferred until the
    // relation calls it.
    val relation = XmlRelation(
      () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag),

  // Deprecated SQLContext-based entry point that builds an XmlRelation over an
  // already-loaded RDD of XML strings.
  // NOTE(review): the remaining XmlRelation arguments and the closing braces
  // are missing from this excerpt.
  @deprecated("Use xmlRdd(SparkSession, ...)", "0.5.0")
  def xmlRdd(sqlContext: SQLContext, xmlRDD: RDD[String]): DataFrame = {
    val relation = XmlRelation(
      () => xmlRDD,

Source File: XmlFile.scala    From spark-xml   with Apache License 2.0 5 votes vote down vote up
package com.databricks.spark.xml.util

import java.nio.charset.Charset

import scala.collection.Map

import com.databricks.spark.xml.parsers.StaxXmlGenerator
import com.sun.xml.txw2.output.IndentingXMLStreamWriter
import{Text, LongWritable}

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.sql.DataFrame
import com.databricks.spark.xml.{XmlOptions, XmlInputFormat}

/**
 * Helpers for reading XML files into an RDD of row strings and for writing a
 * DataFrame out as XML text.
 *
 * NOTE(review): this excerpt is incomplete -- the Hadoop file-read call in
 * `withCharset`, the row-to-XML generation inside `next`, and all closing
 * braces are missing from the visible text.
 */
private[xml] object XmlFile {
  val DEFAULT_INDENT = "    "

  def withCharset(
      context: SparkContext,
      location: String,
      charset: String,
      rowTag: String): RDD[String] = {
    // This just checks the charset's validity early, to keep behavior
    // NOTE(review): the sentence above is cut off and the check it describes
    // is not present in this excerpt.
    // Configure the input format with the row start/end tags and the encoding.
    context.hadoopConfiguration.set(XmlInputFormat.START_TAG_KEY, s"<$rowTag>")
    context.hadoopConfiguration.set(XmlInputFormat.END_TAG_KEY, s"</$rowTag>")
    context.hadoopConfiguration.set(XmlInputFormat.ENCODING_KEY, charset)
    // Decode only the first `getLength` bytes of each Text value using `charset`.
      classOf[Text]).map { case (_, text) => new String(text.getBytes, 0, text.getLength, charset) }

  def saveAsXmlFile(
      dataFrame: DataFrame,
      path: String,
      parameters: Map[String, String] = Map()): Unit = {
    val options = XmlOptions(parameters.toMap)
    val codecClass = CompressionCodecs.getCodecClass(options.codec)
    val rowSchema = dataFrame.schema
    val indent = XmlFile.DEFAULT_INDENT

    // One XML writer stack is created per partition and shared by its rows.
    val xmlRDD = dataFrame.rdd.mapPartitions { iter =>
      val factory = XMLOutputFactory.newInstance()
      val writer = new CharArrayWriter()
      val xmlWriter = factory.createXMLStreamWriter(writer)
      val indentingXmlWriter = new IndentingXMLStreamWriter(xmlWriter)

      new Iterator[String] {
        // Flags that are cleared once the first / last element has been emitted.
        var firstRow: Boolean = true
        var lastRow: Boolean = true

        // Stays true after `iter` is exhausted until both flags are cleared, so
        // `next` is called at least once more after the last row.
        override def hasNext: Boolean = iter.hasNext || firstRow || lastRow

        override def next: String = {
          if (iter.nonEmpty) {
            if (firstRow) {
              firstRow = false
            val xml = {
          } else {
            if (!firstRow) {
              lastRow = false
            } else {
              // This means the iterator was initially empty.
              firstRow = false
              lastRow = false

    // A null codec class means "no compression": write plain text files.
    codecClass match {
      case null => xmlRDD.saveAsTextFile(path)
      case codec => xmlRDD.saveAsTextFile(path, codec)
Source File: XmlRelation.scala    From spark-xml   with Apache License 2.0 5 votes vote down vote up
package com.databricks.spark.xml


import org.apache.hadoop.fs.Path

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.sources.{PrunedScan, InsertableRelation, BaseRelation, TableScan}
import org.apache.spark.sql.types._
import com.databricks.spark.xml.util.{InferSchema, XmlFile}
import com.databricks.spark.xml.parsers.StaxXmlParser

/**
 * Spark SQL relation backed by an RDD of XML row strings, supporting pruned
 * scans and INSERT OVERWRITE.
 *
 * NOTE(review): this excerpt is incomplete -- the schema-inference fallback,
 * most of `buildScan`, and all closing braces are missing from the visible
 * text; `IOException` is used but its import is not visible here.
 */
case class XmlRelation protected[spark] (
    baseRDD: () => RDD[String],
    location: Option[String],
    parameters: Map[String, String],
    userSchema: StructType = null)(@transient val sqlContext: SQLContext)
  extends BaseRelation
  with InsertableRelation
  with PrunedScan {

  private val options = XmlOptions(parameters)

  // A user-supplied schema takes precedence; `userSchema` may be null, hence
  // the Option wrapper.
  override val schema: StructType = {
    Option(userSchema).getOrElse {

  override def buildScan(requiredColumns: Array[String]): RDD[Row] = {
    val requiredFields =
    val requestedSchema = StructType(requiredFields)

  // The function below was borrowed from JSONRelation
  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    // Inserting requires a target path; relations without one cannot be written to.
    val filesystemPath = location match {
      case Some(p) => new Path(p)
      case None =>
        throw new IOException(s"Cannot INSERT into table with no path defined")

    val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)

    if (overwrite) {
      // Recursively delete the existing output before writing; a failure is
      // rethrown with the target path in the message.
      try {
        fs.delete(filesystemPath, true)
      } catch {
        case e: IOException =>
          throw new IOException(
            s"Unable to clear output directory ${filesystemPath.toString} prior"
              + s" to INSERT OVERWRITE a XML table:\n${e.toString}")
      // Write the data. We assume that schema isn't changed, and we won't update it.
      XmlFile.saveAsXmlFile(data, filesystemPath.toString, parameters)
    } else {
      throw new IllegalArgumentException("XML tables only support INSERT OVERWRITE for now.")
Source File: SparkSuite.scala    From spark-sorted   with Apache License 2.0 5 votes vote down vote up
package com.tresata.spark.sorted

import org.scalactic.Equality
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{ Dataset, SparkSession }

/**
 * Lazily created Spark handles shared by the test suites.
 *
 * NOTE(review): the end of the builder chain (whatever creates and returns the
 * session), the closing braces, and the import for `JavaSparkContext` are
 * missing from this excerpt.
 */
object SparkSuite {
  // Session configured with Kryo serialization, the UI disabled and 4 shuffle
  // partitions.
  lazy val spark: SparkSession = {
    val session = SparkSession.builder
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.ui.enabled", false)
      .config("spark.sql.shuffle.partitions", 4)
  lazy val sc: SparkContext = spark.sparkContext

  // Java wrapper around the same context, exposed through a method.
  lazy val jsc = new JavaSparkContext(sc)
  def javaSparkContext() = jsc

/**
 * Test mixin exposing the shared Spark session and context, plus `Equality`
 * instances that compare distributed collections by content.
 *
 * All comparisons collect the data to the driver and compare element counts
 * (multiset equality), so element order and partitioning are ignored.
 */
trait SparkSuite {
  implicit lazy val spark: SparkSession = SparkSuite.spark
  implicit lazy val sc: SparkContext = SparkSuite.spark.sparkContext

  /**
   * Equality for RDDs against a local `Seq` or another RDD.
   *
   * Any other right-hand side is reported as "not equal" instead of failing
   * with a `MatchError`.
   */
  implicit def rddEq[X]: Equality[RDD[X]] = new Equality[RDD[X]] {
    // Element -> number of occurrences. Built with `map` so the result is a
    // strict Map: `mapValues` yields a lazy view on Scala 2.13, and views do
    // not compare structurally with `==`.
    private def toCounts[Y](s: Seq[Y]): Map[Y, Int] =
      s.groupBy(identity).map { case (element, occurrences) => element -> occurrences.size }

    def areEqual(a: RDD[X], b: Any): Boolean = b match {
      case s: Seq[_] => toCounts(a.collect) == toCounts(s)
      case rdd: RDD[_] => toCounts(a.collect) == toCounts(rdd.collect)
      case _ => false
    }
  }

  /** Equality for `GroupSorted`, delegating to the RDD equality for its pair type. */
  implicit def gsEq[K, V](implicit rddEq: Equality[RDD[(K, V)]]): Equality[GroupSorted[K, V]] = new Equality[GroupSorted[K, V]] {
    def areEqual(a: GroupSorted[K, V], b: Any): Boolean = rddEq.areEqual(a, b)
  }

  /**
   * Equality for Datasets: compares the underlying RDDs. A Dataset on the right
   * is unwrapped to its RDD; anything else is handed to the RDD equality as is.
   */
  implicit def dsEq[X](implicit rddEq: Equality[RDD[X]]): Equality[Dataset[X]] = new Equality[Dataset[X]] {
    def areEqual(a: Dataset[X], b: Any): Boolean = b match {
      case ds: Dataset[_] => rddEq.areEqual(a.rdd, ds.rdd)
      case x => rddEq.areEqual(a.rdd, x)
    }
  }
}
Example 155
Source File: BinaryClassifierEvaluator.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.evaluation

  // Builds binary-classification metrics from predicted and actual labels.
  // Each (prediction, actual) pair becomes a one-hot confusion-matrix record
  // (exactly one of tp/fp/tn/fn is 1.0), and the records are combined with `merge`.
  // NOTE(review): the expression that pairs `predictions` with `actuals` before
  // the `case` clause is missing from this excerpt, as is the closing brace.
  def evaluate(predictions: RDD[Boolean], actuals: RDD[Boolean]): BinaryClassificationMetrics = { { case (pred, actual) =>
      val tp = if (pred && actual) 1d else 0d
      val fp = if (pred && !actual) 1d else 0d
      val tn = if (!pred && !actual) 1d else 0d
      val fn = if (!pred && actual) 1d else 0d
      BinaryClassificationMetrics(tp, fp, tn, fn)
    }.reduce(_ merge _)
Example 156
Source File: AugmentedExamplesEvaluator.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.evaluation

import breeze.linalg._
import keystoneml.nodes.util.MaxClassifier
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

/**
 * The two ways of aggregating several predictions for the same example:
 * `average` and `borda`.
 *
 * NOTE(review): the closing brace is missing from this excerpt.
 */
object AggregationPolicyType extends Enumeration {
  type AggregationPolicyType = Value
  val average, borda = Value

/**
 * Evaluator that groups predictions by example name, aggregates each group
 * with the configured policy, and scores the aggregated predictions with a
 * multiclass evaluator.
 *
 * NOTE(review): this excerpt is incomplete -- the right-hand sides of several
 * `val`s, the assertion argument, the returned value and all closing braces
 * are missing from the visible text.
 */
class AugmentedExamplesEvaluator[T : ClassTag](
    names: RDD[T],
    numClasses: Int,
    policy: AggregationPolicyType.Value = AggregationPolicyType.average)
  extends Evaluator[DenseVector[Double], Int, MulticlassMetrics] with Serializable {

  // Element-wise mean of the prediction vectors.
  def averagePolicy(preds: Array[DenseVector[Double]]): DenseVector[Double] = {
    preds.reduce(_ + _) :/ preds.size.toDouble

  // Converts each prediction vector to per-class ranks (the lowest score gets
  // rank 0) and sums the rank vectors.
  def bordaPolicy(preds: Array[DenseVector[Double]]): DenseVector[Double] = {
    val ranks = { vec =>
      // Class indices ordered by ascending score ...
      val sortedPreds = vec.toArray.zipWithIndex.sortBy(_._1).map(_._2)
      // ... then inverted, so that position i holds the rank of class i.
      val rank = DenseVector(sortedPreds.zipWithIndex.sortBy(_._1).map(x => x._2.toDouble))
    ranks.reduceLeft(_ + _)

  def evaluate(
      predicted: RDD[DenseVector[Double]],
      actualLabels: RDD[Int]): MulticlassMetrics = {

    // Borda is used only when explicitly requested; every other policy value
    // falls back to averaging.
    val aggFunc = policy match {
      case AggregationPolicyType.borda => bordaPolicy _
      case _ => averagePolicy _
    // associate a name with each predicted, actual
    val namedPreds =

    // group by name to get all the predicted values for a name
    val groupedPreds = namedPreds.groupByKey(names.partitions.length).map { case (group, iter) =>
      val predActuals = iter.toArray // this is an array of tuples
      val predsForName =
      assert( == 1)
      val actualForName: Int =

      (predsForName, actualForName)

    // Apply the selected aggregation policy to each group's predictions
    val finalPred = => (aggFunc(x._1), x._2) )
    val finalPredictedLabels = MaxClassifier(
    val finalActualLabels =

    val ret = new MulticlassClassifierEvaluator(numClasses).evaluate(finalPredictedLabels, finalActualLabels)
Example 157
Source File: MeanAveragePrecisionEvaluator.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.evaluation

import breeze.linalg.DenseVector
import org.apache.spark.SparkContext._

  // Accumulates an average precision over 11 recall levels (0.0, 0.1, ..., 1.0):
  // each level contributes p / 11.0 to the total.
  // NOTE(review): both branches that compute `p` from `px`, the returned value
  // and the closing braces are missing from this excerpt.
  private def getAP(precisions: Array[Double], recalls: Array[Double]) = {
    var ap = 0.0
    val levels = (0 to 10).map(x => x / 10.0)
    levels.foreach { t =>
      // Find where recalls are greater than or equal to t, and the precision
      // values at those indices
      val px = recalls.toSeq.zipWithIndex.filter(x => x._1 >= t).map(x => precisions(x._2))
      val p = if (px.isEmpty) {
      } else {
      ap = ap + p / 11.0

Example 158
Source File: Stats.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.utils

import java.util.{Random => JRandom}

import breeze.linalg._
import breeze.numerics._
import breeze.stats._
import breeze.stats.distributions._
import keystoneml.nodes.util.TopKClassifier
import org.apache.spark.rdd.RDD

  // Normalizes each row of `mat` by subtracting the row mean and dividing by
  // sqrt(row variance + alpha); `alpha` defaults to 1.0.
  // NaN row means are replaced by 0 and NaN deviations by sqrt(alpha).
  // NOTE(review): the returned expression and the closing brace are missing
  // from this excerpt.
  def normalizeRows(mat: DenseMatrix[Double], alpha: Double = 1.0): DenseMatrix[Double] = {
    // FIXME: This currently must convert the matrices to double due to breeze implicits
    // TODO: Could optimize, use way fewer copies
    val rowMeans: DenseVector[Double] = mean(mat(*, ::)).map(x => if (x.isNaN) 0 else x)
    // Sum of squared deviations per row, divided by (cols - 1).
    val variances: DenseVector[Double] = sum((mat(::, *) - rowMeans) :^= 2.0, Axis._1) :/= (mat.cols.toDouble - 1.0)
    val sds: DenseVector[Double] = sqrt(variances + alpha.toDouble).map(x => if (x.isNaN) math.sqrt(alpha) else x)

    // Centre, then scale in place.
    val out = mat(::, *) - rowMeans
    out(::, *) /= sds

Example 159
Source File: GatherTransformerOperator.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.workflow

import org.apache.spark.rdd.RDD

/**
 * Transformer operator that gathers its inputs into sequences.
 *
 * NOTE(review): both method bodies are garbled in this excerpt (the receivers
 * of the mapping calls are missing, as are the closing braces). From what is
 * visible, the batch path wraps each element in a single-element `Seq` and
 * combines the inputs pairwise by concatenating (`z._1 ++ z._2`).
 */
private[workflow] case class GatherTransformerOperator[T]() extends TransformerOperator {
  override private[workflow] def singleTransform(inputs: Seq[DatumExpression]): Any = {[T])

  override private[workflow] def batchTransform(inputs: Seq[DatasetExpression]): RDD[_] = {[RDD[T]].map(t => Seq(t))).reduceLeft((x, y) => { => z._1 ++ z._2)
Example 160
Source File: PipelineDataset.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.workflow

import org.apache.spark.rdd.RDD

class PipelineDataset[T] private[workflow](executor: GraphExecutor, sink: SinkId)
  extends PipelineResult[RDD[T]](

/**
 * Factory for a `PipelineDataset` that wraps an existing RDD.
 *
 * NOTE(review): the closing braces are missing from this excerpt.
 */
object PipelineDataset {
  private[workflow] def apply[T](rdd: RDD[T]): PipelineDataset[T] = {
    // Start from an empty graph, add a single dataset node holding `rdd` (with
    // no dependencies), then attach a sink to that node.
    val emptyGraph = Graph(Set(), Map(), Map(), Map())
    val (graphWithDataset, nodeId) = emptyGraph.addNode(new DatasetOperator(rdd), Seq())
    val (graph, sinkId) = graphWithDataset.addSink(nodeId)

    new PipelineDataset[T](new GraphExecutor(graph), sinkId)
Example 161
Source File: KernelMatrix.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import scala.collection.mutable.HashMap
import breeze.linalg._

import org.apache.spark.rdd.RDD

import keystoneml.utils.{MatrixUtils, Stats}
import keystoneml.workflow.{Transformer, LabelEstimator}

/**
 * Kernel matrix computed block by block from `data`, with optional caching of
 * the column blocks and diagonal blocks that have been computed.
 *
 * NOTE(review): this excerpt is incomplete -- the cache-hit branches, the
 * returned values, the body of `unpersist` and all closing braces are missing
 * from the visible text; `ClassTag` is used but its import is not visible here.
 */
class BlockKernelMatrix[T: ClassTag](
    val kernelGen: KernelTransformer[T],
    val data: RDD[T],
    val cacheKernel: Boolean)
  extends KernelMatrix {

  // Caches keyed by the sequence of column indices that defines a block.
  val colBlockCache = HashMap.empty[Seq[Int], RDD[DenseMatrix[Double]]]
  val diagBlockCache = HashMap.empty[Seq[Int], DenseMatrix[Double]]

  // Returns the kernel column block for `colIdxs`, computing it on a cache miss.
  def apply(colIdxs: Seq[Int]): RDD[DenseMatrix[Double]] = {
    if (colBlockCache.contains(colIdxs)) {
    } else {
      // One call produces both the column block and its diagonal block; both
      // are stored only when caching is enabled.
      val (kBlock, diagBlock) = kernelGen.computeKernel(data, colIdxs)
      if (cacheKernel) {
        colBlockCache += (colIdxs -> kBlock)
        diagBlockCache += (colIdxs -> diagBlock)

  // Acts only on blocks that are present in the cache while caching is disabled.
  def unpersist(colIdxs: Seq[Int]): Unit = {
    if (colBlockCache.contains(colIdxs) && !cacheKernel) {

  // Returns the diagonal block for `idxs`, computing it on a cache miss.
  def diagBlock(idxs: Seq[Int]): DenseMatrix[Double] = {
    if (!diagBlockCache.contains(idxs)) {
      val (kBlock, diagBlock) = kernelGen.computeKernel(data, idxs)
      if (cacheKernel) {
        colBlockCache += (idxs -> kBlock)
        diagBlockCache += (idxs -> diagBlock)
    } else {
Example 162
Source File: LinearMapper.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import breeze.linalg._
import edu.berkeley.cs.amplab.mlmatrix.{NormalEquations, RowPartitionedMatrix}
import keystoneml.nodes.stats.{StandardScaler, StandardScalerModel}
import keystoneml.nodes.util.Densify
import org.apache.spark.rdd.RDD
import keystoneml.workflow.{LabelEstimator, Transformer}

/**
 * Companion helpers for the linear map estimator.
 *
 * NOTE(review): this excerpt is incomplete -- parts of the `axb` computation,
 * the receiver of the `cost` mapping, the `lambda == 0` branch and all closing
 * braces are missing from the visible text; `MatrixUtils` is used but its
 * import is not visible here.
 */
object LinearMapEstimator extends Serializable {
  def apply(lambda: Option[Double] = None) = new LinearMapEstimator(lambda)

  // Computes the cost of model `x` (with optional intercept `bOpt`) on the
  // training data. With a non-zero `lambda` the result is
  //   cost / (2 * nTrain) + lambda / 2 * ||x||^2
  // where `cost` is the sum of squared norms of (prediction - label).
  def computeCost(
      trainingFeatures: RDD[DenseVector[Double]],
      trainingLabels: RDD[DenseVector[Double]],
      lambda: Double,
      x: DenseMatrix[Double],
      bOpt: Option[DenseVector[Double]]): Double = {

    val nTrain = trainingLabels.count
    // Broadcast the model and the optional intercept to the executors.
    val modelBroadcast = trainingLabels.context.broadcast(x)
    val bBroadcast = trainingLabels.context.broadcast(bOpt)

    // Predictions: each partition's rows are stacked into a matrix and
    // multiplied by the model; the intercept, if any, is added to every row.
    val axb = trainingFeatures.mapPartitions(rows => {
      MatrixUtils.rowsToMatrixIter(rows).flatMap { rMat =>
        val mat = rMat * modelBroadcast.value
        val out = { b =>
          mat(*, ::) :+= b


    // Sum over all pairs of the squared norm of (prediction - label).
    val cost = { part =>
      val axb = part._1
      val labels = part._2
      val out = axb - labels
      math.pow(norm(out), 2)
    }.reduce(_ + _)

    if (lambda == 0) {
    } else {
      val wNorm = math.pow(norm(x.toDenseVector), 2)
      cost/(2.0*nTrain.toDouble) + lambda/2.0 * wNorm
Source File: LocalLeastSquaresEstimator.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import breeze.linalg._
import breeze.stats._
import keystoneml.nodes.stats.StandardScalerModel
import org.apache.spark.rdd.RDD
import keystoneml.utils.MatrixUtils
import keystoneml.workflow.LabelEstimator

  // Fits an L2-regularized linear map on data gathered into local matrices.
  // Features and labels are mean-centred, the system
  //   (A * A^T + lambda * I) \ b
  // is solved on the centred data, and the result is mapped back through A^T.
  // The returned mapper carries the label mean as intercept and a scaler that
  // holds the feature mean.
  // NOTE(review): the bodies of the two `mapPartitions` calls and the closing
  // brace are missing from this excerpt.
  def trainWithL2(
   trainingFeatures: RDD[DenseVector[Double]],
   trainingLabels: RDD[DenseVector[Double]],
   lambda: Double): LinearMapper[DenseVector[Double]] = {

    val A_parts = trainingFeatures.mapPartitions { x =>
    val b_parts = trainingLabels.mapPartitions { x =>

    // Stack the per-partition pieces into single local matrices.
    val A_local = DenseMatrix.vertcat(A_parts:_*)
    val b_local = DenseMatrix.vertcat(b_parts:_*)

    // Column means of features and labels.
    val featuresMean = mean(A_local(::, *)).t
    val labelsMean = mean(b_local(::, *)).t

    // Subtract the means from every row.
    val A_zm = A_local(*, ::) - featuresMean
    val b_zm = b_local(*, ::) - labelsMean

    val AAt = A_zm * A_zm.t
    val model = A_zm.t * ( (AAt + (DenseMatrix.eye[Double](AAt.rows) :* lambda)) \ b_zm )
    LinearMapper(model, Some(labelsMean), Some(new StandardScalerModel(featuresMean, None)))

Source File: LinearDiscriminantAnalysis.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import breeze.linalg._
import breeze.stats._
import org.apache.spark.rdd.RDD
import keystoneml.utils.MatrixUtils
import keystoneml.workflow.LabelEstimator

  override def fit(data: RDD[DenseVector[Double]], labels: RDD[Int]): LinearMapper[DenseVector[Double]] = {
    val sample =

  // Computes a linear discriminant projection from labelled feature vectors.
  // Builds a within-class matrix `sW` and a between-class matrix `sB`,
  // eigen-decomposes inv(sW) * sB, and concatenates the first `numDimensions`
  // eigenvectors after sorting by descending absolute value of the sort key.
  // NOTE(review): this excerpt is garbled -- the receivers of most mapping
  // calls, the reductions that produce `sW` and `sB`, the returned mapper and
  // the closing braces are missing; `numDimensions` is defined outside this
  // block.
  def computeLDA(dataAndLabels: Array[(Int, DenseVector[Double])]): LinearMapper[DenseVector[Double]] = {
    val featuresByClass = dataAndLabels.groupBy(_._1) => MatrixUtils.rowsToMatrix(
    val meanByClass = => mean(f(::, *))) // each mean is a row vector, not col

    // Per-class scatter: (features - class mean)^T * (features - class mean).
    val sW = => {
      val featuresMinusMean = f._1(*, ::) - f._2.t // row vector, not column
      featuresMinusMean.t * featuresMinusMean

    val numByClass = : Double)
    val features = MatrixUtils.rowsToMatrix(
    val totalMean = mean(features(::, *)) // A row-vector, not a column-vector

    // Per-class term: the outer product of (class mean - total mean), weighted
    // by the class count.
    val sB = {
      case (classMean, classNum) => {
        val m = classMean - totalMean
        (m.t * m) :* classNum

    val eigen = eig((inv(sW): DenseMatrix[Double]) * sB)
    // Split the eigenvector matrix into one column matrix per eigenvector.
    val eigenvectors = (0 until eigen.eigenvectors.cols).map(eigen.eigenvectors(::, _).toDenseMatrix.t)

    val topEigenvectors = => -math.abs(x._2)).map(_._1).take(numDimensions)
    val W = DenseMatrix.horzcat(topEigenvectors:_*)

Example 165
Source File: LeastSquaresEstimator.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import breeze.linalg._
import keystoneml.nodes.util.{Densify, Sparsify}
import org.apache.spark.rdd.RDD
import keystoneml.pipelines.Logging
import keystoneml.workflow._

import scala.reflect._

/**
 * Least-squares estimator that picks, from a fixed list of solvers, the one
 * with the lowest estimated cost for the sampled data and cluster size.
 *
 * NOTE(review): this excerpt is incomplete -- the block delimiters around each
 * entry of `options`, the right-hand side of `n`, the receiver of the sparsity
 * mapping, the single-executor branch and all closing braces are missing from
 * the visible text.
 */
class LeastSquaresEstimator[T <: Vector[Double]: ClassTag](
    lambda: Double = 0,
    numMachines: Option[Int] = None,
    cpuWeight: Double = 3.8e-4,
    memWeight: Double = 2.9e-1,
    networkWeight: Double = 1.32)
  extends OptimizableLabelEstimator[T, DenseVector[Double], DenseVector[Double]]
    with WeightedNode
    with Logging {

  // Candidate solvers, each paired with the cost model used to rank it. The
  // sparse solver is preceded by Sparsify() and the block / linear-map solvers
  // by Densify().
  val options: Seq[(CostModel, LabelEstimator[T, DenseVector[Double], DenseVector[Double]])] = Seq(
      val solver = new DenseLBFGSwithL2[T](new LeastSquaresDenseGradient, regParam = lambda, numIterations = 20)
      (solver, solver)
      val solver = new SparseLBFGSwithL2(new LeastSquaresSparseGradient, regParam = lambda, numIterations = 20)
      (solver, TransformerLabelEstimatorChain(Sparsify(), solver))
      val solver = new BlockLeastSquaresEstimator(1000, 3, lambda = lambda)
      (solver, TransformerLabelEstimatorChain(Densify(), solver))
      val solver = new LinearMapEstimator(Some(lambda))
      (solver, TransformerLabelEstimatorChain(Densify(), solver))

  // Solver used when no optimization is performed: dense L-BFGS with L2.
  override val default: LabelEstimator[T, DenseVector[Double], DenseVector[Double]] with WeightedNode = {
    new DenseLBFGSwithL2[T](new LeastSquaresDenseGradient, regParam = lambda, numIterations = 20)

  // Gathers the inputs to the cost models from the sample (n, feature dimension
  // d, label dimension k, sparsity, machine count) and returns the option with
  // the minimum cost.
  override def optimize(
      sample: RDD[T],
      sampleLabels: RDD[DenseVector[Double]],
      numPerPartition: Map[Int, Int])
  : LabelEstimator[T, DenseVector[Double], DenseVector[Double]] = {
    val n =
    val d = sample.first().length
    val k = sampleLabels.first().length
    // Mean fraction of active entries per sample vector.
    val sparsity = => x.activeSize.toDouble / x.length).sum() / sample.count()

    // Use the explicit machine count when given; otherwise derive it from the
    // executor storage status.
    // NOTE(review): the "- 1" presumably excludes the driver -- confirm.
    val realNumMachines = numMachines.getOrElse {
      if (sample.sparkContext.getExecutorStorageStatus.length == 1) {
      } else {
        sample.sparkContext.getExecutorStorageStatus.length - 1

    logDebug(s"Optimizable Param n is $n")
    logDebug(s"Optimizable Param d is $d")
    logDebug(s"Optimizable Param k is $k")
    logDebug(s"Optimizable Param sparsity is $sparsity")
    logDebug(s"Optimizable Param numMachines is $realNumMachines")

    options.minBy(_._1.cost(n, d, k, sparsity, realNumMachines, cpuWeight, memWeight, networkWeight))._2

Example 166
Source File: SparseLinearMapper.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import breeze.linalg._
import org.apache.spark.rdd.RDD
import keystoneml.workflow.Transformer

  // Applies the linear model to every sparse input vector: model^T * row,
  // plus the optional intercept when present.
  // NOTE(review): this excerpt is garbled -- the mapping over `in` is fused
  // onto the `bBroadcast` line, and the returned value and closing braces are
  // missing; `x` and `bOpt` are defined outside this block.
  override def apply(in: RDD[SparseVector[Double]]): RDD[DenseVector[Double]] = {
    val modelBroadcast = in.context.broadcast(x)
    val bBroadcast = in.context.broadcast(bOpt) => {
      val out = modelBroadcast.value.t * row
      bBroadcast.value.foreach { b =>
        out :+= b

Source File: ApproximatePCA.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import breeze.linalg._
import breeze.numerics._
import breeze.stats._
import breeze.stats.distributions.{Gaussian, ThreadLocalRandomGenerator, RandBasis}
import com.github.fommil.netlib.LAPACK._
import edu.berkeley.cs.amplab.mlmatrix.util.QRUtils
import org.apache.commons.math3.random.MersenneTwister
import org.apache.spark.rdd.RDD
import org.netlib.util.intW
import keystoneml.pipelines.Logging
  // Builds an orthonormal basis Q from A multiplied by a random d x l Gaussian
  // matrix, then refines it with `q` rounds that alternate multiplying by A^T
  // and by A, re-orthonormalizing with QR after each product.
  // `seed` fixes the random generator so results are reproducible.
  // NOTE(review): the closing brace of the loop, the returned value
  // (presumably Q) and the closing brace of the method are missing from this
  // excerpt.
  def approximateQ(A: DenseMatrix[Double], l: Int, q: Int, seed: Int = 0): DenseMatrix[Double] = {
    val d = A.cols

    val randBasis: RandBasis = new RandBasis(new ThreadLocalRandomGenerator(new MersenneTwister(seed)))
    val omega = DenseMatrix.rand(d, l, Gaussian(0,1)(randBasis)) //cpu: d*l, mem: d*l
    val y0 = A*omega //cpu: n*d*l, mem: n*l

    var Q = QRUtils.qrQR(y0)._1 //cpu: n*l**2

    for (i <- 1 to q) {
      val YHat = Q.t * A //cpu: l*n*d, mem: l*d
      val Qh = QRUtils.qrQR(YHat.t)._1 //cpu: d*l^2, mem: d*l

      val Yj = A * Qh //cpu: n*d*l, mem: n*l
      Q = QRUtils.qrQR(Yj)._1 //cpu:  n*l^2, mem: n*l

Example 168
Source File: DistributedPCA.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import breeze.linalg._
import breeze.numerics._
import breeze.stats._
import com.github.fommil.netlib.LAPACK.{getInstance => lapack}
import org.apache.spark.rdd.RDD
import org.netlib.util.intW
import keystoneml.pipelines._
import keystoneml.utils.MatrixUtils
import keystoneml.workflow.{Transformer, Estimator}

import edu.berkeley.cs.amplab.mlmatrix.{RowPartition, NormalEquations, RowPartitionedMatrix, TSQR}

  def fit(samples: RDD[DenseVector[Float]]): PCATransformer = {
    new PCATransformer(computePCA(samples, dims))

  // Computes the first `dims` principal components of a distributed Float
  // dataset. The data is converted to Double, mean-centred with broadcast
  // column means, reduced to the R factor of a TSQR decomposition, and the
  // SVD of R supplies the components.
  // NOTE(review): the tail of the first `mapPartitions` body, the receiver of
  // the centring map and the closing braces are missing from this excerpt.
  def computePCA(dataMat: RDD[DenseVector[Float]], dims: Int): DenseMatrix[Float] = {

    val mat = new RowPartitionedMatrix(dataMat.mapPartitions { part =>
      val dblIter = => convert(x, Double))
    // Column means: column sums divided by the number of rows.
    val means = DenseVector(mat.colSums():_*) :/ mat.numRows().toDouble

    val meansBC = dataMat.context.broadcast(means)
    // Subtract the column means from every row of every partition.
    val zeroMeanMat = new RowPartitionedMatrix( { part =>
      RowPartition(part.mat(*, ::) - meansBC.value)

    val rPart = new TSQR().qrR(zeroMeanMat)

    val svd.SVD(u, s, pcaT) = svd(rPart)

    // Transpose the third SVD factor and convert back to Float.
    val pca = convert(pcaT.t, Float)

    val matlabConventionPCA = PCAEstimator.enforceMatlabPCASignConvention(pca)

    // Return a subset of the columns.
    matlabConventionPCA(::, 0 until dims)

    n: Long,
    d: Int,
    k: Int,
    sparsity: Double,
    numMachines: Int,
    cpuWeight: Double,
    memWeight: Double,
    networkWeight: Double): Double = {
    // NOTE(review): the `def` line of this method and its closing brace are
    // missing from this excerpt; only the parameter list and body are visible.
    // Cost estimate: the larger of the weighted compute and weighted
    // memory-scan terms, plus the weighted network term.
    //   flops        = n*d^2 / numMachines + d^3 * log2(numMachines)
    //   bytesScanned = n*d
    //   network      = d^2 * log2(numMachines)
    // `k` and `sparsity` are not used in the visible body.
    val log2NumMachines = math.log(numMachines.toDouble) / math.log(2.0)
    val flops = n.toDouble * d * d / numMachines + d.toDouble * d * d * log2NumMachines
    val bytesScanned = n.toDouble * d
    val network = d.toDouble * d * log2NumMachines
    math.max(cpuWeight * flops, memWeight * bytesScanned) + networkWeight * network
Source File: WrapperTrait.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package SparkER.Wrappers

import SparkER.DataStructures.{KeyValue, MatchingEntities, Profile}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import scala.collection.mutable.MutableList

  /**
   * Flattens one row into a list of key/value attributes, using the column
   * name at the same index as the key.
   *
   * Null cells are skipped. An `Iterable` cell produces one attribute per
   * element. A `String` cell produces one attribute, or one per fragment when
   * `explodeInnerFields` is set. Any other cell is stored via `toString`.
   *
   * A non-fatal failure while processing a column (for example a missing
   * column name) is printed and the loop moves on to the next column. Fatal
   * JVM errors are no longer swallowed.
   *
   * @param columnNames        column names, indexed like the row's cells
   * @param row                the row to convert
   * @param explodeInnerFields when true, split string cells on `innerSeparator`
   * @param innerSeparator     separator passed to `String.split`, so it is
   *                           treated as a regular expression
   * @return the collected attributes, in column order
   */
  def rowToAttributes(columnNames: Array[String], row: Row, explodeInnerFields: Boolean = false, innerSeparator: String = ","): MutableList[KeyValue] = {
    val attributes: MutableList[KeyValue] = new MutableList()
    for (i <- 0 until row.size) {
      try {
        val value = row(i)
        val attributeKey = columnNames(i)

        value match {
          case null =>
            // Nothing to record for a null cell.
          case listOfAttributes: Iterable[_] =>
            listOfAttributes.foreach { attributeValue =>
              attributes += KeyValue(attributeKey, attributeValue.toString)
            }
          case stringAttribute: String =>
            if (explodeInnerFields) {
              stringAttribute.split(innerSeparator).foreach { attributeValue =>
                attributes += KeyValue(attributeKey, attributeValue)
              }
            }
            else {
              attributes += KeyValue(attributeKey, stringAttribute)
            }
          case singleAttribute =>
            attributes += KeyValue(attributeKey, singleAttribute.toString)
        }
      }
      catch {
        // Best-effort per column: report and continue, but let fatal errors
        // (OutOfMemoryError, InterruptedException, ...) propagate.
        case scala.util.control.NonFatal(e) => println(e)
      }
    }
    attributes
  }
Example 170
package SparkER.Wrappers

import{IOException, _}

import SparkER.DataStructures.Profile
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

  // Reads one Java-serialized object from `fileName` through a buffered
  // ObjectInputStream and returns it. The result starts as null, so null is
  // returned if the read does not complete and the exception is one of the two
  // handled types.
  // NOTE(review): the body of the `finally` block, the bodies of both `catch`
  // cases and the closing braces are missing from this excerpt, so how the
  // stream is closed and how errors are reported cannot be confirmed here.
  def loadSerializedObject(fileName: String): Any = {
    var `object`: Any = null
    try {
      val file: InputStream = new FileInputStream(fileName)
      val buffer: InputStream = new BufferedInputStream(file)
      val input: ObjectInput = new ObjectInputStream(buffer)
      try {
        `object` = input.readObject
      } finally {
    catch {
      case cnfEx: ClassNotFoundException => {
      case ioex: IOException => {
    return `object`
Source File: Converters.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package SparkER.Utilities

import SparkER.BlockBuildingMethods.TokenBlocking
import org.apache.spark.rdd.RDD
import SparkER.DataStructures._
import org.apache.spark.partial.PartialResult

  // Inverts a profile -> blocks mapping into block -> profiles.
  // Each profile emits (blockID, profileID) pairs; the pairs are grouped by
  // block ID into blocks. With no separator IDs a BlockDirty holding one
  // profile set is built; otherwise a BlockClean is built from the profiles
  // split by `TokenBlocking.separateProfiles`. Blocks with a comparison size
  // of zero are dropped.
  // NOTE(review): the closing braces are missing from this excerpt.
  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks], separatorIDs: Array[Long] = Array.emptyLongArray): RDD[BlockAbstract] = {

    val blockIDProfileID = profilesBlocks flatMap {
      profileWithBlocks =>
        val profileID = profileWithBlocks.profileID
        // NOTE(review): `BlockWithSize` here is a lambda parameter name, not a
        // type reference.
        profileWithBlocks.blocks map {
          BlockWithSize =>
            (BlockWithSize.blockID, profileID)

    val blocks = blockIDProfileID.groupByKey().map {
      block =>
        val blockID = block._1
        val profilesID = block._2.toSet

        if (separatorIDs.isEmpty) {
          BlockDirty(blockID, Array(profilesID))
        else {
          BlockClean(blockID, TokenBlocking.separateProfiles(profilesID, separatorIDs))

    // NOTE(review): the identity map presumably widens the element type to
    // BlockAbstract to satisfy the declared return type -- confirm.
    blocks.filter(_.getComparisonSize() > 0).map(x => x)

Source File: BlockFiltering.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package SparkER.BlockRefinementMethods

import SparkER.DataStructures.{BlockWithComparisonSize, ProfileBlocks}
import SparkER.Utilities.BoundedPriorityQueue
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD


  /**
   * Keeps, for every profile, the blocks with the fewest comparisons.
   *
   * The blocks are sorted by ascending comparison count and
   * `round(size * r)` of them are selected. The comparison count of the last
   * selected block becomes the threshold, and every block at or below it is
   * kept, so ties with the threshold are kept as well.
   *
   * The selected count is clamped to `[0, size]`. A profile with no blocks, or
   * a ratio that rounds to zero, yields an empty block set, and `r > 1` keeps
   * everything. Previously those cases indexed out of bounds and threw.
   *
   * @param profilesWithBlocks profiles with the blocks they appear in
   * @param r                  fraction of each profile's blocks to keep
   * @param minCardinality     not used by this implementation
   * @return the profiles with their filtered block sets
   */
  def blockFilteringAdvanced(profilesWithBlocks: RDD[ProfileBlocks], r: Double, minCardinality: Int = 1): RDD[ProfileBlocks] = {
    profilesWithBlocks map {
      profileWithBlocks =>
        val blocksSortedByComparisons = profileWithBlocks.blocks.toList.sortWith(_.comparisons < _.comparisons)
        val requested = Math.round(blocksSortedByComparisons.size * r).toInt
        val blocksToKeep = math.min(requested, blocksSortedByComparisons.size)
        // `lift` returns None for a negative index, which covers the
        // "nothing to keep" case without a separate bounds check.
        val keptBlocks = blocksSortedByComparisons.lift(blocksToKeep - 1) match {
          case Some(pivot) => blocksSortedByComparisons.filter(_.comparisons <= pivot.comparisons)
          case None => blocksSortedByComparisons.take(0)
        }
        ProfileBlocks(profileWithBlocks.profileID, keptBlocks.toSet)
    }
  }
Source File: SerializedObjectLoader.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package Wrappers

import DataStructures.{KeyValue, MatchingEntities, Profile}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

/**
 * Loads serialized entity datasets and ground-truth files and converts them
 * to profiles and matching pairs.
 *
 * NOTE(review): this excerpt is garbled -- the logger calls are fused onto the
 * neighbouring statements, the iterator `next` calls, the final
 * parallelization of both arrays and all closing braces are missing from the
 * visible text.
 */
object SerializedObjectLoader extends WrapperTrait {

  // Builds one Profile per loaded entity. Profile ids are the entity index
  // offset by `startIDFrom`, and the original id is the index as a string.
  // `realFieldID` is not used in the visible body.
  def loadProfiles(filePath: String, startIDFrom: Long = 0, realFieldID: String = "", sourceId: Int = 0): RDD[Profile] = {
    @transient lazy val log = org.apache.log4j.LogManager.getRootLogger"SPARKER - Start to loading entities")
    val entities = DataLoaders.SerializedLoader.loadSerializedDataset(filePath)"SPARKER - Loading ended")"SPARKER - Start to generate profiles")
    val profiles: Array[Profile] = new Array(entities.size())

    for (i <- 0 until entities.size()) {
      val profile = Profile(id = i + startIDFrom, originalID = i + "", sourceId = sourceId)

      // Copy every attribute of the entity onto the profile as a KeyValue.
      val entity = entities.get(i)
      val it = entity.getAttributes.iterator()
      while (it.hasNext) {
        val attribute =
        profile.addAttribute(KeyValue(attribute.getName, attribute.getValue))

      profiles.update(i, profile)
    }"SPARKER - Ended to loading profiles")"SPARKER - Start to parallelize profiles")
    val sc = SparkContext.getOrCreate()


  // Converts the loaded ground truth into MatchingEntities pairs, with both
  // entity ids rendered as strings.
  def loadGroundtruth(filePath: String): RDD[MatchingEntities] = {

    val groundtruth = DataLoaders.SerializedLoader.loadSerializedGroundtruth(filePath)

    val matchingEntitites: Array[MatchingEntities] = new Array(groundtruth.size())

    // Index of the next array slot to fill while iterating.
    var i = 0

    val it = groundtruth.iterator
    while (it.hasNext) {
      val matching =
      matchingEntitites.update(i, MatchingEntities(matching.getEntityId1.toString, matching.getEntityId2.toString))
      i += 1

    val sc = SparkContext.getOrCreate()
Example 174
package Wrappers

import DataStructures.{KeyValue, MatchingEntities, Profile}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import scala.collection.mutable.MutableList

  /**
   * Flattens one Spark SQL `Row` into a list of `KeyValue` attributes, using
   * `columnNames(i)` as the key of the i-th cell.
   *
   *  - `null` cells produce no attribute;
   *  - `Iterable` cells produce one attribute per element (via `toString`);
   *  - `String` cells produce a single attribute, or one per piece of
   *    `split(innerSeparator)` when `explodeInnerFields` is set;
   *  - any other cell produces a single attribute from its `toString`.
   *
   * A failure while handling one cell is printed and that cell is abandoned;
   * the remaining cells are still processed.
   *
   * @param columnNames        attribute keys, indexed like the row's cells
   * @param row                the row to flatten
   * @param explodeInnerFields whether string cells are split on `innerSeparator`
   * @param innerSeparator     regular expression handed to `String.split`
   * @return the collected attributes, in cell order
   */
  def rowToAttributes(columnNames: Array[String], row: Row, explodeInnerFields: Boolean = false, innerSeparator: String = ","): MutableList[KeyValue] = {
    val attributes: MutableList[KeyValue] = new MutableList()
    for (i <- 0 until row.size) {
      try {
        val value = row(i)
        val attributeKey = columnNames(i)

        value match {
          case null =>
            () // nothing to record for a null cell
          case listOfAttributes: Iterable[_] =>
            // foreach instead of map: only the side effect on `attributes` is wanted
            listOfAttributes.foreach { attributeValue =>
              attributes += KeyValue(attributeKey, attributeValue.toString)
            }
          case stringAttribute: String if explodeInnerFields =>
            stringAttribute.split(innerSeparator).foreach { attributeValue =>
              attributes += KeyValue(attributeKey, attributeValue)
            }
          case stringAttribute: String =>
            attributes += KeyValue(attributeKey, stringAttribute)
          case singleAttribute =>
            attributes += KeyValue(attributeKey, singleAttribute.toString)
        }
      } catch {
        // NonFatal lets OutOfMemoryError, InterruptedException, etc. propagate
        case scala.util.control.NonFatal(e) => println(e)
      }
    }
    attributes
  }
Source File: SerializedProfilesLoader.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package Wrappers

import{IOException, _}

import DataStructures.Profile
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

  /**
   * Reads a single Java-serialized object from `fileName`.
   *
   * The underlying file stream is always closed, including when the object
   * stream header is invalid or deserialization fails.
   *
   * NOTE(review): this deserializes arbitrary data with `ObjectInputStream`;
   * only use it on trusted files.
   *
   * @param fileName path of the file to read
   * @return the deserialized object, or `null` when the file cannot be read
   *         (`IOException`) or a class of the stored object is missing
   *         (`ClassNotFoundException`); the failure is reported on stderr
   */
  def loadSerializedObject(fileName: String): Any = {
    try {
      val file = new java.io.FileInputStream(fileName)
      try {
        val input = new java.io.ObjectInputStream(new java.io.BufferedInputStream(file))
        input.readObject()
      } finally {
        // Closing the file releases the descriptor held by the wrapping streams as well.
        file.close()
      }
    } catch {
      case cnfEx: ClassNotFoundException =>
        System.err.println(s"Cannot deserialize '$fileName': $cnfEx")
        null
      case ioex: java.io.IOException =>
        System.err.println(s"Cannot read '$fileName': $ioex")
        null
    }
  }
Source File: Converters.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package Utilities

import BlockBuildingMethods.TokenBlocking
import org.apache.spark.rdd.RDD
import DataStructures._
import org.apache.spark.partial.PartialResult

  /**
   * Inverts a profile-to-blocks index into a block-to-profiles one.
   *
   * Every `(blockID, profileID)` pair is emitted, pairs are grouped by block,
   * and each group becomes a `BlockDirty` (when `separatorIDs` is empty) or a
   * `BlockClean` whose profiles are split by `TokenBlocking.separateProfiles`.
   * Blocks whose `getComparisonSize()` is below 1 are dropped.
   *
   * @param profilesBlocks profiles with the blocks they belong to
   * @param separatorIDs   separator ids forwarded to `separateProfiles`;
   *                       empty selects the dirty (single-set) representation
   */
  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks], separatorIDs: Array[Long] = Array.emptyLongArray): RDD[BlockAbstract] = {

    val blockIDProfileID = profilesBlocks flatMap { profileWithBlocks =>
      val profileID = profileWithBlocks.profileID
      profileWithBlocks.blocks map { blockRef => (blockRef.blockID, profileID) }
    }

    val blocks = blockIDProfileID.groupByKey().map { case (blockID, members) =>
      val profilesID = members.toSet

      if (separatorIDs.isEmpty) {
        BlockDirty(blockID, Array(profilesID))
      }
      else {
        BlockClean(blockID, TokenBlocking.separateProfiles(profilesID, separatorIDs))
      }
    }

    // The identity map is kept from the original; presumably it widens the element type to BlockAbstract.
    blocks.filter(_.getComparisonSize() >= 1).map(x => x)
  }

Source File: BlockFiltering.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package BlockRefinementMethods

import DataStructures.{BlockWithComparisonSize, ProfileBlocks}
import Utilities.BoundedPriorityQueue
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD


  /**
   * For every profile keeps the blocks whose comparison count does not exceed
   * that of the block at rank `round(size * r)` in ascending comparison order
   * (ties with the threshold block are kept too).
   *
   * The rank is clamped to `[1, size]`, so a small `r` keeps at least the
   * smallest block and `r > 1` keeps everything instead of indexing out of
   * range. A profile without blocks is returned with an empty block set.
   *
   * @param profilesWithBlocks profiles together with their blocks
   * @param r                  fraction of each profile's blocks to keep
   * @param minCardinality     not used by this implementation
   *                           (NOTE(review): confirm whether it was meant as the lower bound)
   */
  def blockFilteringAdvanced(profilesWithBlocks: RDD[ProfileBlocks], r: Double, minCardinality: Int = 1): RDD[ProfileBlocks] = {
    profilesWithBlocks map {
      profileWithBlocks =>
        // Vector gives constant-time access to the threshold element.
        val blocksSortedByComparisons = profileWithBlocks.blocks.toVector.sortWith(_.comparisons < _.comparisons)
        if (blocksSortedByComparisons.isEmpty) {
          ProfileBlocks(profileWithBlocks.profileID, blocksSortedByComparisons.toSet)
        } else {
          val requested = Math.round(blocksSortedByComparisons.size * r).toInt
          val blocksToKeep = math.min(math.max(requested, 1), blocksSortedByComparisons.size)
          val threshold = blocksSortedByComparisons(blocksToKeep - 1).comparisons
          ProfileBlocks(profileWithBlocks.profileID, blocksSortedByComparisons.filter(_.comparisons <= threshold).toSet)
        }
    }
  }
Source File: SerializedObjectLoader.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package Wrappers

import DataStructures.{KeyValue, MatchingEntities, Profile}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

/**
 * Builds Spark RDDs from datasets read through `DataLoaders.SerializedLoader`.
 *
 * NOTE(review): several statements in this object look truncated (string
 * literals glued to the end of an expression, `val x =` without a right-hand
 * side, missing loop headers, unbalanced braces). Verify against the upstream
 * file before compiling.
 */
object SerializedObjectLoader extends  WrapperTrait{

  /**
   * Loads the serialized entities stored at `filePath` and creates one
   * `Profile` per entity, copying each entity attribute into the profile as a
   * `KeyValue`.
   *
   * @param filePath    path handed to `loadSerializedDataset`
   * @param startIDFrom offset added to the entity index to form the profile id
   * @param realFieldID not referenced anywhere in the visible body
   */
  def loadProfiles(filePath : String, startIDFrom : Long = 0, realFieldID : String = "") : RDD[Profile] = {
    // NOTE(review): the trailing string literals on the next two lines are presumably
    // the arguments of logging calls whose prefix was lost — confirm.
    @transient lazy val log = org.apache.log4j.LogManager.getRootLogger"SPARKER - Start to loading entities")
    val entities = DataLoaders.SerializedLoader.loadSerializedDataset(filePath)"SPARKER - Loading ended")"SPARKER - Start to generate profiles")
    // One slot per loaded entity; filled by index in the loop below.
    val profiles : Array[Profile] = new Array(entities.size())

    for(i <- 0 to entities.size()-1){
      // The entity index doubles as the original id (as a string) and, shifted by startIDFrom, as the profile id.
      val profile = Profile(id = i+startIDFrom, originalID = i+"")

      val entity = entities.get(i)
      val it = entity.getAttributes.iterator()
        // NOTE(review): the loop over `it` and this right-hand side are missing — confirm.
        val attribute =
        profile.addAttribute(KeyValue(attribute.getName, attribute.getValue))

      profiles.update(i, profile)
    }"SPARKER - Ended to loading profiles")"SPARKER - Start to parallelize profiles")
    // NOTE(review): the expression turning `profiles` into the returned RDD is not visible here.
    val sc = SparkContext.getOrCreate()


  /**
   * Loads the serialized ground truth stored at `filePath` and converts its
   * entries into `MatchingEntities` pairs built from the string form of the
   * two entity ids.
   *
   * @param filePath path handed to `loadSerializedGroundtruth`
   */
  def loadGroundtruth(filePath : String) : RDD[MatchingEntities] = {

    val groundtruth = DataLoaders.SerializedLoader.loadSerializedGroundtruth(filePath)

    val matchingEntitites : Array[MatchingEntities] = new Array(groundtruth.size())

    // Write cursor into matchingEntitites.
    // NOTE(review): no increment of `i` is visible below — confirm.
    var i = 0

    val it = groundtruth.iterator
      // NOTE(review): the loop over `it` and this right-hand side are missing — confirm.
      val matching =
      matchingEntitites.update(i, MatchingEntities(matching.getEntityId1.toString, matching.getEntityId2.toString))

    // NOTE(review): the expression turning `matchingEntitites` into the returned RDD is not visible here.
    val sc = SparkContext.getOrCreate()
Source File: WrapperTrait.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package Wrappers

import DataStructures.{KeyValue, MatchingEntities, Profile}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import scala.collection.mutable.MutableList

  /**
   * Flattens one Spark SQL `Row` into a list of `KeyValue` attributes, using
   * `columnNames(i)` as the key of the i-th cell.
   *
   *  - `null` cells produce no attribute;
   *  - `Iterable` cells produce one attribute per element (via `toString`);
   *  - `String` cells produce a single attribute, or one per piece of
   *    `split(innerSeparator)` when `explodeInnerFields` is set;
   *  - any other cell produces a single attribute from its `toString`.
   *
   * A failure while handling one cell is printed and that cell is abandoned;
   * the remaining cells are still processed.
   *
   * @param columnNames        attribute keys, indexed like the row's cells
   * @param row                the row to flatten
   * @param explodeInnerFields whether string cells are split on `innerSeparator`
   * @param innerSeparator     regular expression handed to `String.split`
   * @return the collected attributes, in cell order
   */
  def rowToAttributes(columnNames : Array[String], row : Row, explodeInnerFields:Boolean = false, innerSeparator : String = ",") : MutableList[KeyValue] = {
    val attributes: MutableList[KeyValue] = new MutableList()
    for (i <- 0 until row.size) {
      try {
        val value = row(i)
        val attributeKey = columnNames(i)

        value match {
          case null =>
            () // nothing to record for a null cell
          case listOfAttributes : Iterable[_] =>
            // foreach instead of map: only the side effect on `attributes` is wanted
            listOfAttributes.foreach { attributeValue =>
              attributes += KeyValue(attributeKey, attributeValue.toString)
            }
          case stringAttribute : String if explodeInnerFields =>
            stringAttribute.split(innerSeparator).foreach { attributeValue =>
              attributes += KeyValue(attributeKey, attributeValue)
            }
          case stringAttribute : String =>
            attributes += KeyValue(attributeKey, stringAttribute)
          case singleAttribute =>
            attributes += KeyValue(attributeKey, singleAttribute.toString)
        }
      } catch {
        // NonFatal lets OutOfMemoryError, InterruptedException, etc. propagate
        case scala.util.control.NonFatal(e) => println(e)
      }
    }
    attributes
  }
Source File: SerializedProfilesLoader.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package Wrappers

import{IOException, _}

import DataStructures.Profile
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

  /**
   * Reads a single Java-serialized object from `fileName`.
   *
   * The underlying file stream is always closed, including when the object
   * stream header is invalid or deserialization fails.
   *
   * NOTE(review): this deserializes arbitrary data with `ObjectInputStream`;
   * only use it on trusted files.
   *
   * @param fileName path of the file to read
   * @return the deserialized object, or `null` when the file cannot be read
   *         (`IOException`) or a class of the stored object is missing
   *         (`ClassNotFoundException`); the failure is reported on stderr
   */
  def loadSerializedObject(fileName: String): Any = {
    try {
      val file = new java.io.FileInputStream(fileName)
      try {
        val input = new java.io.ObjectInputStream(new java.io.BufferedInputStream(file))
        input.readObject()
      } finally {
        // Closing the file releases the descriptor held by the wrapping streams as well.
        file.close()
      }
    } catch {
      case cnfEx: ClassNotFoundException =>
        System.err.println(s"Cannot deserialize '$fileName': $cnfEx")
        null
      case ioex: java.io.IOException =>
        System.err.println(s"Cannot read '$fileName': $ioex")
        null
    }
  }
Source File: Converters.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package Utilities

import org.apache.spark.rdd.RDD
import DataStructures._
import org.apache.spark.partial.PartialResult

  /**
   * Inverts a profile-to-blocks index into a block-to-profiles one: emits every
   * `(blockID, profileID)` pair, groups the pairs by block and wraps each group
   * in a block object. Blocks whose `getComparisonSize()` is below 1 are dropped.
   *
   * NOTE(review): closing braces and the `else` between the two block
   * constructors appear to be missing — verify against the upstream file.
   *
   * @param profilesBlocks profiles with the blocks they belong to
   * @param separatorID    profile ids `<= separatorID` go to the first partition
   *                       of a `BlockClean`; a negative value selects `BlockDirty`
   */
  def profilesBlockToBlocks(profilesBlocks : RDD[ProfileBlocks], separatorID : Long = -1) : RDD[BlockAbstract] = {

    // One (blockID, profileID) pair per block membership.
    val blockIDProfileID = profilesBlocks flatMap {
      profileWithBlocks =>
        val profileID = profileWithBlocks.profileID
        profileWithBlocks.blocks map {
          // NOTE: `BlockWithSize` is a lambda parameter here, not a type.
          BlockWithSize =>
            (BlockWithSize.blockID, profileID)

    val blocks = blockIDProfileID.groupByKey().map {
      block =>
        val blockID = block._1
        val profilesID = block._2.toSet

        if (separatorID < 0){
          // Dirty block: every profile in the first set, second set empty.
          BlockDirty(blockID, (profilesID, Set.empty))
          // NOTE(review): presumably the else-branch — profiles split around separatorID.
          BlockClean(blockID, (profilesID.partition(_ <= separatorID)))

    // The identity map presumably widens the element type to BlockAbstract — confirm.
    blocks.filter(_.getComparisonSize() >=1).map(x => x)

Source File: BlockFiltering.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package BlockRefinementMethods

import DataStructures.{BlockWithComparisonSize, ProfileBlocks}
import Utilities.BoundedPriorityQueue
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD


  /**
   * For every profile keeps the blocks whose comparison count does not exceed
   * that of the block at rank `round(size * r)` in ascending comparison order
   * (ties with the threshold block are kept too).
   *
   * The rank is clamped to `[1, size]`, so a small `r` keeps at least the
   * smallest block and `r > 1` keeps everything instead of indexing out of
   * range. A profile without blocks is returned with an empty block set.
   *
   * @param profilesWithBlocks profiles together with their blocks
   * @param r                  fraction of each profile's blocks to keep
   * @param minCardinality     not used by this implementation
   *                           (NOTE(review): confirm whether it was meant as the lower bound)
   */
  def blockFilteringAdvanced(profilesWithBlocks: RDD[ProfileBlocks], r: Double, minCardinality: Int = 1): RDD[ProfileBlocks] = {
    profilesWithBlocks map {
      profileWithBlocks =>
        // Vector gives constant-time access to the threshold element.
        val blocksSortedByComparisons = profileWithBlocks.blocks.toVector.sortWith(_.comparisons < _.comparisons)
        if (blocksSortedByComparisons.isEmpty) {
          ProfileBlocks(profileWithBlocks.profileID, blocksSortedByComparisons.toSet)
        } else {
          val requested = Math.round(blocksSortedByComparisons.size * r).toInt
          val blocksToKeep = math.min(math.max(requested, 1), blocksSortedByComparisons.size)
          val threshold = blocksSortedByComparisons(blocksToKeep - 1).comparisons
          ProfileBlocks(profileWithBlocks.profileID, blocksSortedByComparisons.filter(_.comparisons <= threshold).toSet)
        }
    }
  }
Source File: CNNModel.scala    From SparkMLlibDeepLearn   with Apache License 2.0 5 votes vote down vote up
package CNN

import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV
import org.apache.spark.rdd.RDD

  /**
   * Half of the mean, over all predictions, of the summed squared entries of
   * each prediction's `error`.
   *
   * @param predict predictions carrying an `error` value
   * @return `0.5 * sum(error :* error) / count`; `NaN` for an empty RDD (0.0 / 0)
   */
  def Loss(predict: RDD[PredictCNNLabel]): Double = {
    val errors = predict.map(f => f.error)
    // Accumulator layout: (running sum of squared error entries, number of samples).
    val (squaredErrorSum, sampleCount) = errors.treeAggregate((0.0, 0L))(
      seqOp = (c, v) => {
        // c: (sum, count), v: error of one sample
        (c._1 + (v :* v).sum, c._2 + 1)
      },
      combOp = (c1, c2) => {
        // c1, c2: (sum, count) partial results
        (c1._1 + c2._1, c1._2 + c2._2)
      })
    (squaredErrorSum / sampleCount.toDouble) * 0.5
  }

Example 184
Source File: NeuralNetModel.scala    From SparkMLlibDeepLearn   with Apache License 2.0 5 votes vote down vote up
package NN

import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV
import org.apache.spark.rdd.RDD

  /**
   * Half of the mean, over all predictions, of the summed squared entries of
   * each prediction's `error`.
   *
   * @param predict predictions carrying an `error` value
   * @return `0.5 * sum(error :* error) / count`; `NaN` for an empty RDD (0.0 / 0)
   */
  def Loss(predict: RDD[PredictNNLabel]): Double = {
    val errors = predict.map(f => f.error)
    // Accumulator layout: (running sum of squared error entries, number of samples).
    val (squaredErrorSum, sampleCount) = errors.treeAggregate((0.0, 0L))(
      seqOp = (c, v) => {
        // c: (sum, count), v: error of one sample
        (c._1 + (v :* v).sum, c._2 + 1)
      },
      combOp = (c1, c2) => {
        // c1, c2: (sum, count) partial results
        (c1._1 + c2._1, c1._2 + c2._2)
      })
    (squaredErrorSum / sampleCount.toDouble) * 0.5
  }

Source File: DBNModel.scala    From SparkMLlibDeepLearn   with Apache License 2.0 5 votes vote down vote up
package DBN

import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV
import org.apache.spark.rdd.RDD
import scala.collection.mutable.ArrayBuffer

/**
 * Trained DBN parameters together with the configuration they were built from.
 *
 * @param config configuration providing `size` and `layer`
 * @param dbn_W  per-layer matrices that `dbnunfoldtonn` places on the right of each result
 * @param dbn_b  per-layer matrices; not read by `dbnunfoldtonn`
 * @param dbn_c  per-layer matrices that `dbnunfoldtonn` places on the left of each result
 */
class DBNModel(
  val config: DBNConfig,
  val dbn_W: Array[BDM[Double]],
  val dbn_b: Array[BDM[Double]],
  val dbn_c: Array[BDM[Double]]) extends Serializable {

  /**
   * Produces the pieces needed to initialise a network from this DBN.
   *
   * @param outputsize size of an extra output layer to append; values `<= 0`
   *                   leave `config.size` and `config.layer` unchanged
   * @return `(size, layer, initW)` where `size` is `config.size` (plus
   *         `outputsize` when positive), `layer` is `config.layer` (plus one
   *         when `outputsize` is positive) and `initW(i)` is
   *         `horzcat(dbn_c(i), dbn_W(i))`
   */
  def dbnunfoldtonn(outputsize: Int): (Array[Int], Int, Array[BDM[Double]]) = {
    // 1. layer sizes and layer count, extended by the optional output layer
    val size: Array[Int] = if (outputsize > 0) config.size :+ outputsize else config.size
    val layer = if (outputsize > 0) config.layer + 1 else config.layer
    // 2. per-layer matrix: dbn_c(i) concatenated horizontally with dbn_W(i)
    val initW = dbn_W.indices.map(i => BDM.horzcat(dbn_c(i), dbn_W(i))).toArray
    (size, layer, initW)
  }
}

Source File: StringKeyRDD.scala    From cuesheet   with Apache License 2.0 5 votes vote down vote up
package com.kakao.cuesheet.convert

import java.nio.charset.StandardCharsets.UTF_8

import org.apache.spark.rdd.RDD

import scala.concurrent.duration._

/**
 * Save operations for an RDD of `(String, T)` pairs.
 *
 * NOTE(review): the receiver of the `{ case ... }` conversion blocks and the
 * closing braces appear to be missing — verify against the upstream file.
 */
class StringKeyRDD[T](rdd: RDD[(String, T)]) extends SaveToES(rdd) {

  /**
   * Writes every pair to the Couchbase bucket `bucket`, in groups of 1000, each
   * group wrapped in `Retry(10, 100.millis)`.
   *
   * Values are converted first: `Array[Byte]` is decoded as UTF-8, `String` is
   * kept as is, anything else goes through `toJson`.
   *
   * @param nodes    nodes handed to `Couchbase(...)`
   * @param bucket   bucket name
   * @param expiry   forwarded to `putAll`
   * @param maxRate  total rate; divided by the number of entries in
   *                 `getExecutorMemoryStatus` before being handed to `putAll`
   * @param password bucket password, `null` by default
   */
  def saveToCouchbase(nodes: Seq[String], bucket: String, expiry: Int = 0, maxRate: Double = 1e7, password: String = null): Unit = {
    // rate per executor
    val rate = maxRate / rdd.sparkContext.getExecutorMemoryStatus.size

    rdd.foreachPartition { partition =>
      // BackPressureException may happen, so retry 10 times
      // if that fails, Spark task scheduler may retry again.
      val cluster = Couchbase(nodes: _*)
      val client = cluster.bucket(bucket, password)

      // NOTE(review): presumably applied to `partition`; the receiver is missing — confirm.
      val converted = {
        case (key, value: Array[Byte]) => (key, new String(value, UTF_8))
        case (key, value: String) => (key, value)
        case (key, value) => (key, toJson(value))

      for (group <- converted.grouped(1000)) {
        Retry(10, 100.millis) {
          client.putAll(group, rate, expiry).sync()


  /**
   * Writes every pair to the HBase column `family:qualifier` of `table`, in
   * groups of 1000, each group wrapped in `Retry(10, 100.millis)`.
   *
   * Keys are encoded as UTF-8 bytes. Values are converted first: `Array[Byte]`
   * is kept as is, `String` is encoded as UTF-8, anything else goes through
   * `serialize`.
   *
   * @param quorum    handed to `HBase(...)`
   * @param table     table name
   * @param family    column family
   * @param qualifier column qualifier
   * @param maxRate   total rate; divided by the number of entries in
   *                  `getExecutorMemoryStatus` before being handed to `putAllBytes`
   */
  def saveToHBase(quorum: String, table: String, family: String, qualifier: String, maxRate: Double = 1e7): Unit = {
    // rate per executor
    val rate = maxRate / rdd.sparkContext.getExecutorMemoryStatus.size

    rdd.foreachPartition { partition =>
      val hbase = HBase(quorum)
      val column = hbase.column(table, family, qualifier)

      // NOTE(review): presumably applied to `partition`; the receiver is missing — confirm.
      val converted = {
        case (key, value: Array[Byte]) => (key.getBytes(UTF_8), value)
        case (key, value: String) => (key.getBytes(UTF_8), value.getBytes(UTF_8))
        case (key, value) => (key.getBytes(UTF_8), serialize(value))

      for (group <- converted.grouped(1000)) {
        Retry(10, 100.millis) {
          column.putAllBytes(group, rate).sync()
Example 187
package com.kakao.cuesheet.convert

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.collection.JavaConversions._

/**
 * Read helpers exposing HBase tables as RDDs of
 * `(rowkey, ((family, qualifier), (timestamp, value)))` tuples.
 *
 * NOTE(review): `hbaseTableBinary` and the `.string` conversion used below are
 * not defined in the visible part of this trait, and closing braces appear to
 * be missing — verify against the upstream file.
 */
trait HBaseReaders {
  // Spark context to be supplied by the implementing class.
  val sc: SparkContext

  /** String view of `hbaseTableBinary`: row key, family, qualifier and value are converted with `.string`; the timestamp is passed through. */
  def hbaseTable(quorum: String, table: String): RDD[(String, ((String, String), (Long, String)))] = {
    hbaseTableBinary(quorum, table).map {
      case (rowkey, ((family, qualifier), (timestamp, value))) =>
        (rowkey.string, ((family.string, qualifier.string), (timestamp, value.string)))

  /** Cells of `hbaseTableBinary` whose family and qualifier bytes equal the given ones (element-wise), as `(rowkey, (timestamp, value))`. */
  def hbaseColumnBinary(quorum: String, table: String, family: Array[Byte], qualifier: Array[Byte]): RDD[(Array[Byte], (Long, Array[Byte]))] = {
    hbaseTableBinary(quorum, table).collect {
      // sameElements: arrays need element-wise comparison, `==` would compare references
      case (rowkey, ((f, q), cell)) if family.sameElements(f) && qualifier.sameElements(q) => (rowkey, cell)

  /** Cells of `hbaseTable` whose family and qualifier equal the given strings, as `(rowkey, (timestamp, value))`. */
  def hbaseColumn(quorum: String, table: String, family: String, qualifier: String): RDD[(String, (Long, String))] = {
    hbaseTable(quorum, table).collect {
      case (rowkey, ((f, q), cell)) if family == f && qualifier == q => (rowkey, cell)
Source File: JoinableRDD.scala    From cuesheet   with Apache License 2.0 5 votes vote down vote up
package com.kakao.cuesheet.convert

import org.apache.spark.HashPartitioner
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

/**
 * Join helpers for a pair RDD, built on `cogroup` followed by a join on
 * `(key, index)` pairs.
 *
 * NOTE(review): parts of the lambdas in `fastJoin` (the collection calls that
 * the `case (v, idx)` and `w => ...` functions are applied to) and closing
 * braces appear to be missing — verify against the upstream file.
 */
class JoinableRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) {

  /** Joins the RDD with itself through `fastJoin`. */
  def selfJoin(numPartitions: Int = rdd.partitions.length): RDD[(K, (V, V))] = fastJoin(rdd, numPartitions)

  /**
   * Joins this RDD with `other` by key.
   *
   * @param other         right-hand side of the join
   * @param numPartitions partition count of the `HashPartitioner` applied to the right side
   */
  def fastJoin[W](other: RDD[(K, W)], numPartitions: Int = rdd.partitions.length): RDD[(K, (V, W))] = {
    val partitioner = new HashPartitioner(numPartitions)
    val grouped = rdd cogroup other

    // Left side keyed by (k, idx).
    // NOTE(review): presumably idx is the position of v within vs — the producing call is missing.
    val left = grouped.flatMap{
      case (k, (vs, ws)) => {
        case (v, idx) => ((k, idx), v)

    // Right side: each w is first keyed by (k, hash of w) and repartitioned, then
    // repeated once per index in 0 until vs.size and re-keyed by (k, idx).
    val right = grouped.flatMap {
      case (k, (vs, ws)) => { w => ((k, w.hashCode()), (w, vs.size)) }
    }.partitionBy(partitioner).flatMap {
      case ((k, r), (w, size)) => (0 until size).map(i => ((k, w), i))
    }.map {
      case ((k, w), idx) => ((k, idx), w)

    // Join on (k, idx), then drop the index.
    (left join right).map {
      case ((k, idx), (v, w)) => (k, (v, w))

Source File: SavingStream.scala    From cuesheet   with Apache License 2.0 5 votes vote down vote up
package com.kakao.cuesheet.convert

import{NamedExecutors, RichExecutorService}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

import java.util.concurrent.{Future => JFuture}
import scala.reflect.runtime.universe.TypeTag

/**
 * Date/time formatters used to build partition values.
 *
 * NOTE(review): the members after `m0` reference `stream`, `es` and `toDF`,
 * none of which are defined here; they presumably belong to a `SavingStream`
 * class whose header is missing, as are the closing braces — verify against
 * the upstream file.
 */
object SavingStream {
  val yyyyMMdd = ThreadSafeDateFormat("yyyy-MM-dd")
  val hh = ThreadSafeDateFormat("HH")
  val mm = ThreadSafeDateFormat("mm")
  // First character of the "mm"-formatted value followed by "0".
  val m0 = (ms: Long) => mm(ms).charAt(0) + "0"

  // Created on first use by `ex`; not serialized.
  @transient var executor: RichExecutorService = _

  /** Returns `executor`, creating it from `es.get()` if it is still null (null is checked again inside `this.synchronized`). */
  def ex: RichExecutorService = {
    if (executor == null) {
      this.synchronized {
        if (executor == null) {
          executor = new RichExecutorService(es.get())

  /**
   * For every RDD of the stream, submits a task to `ex` that converts the RDD
   * with `toDF` and appends it to the partition of external table `table`
   * given by `toPartition(time)`.
   *
   * @param table       table name
   * @param path        forwarded to `appendToExternalTablePartition`
   * @param format      storage format, "orc" by default
   * @param toPartition maps the batch time to `(column, value)` partition pairs
   */
  def saveAsPartitionedTable(table: String, path: String, format: String = "orc")(toPartition: Time => Seq[(String, String)]): Unit = {
    stream.foreachRDD { (rdd, time) =>
      ex.submit {
        toDF(rdd).appendToExternalTablePartition(table, path, format, toPartition(time): _*)

  /** `saveAsPartitionedTable` with a single partition column holding the `yyyy-MM-dd` form of the batch time. */
  def saveAsDailyPartitionedTable(table: String, path: String, dateColumn: String = "date", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms))

  /** `saveAsPartitionedTable` with date (`yyyy-MM-dd`) and hour (`HH`) partition columns. */
  def saveAsHourlyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms))

  /** `saveAsPartitionedTable` with date, hour and minute partition columns, the minute value coming from `m0`. */
  def saveAsTenMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> m0(ms))

  /** `saveAsPartitionedTable` with date, hour and minute (`mm`) partition columns. */
  def saveAsMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> mm(ms))


// NOTE(review): the closing braces of the four classes below, and the bodies of
// JsonStream.toDF and MapStream.toDF, appear to be missing — verify against the
// upstream file.

/** `SavingStream` over `Product` elements; the DataFrame comes from `ctx.createDataFrame(rdd)`. */
class ProductStream[T <: Product : TypeTag](stream: DStream[T])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[T](stream) {
  override def toDF(rdd: RDD[T]) = ctx.createDataFrame(rdd)

/** `SavingStream` over `String` elements. */
class JsonStream(stream: DStream[String])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[String](stream) {
  // NOTE(review): right-hand side missing.
  override def toDF(rdd: RDD[String]) =

/** `SavingStream` over `Map[String, T]` elements. */
class MapStream[T](stream: DStream[Map[String, T]])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[Map[String, T]](stream) {

  // NOTE(review): right-hand side missing.
  override def toDF(rdd: RDD[Map[String, T]]) =

/** `SavingStream` over `Row` elements; the DataFrame is built with the implicit `schema`. */
class RowStream(stream: DStream[Row])(implicit ctx: HiveContext, es: ExecutorSupplier, schema: StructType) extends SavingStream[Row](stream) {
  override def toDF(rdd: RDD[Row]): DataFrame = ctx.createDataFrame(rdd, schema)
Source File: MemsqlRDD.scala    From memsql-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark

import java.sql.{Connection, PreparedStatement, ResultSet}

import com.memsql.spark.SQLGen.VariableList
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils}
import org.apache.spark.sql.types._
import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}

/**
 * RDD of `Row`s produced by running a query over JDBC, one query execution per
 * partition.
 *
 * NOTE(review): several expressions (the task-completion listener body, the
 * datatype lists, the receiver of the `columnEncoders` cases, the final row
 * mapping) and closing braces appear to be missing — verify against the
 * upstream file.
 *
 * @param query          query handed to `MemsqlQueryHelpers.GetPartitions`
 * @param variables      query variables, also handed to `GetPartitions`
 * @param options        connector options; also used for `options.assert`
 * @param schema         schema used to turn the result set into rows
 * @param expectedOutput when non-empty, triggers the datatype comparison in `compute`
 * @param sc             Spark context (transient)
 */
case class MemsqlRDD(query: String,
                     variables: VariableList,
                     options: MemsqlOptions,
                     schema: StructType,
                     expectedOutput: Seq[Attribute],
                     @transient val sc: SparkContext)
    extends RDD[Row](sc, Nil) {

  /** Partitions as computed by `MemsqlQueryHelpers.GetPartitions`. */
  override protected def getPartitions: Array[Partition] =
    MemsqlQueryHelpers.GetPartitions(options, query, variables)

  /**
   * Opens a connection for the partition, executes the partition's query and
   * returns the resulting rows wrapped in an interruptible iterator; `close` is
   * handed to `CompletionIterator` as its completion callback.
   *
   * @param rawPartition must be a `MemsqlPartition` (it is cast unconditionally)
   * @param context      task context used for the listener and the interruptible iterator
   */
  override def compute(rawPartition: Partition, context: TaskContext): Iterator[Row] = {
    // JDBC resources start as null so that close() can run at any point.
    var closed                     = false
    var rs: ResultSet              = null
    var stmt: PreparedStatement    = null
    var conn: Connection           = null
    var partition: MemsqlPartition = rawPartition.asInstanceOf[MemsqlPartition]

    // Closes `what` if non-null; an Exception is logged as a warning instead of propagating.
    def tryClose(name: String, what: AutoCloseable): Unit = {
      try {
        if (what != null) { what.close() }
      } catch {
        case e: Exception => logWarning(s"Exception closing $name", e)

    // Closes result set, statement and connection, in that order, at most once.
    def close(): Unit = {
      if (closed) { return }
      tryClose("resultset", rs)
      tryClose("statement", stmt)
      tryClose("connection", conn)
      closed = true

    // NOTE(review): listener body missing; presumably it calls close() — confirm.
    context.addTaskCompletionListener { context =>

    conn = JdbcUtils.createConnectionFactory(partition.connectionInfo)()
    stmt = conn.prepareStatement(partition.query)
    JdbcHelpers.fillStatement(stmt, partition.variables)
    rs = stmt.executeQuery()

    var rowsIter = JdbcUtils.resultSetToRows(rs, schema)

    if (expectedOutput.nonEmpty) {
      // NOTE(review): both right-hand sides are missing.
      val schemaDatatypes   =
      val expectedDatatypes =

      if (schemaDatatypes != expectedDatatypes) {
        // One encoder per ((schema type, expected type), column index):
        // String -> Null yields null; Short/Integer/Long -> Boolean yields `value != 0`.
        val columnEncoders = {
          case ((_: StringType, _: NullType), _)     => ((_: Row) => null)
          case ((_: ShortType, _: BooleanType), i)   => ((r: Row) => r.getShort(i) != 0)
          case ((_: IntegerType, _: BooleanType), i) => ((r: Row) => r.getInt(i) != 0)
          case ((_: LongType, _: BooleanType), i)    => ((r: Row) => r.getLong(i) != 0)

          // Any other pair must have equal types; the value is then passed through unchanged.
          case ((l, r), i) => {
            options.assert(l == r, s"MemsqlRDD: unable to encode ${l} into ${r}")
            ((r: Row) => r.get(i))

        // NOTE(review): the argument of Row.fromSeq is missing.
        rowsIter = rowsIter
          .map(row => Row.fromSeq(

    CompletionIterator[Row, Iterator[Row]](new InterruptibleIterator[Row](context, rowsIter), close)

Source File: KMeanTest.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector}

import scala.util.Random

//spark/bin/spark-submit --master spark:// --class  ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9

//guale spark/bin/spark-submit --master spark:// --class  ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15

/**
 * Driver that generates random sparse vectors and times a k-means run.
 *
 * NOTE(review): the model configuration/training calls, the value returned by
 * the generating lambda, the receiver of the final `println` and several
 * closing braces appear to be missing — verify against the upstream file.
 */
object ScalableKMeanTest {

  /**
   * Positional arguments, as read below:
   * 0 = k, 1 = vector dimension, 2 = number of records, 3 = sparsity,
   * 4 = iterations, 5 = implementation ("my" selects `ScalableKMeans`,
   * anything else `KMeans`), 6 = number of partitions.
   */
  def main(args: Array[String]) {

    val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}")
    val sc = new SparkContext(conf)

    val k = args(0).toInt
    val dimension = args(1).toInt
    val recordNum = args(2).toInt
    val sparsity = args(3).toDouble
    val iterations = args(4).toInt
    val means = args(5)
    val parNumber = args(6).toInt

    // One random sparse vector per record, with (dimension * sparsity).toInt non-zero entries.
    val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => {
      val ran = new Random()
      // Distinct, sorted indices drawn by shuffling 0 until dimension.
      val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray
      // Random values, sorted ascending.
      val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray
      val vec: Vector = new SparseVector(dimension, indexArr, valueArr)
    println(args.mkString(", "))
    println(data.count() + " records generated")

    // Start of the timed section.
    val st = System.nanoTime()

    val model = if(means == "my") {
      println("running scalable kmeans")
      val model = new ScalableKMeans()
    } else {
      println("running mllib kmeans")
      val model = new KMeans()

    println((System.nanoTime() - st) / 1e9 + " seconds cost")
    // NOTE(review): the collection being mapped is missing.
    println( => v.numNonzeros).mkString("\n"))


Example 192
Source File: LRUtils.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.sparselr.Utils

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object LRUtils {
  /**
   * Decodes a variable-length integer from `buffer`, starting at `pos`.
   *
   * Each byte contributes its low 7 bits, least-significant group first; a set
   * high bit (0x80) means another byte follows.
   *
   * @param buffer bytes to decode from
   * @param pos    index of the first byte of the encoded value
   * @return `(value, index of the last byte consumed)`
   * @throws ArrayIndexOutOfBoundsException if the encoding runs past the end of `buffer`
   */
  def bytes2Int (buffer: Array[Byte], pos: Int): (Int, Int) = {
    @scala.annotation.tailrec
    def decode(cursor: Int, shift: Int, acc: Int): (Int, Int) = {
      val current = buffer(cursor)
      val merged = acc | ((current & 0x7F) << shift)
      // High bit set: more groups follow, each 7 bits further left.
      if ((current & 0x80) != 0) decode(cursor + 1, shift + 7, merged)
      else (merged, cursor)
    }

    decode(pos, 0, 0)
  }

  //featureId cached in X is localId
  /**
   * Reads a text file of samples and builds, per partition, one pair of
   * (labels, feature matrix).
   *
   * Each non-empty line not starting with '#' is split on spaces: the first
   * token is the label (`Double`), every remaining non-empty token has the form
   * `featureId:value` (`Int`:`Float`).
   *
   * NOTE(review): closing braces and the final result expression appear to be
   * missing — verify against the upstream file.
   *
   * @param sc            Spark context used to read the file
   * @param path          path handed to `textFile`
   * @param minPartitions minimum partition count handed to `textFile`
   */
  def loadFileAsMatrix(
                sc: SparkContext,
                path: String,
                minPartitions: Int): RDD[(Array[Double], Matrix)] = {
    // Skip blank lines and '#' comment lines.
    val lines = sc.textFile(path, minPartitions)
      .filter(line => !(line.isEmpty || line.startsWith("#")))

    val data = lines.mapPartitions { samples =>
      // Accumulated over the whole partition: one label and one sparse row per sample.
      val labels = new PrimitiveVector[Double]()
      val builder = new MatrixBuilder()

      samples.foreach { line =>
        val items = line.split(' ')

        labels += items.head.toDouble

        // Consecutive spaces produce empty tokens; drop them.
        val featureIdAndValues = items.tail.filter(_.nonEmpty)

        val indices = new PrimitiveVector[Int]()
        val values = new PrimitiveVector[Float]()
        featureIdAndValues.foreach { item =>
          val featureAndValue = item.split(":")
          indices += featureAndValue(0).toInt
          val value = featureAndValue(1).toFloat
          values += value
        builder.add(new SparseVector(indices.trim.array, values.trim.array))
      // A single element per partition.
      Iterator((labels.trim.array, builder.toMatrix))
Source File: LogisticRegression.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.sparselr

import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap
import org.apache.spark.mllib.sparselr.Utils._
import org.apache.spark.SparkEnv
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

/**
 * Training entry point operating on per-partition (labels, matrix) pairs.
 *
 * NOTE(review): this object is heavily truncated (missing receivers such as
 * the RDD the `{ point => ... }` lambdas are applied to, missing case bodies,
 * an empty first argument to `removeBroadcast`, missing right-hand sides and
 * closing braces) — verify against the upstream file.
 */
object LogisticRegression {
    /**
     * Builds a global feature index over all partitions, then runs `optimizer`
     * on the examples.
     *
     * @param input     per-partition (labels, matrix) pairs; only
     *                  `CompressedSparseMatrix` is accepted, anything else raises
     *                  `IllegalArgumentException`
     * @param optimizer optimizer whose `optimize` produces the final weights
     * @return `(global2hdfsIndex, weights)`; the weight vector is created with
     *         `global2hdfsIndex.size` entries
     */
    def train(input: RDD[(Array[Double], Matrix)],
              optimizer: Optimizer
              ): (Array[Int], Array[Double]) = {

      // Maps each collected feature id to its position in global2hdfsIndex (filled below).
      val hdfsIndex2global = new Int2IntOpenHashMap()
      // NOTE(review): a statement boundary and the receiver of the lambda are missing after `0`.
      var index = 0 { point =>
        point._2 match {
          case x: CompressedSparseMatrix =>
            println("x.length" + x.mappings.length)
          case _ =>
            throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.")

      // Distinct values collected from all partitions.
      // NOTE(review): receiver and the CompressedSparseMatrix case body are missing.
      val global2hdfsIndex = { point =>
        point._2 match {
          case x: CompressedSparseMatrix =>
          case _ =>
            throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.")
      }.collect().flatMap(t => t).distinct

      // Assign consecutive indices in collection order.
      global2hdfsIndex.foreach{value =>
        hdfsIndex2global.put(value, index)
        index += 1

      val bcHdfsIndex2global = input.context.broadcast(hdfsIndex2global)

      // NOTE(review): right-hand side missing.
      val examples =

      val numTraining = examples.count()
      println(s"Training: $numTraining.")

      // NOTE(review): first argument missing.
      SparkEnv.get.blockManager.removeBroadcast(, true)

      // Flatten every partition-level matrix into the elements produced by tupletIterator(y).
      val examplesTest = examples.mapPartitions(_.flatMap {
        case (y, part) => part.asInstanceOf[CompressedSparseMatrix].tupletIterator(y)})

      // Initial weights: all zeros, one per distinct collected value.
      val weights = Vectors.dense(new Array[Double](global2hdfsIndex.size))

      val newWeights = optimizer.optimize(examplesTest, weights)

      ((global2hdfsIndex, newWeights.toArray))

    /**
     * Rewrites, in place, every entry of a `CompressedSparseMatrix`'s `mappings`
     * array with the value the broadcast map holds for it.
     *
     * @param bchdfsIndex2global broadcast lookup map
     * @param partition          (labels, matrix) pair; any matrix type other than
     *                           `CompressedSparseMatrix` raises `IllegalArgumentException`
     */
    def global2globalMapping(bchdfsIndex2global: Broadcast[Int2IntOpenHashMap])
                     (partition: (Array[Double], Matrix)): (Array[Double], Matrix) = {
      val hdfsIndex2global = bchdfsIndex2global.value

      partition._2 match {
        case x: CompressedSparseMatrix =>
          val local2hdfsIndex = x.mappings
          for (i <- 0 until local2hdfsIndex.length) {
            // Overwrites the array owned by x.
            local2hdfsIndex(i) = hdfsIndex2global.get(local2hdfsIndex(i))
        case _ =>
          throw new IllegalArgumentException(s"dot doesn't support ${partition.getClass}.")
Example 194
Source File: OneWayANOVA.scala    From StatisticsOnSpark   with Apache License 2.0 5 votes vote down vote up
package main.ANOVA

import org.apache.commons.math3.distribution.FDistribution
import org.apache.spark.rdd.RDD

  /**
   * P-value of a one-way ANOVA over the given groups: the upper-tail
   * probability of the F statistic under an F distribution with the
   * between-group and within-group degrees of freedom.
   *
   * @param categoryData one RDD of observations per group
   * @return `1 - CDF(F)`
   */
  def anovaPValue(categoryData: Iterable[RDD[Double]]): Double = {
    val anovaStats = getAnovaStats(categoryData)

    // A null random generator is passed: the distribution is only evaluated here, never sampled.
    val fdist: FDistribution = new FDistribution(null, anovaStats.dfbg, anovaStats.dfwg)
    1.0 - fdist.cumulativeProbability(anovaStats.F)
  }

  /** ANOVA summary: between-group degrees of freedom, within-group degrees of freedom and the F statistic. */
  private case class ANOVAStats(dfbg: Double, dfwg: Double, F: Double)

  /**
   * Computes the one-way ANOVA degrees of freedom and F statistic.
   *
   * Every group is scanned three times (sum, sum of squares, count).
   *
   * @param categoryData one RDD of observations per group
   * @return between-group and within-group degrees of freedom and
   *         `F = (ssbg / dfbg) / (sswg / dfwg)`
   */
  private def getAnovaStats(categoryData: Iterable[RDD[Double]]): ANOVAStats = {
    var dfwg: Long = 0       // within-group degrees of freedom: sum of (n_i - 1)
    var sswg: Double = 0     // within-group sum of squares
    var totsum: Double = 0   // sum of all observations
    var totsumsq: Double = 0 // sum of all squared observations
    var totnum: Long = 0     // number of observations

    for (data <- categoryData) {
      val sum: Double = data.sum()
      val sumsq: Double = data.map(i => i * i).sum()
      val num = data.count()
      totnum += num
      totsum += sum
      totsumsq += sumsq
      dfwg += num - 1
      // Sum of squared deviations from the group mean.
      val ss: Double = sumsq - ((sum * sum) / num)
      sswg += ss
    }

    // Total sum of squares; the between-group part is what the groups do not explain internally.
    val sst: Double = totsumsq - ((totsum * totsum) / totnum)
    val ssbg: Double = sst - sswg
    val dfbg: Int = categoryData.size - 1
    val msbg: Double = ssbg / dfbg
    val mswg: Double = sswg / dfwg
    val F: Double = msbg / mswg
    ANOVAStats(dfbg, dfwg, F)
  }

Source File: TwoSampleIndependentTTest.scala    From StatisticsOnSpark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.TDistribution
import org.apache.commons.math3.util.FastMath
import org.apache.spark.rdd.RDD

  /**
   * Two-sided p-value of the unequal-variance (Welch) t-test for two
   * independent samples.
   *
   * Each sample is scanned three times (count, sum, squared deviations).
   * With fewer than two observations in a sample the variance, and therefore
   * the result, is not a finite number.
   *
   * @param sample1 first sample
   * @param sample2 second sample
   * @return `2 * P(T <= -|t|)` under a t distribution with the
   *         Welch–Satterthwaite degrees of freedom
   */
  def tTest(sample1: RDD[Double], sample2: RDD[Double]): Double = {
    // Counts are converted to Double so that n * n below cannot overflow a Long.
    val n1 = sample1.count().toDouble
    val n2 = sample2.count().toDouble
    val m1 = sample1.sum() / n1
    val m2 = sample2.sum() / n2
    // Unbiased sample variances.
    val v1 = sample1.map(d => (d - m1) * (d - m1)).sum() / (n1 - 1)
    val v2 = sample2.map(d => (d - m2) * (d - m2)).sum() / (n2 - 1)
    // Squared standard errors of the two means.
    val se1 = v1 / n1
    val se2 = v2 / n2
    val t: Double = math.abs((m1 - m2) / FastMath.sqrt(se1 + se2))
    val degreesOfFreedom: Double = ((se1 + se2) * (se1 + se2)) /
      ((v1 * v1) / (n1 * n1 * (n1 - 1d)) + (v2 * v2) / (n2 * n2 * (n2 - 1d)))

    // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
    val distribution: TDistribution = new TDistribution(null, degreesOfFreedom)
    2.0 * distribution.cumulativeProbability(-t)
  }

Source File: EtlProcessor.scala    From etl-light   with MIT License 5 votes vote down vote up
package yamrcraft.etlite.processors

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.DefaultDecoder
import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka._
import org.slf4j.LoggerFactory
import yamrcraft.etlite.Settings
import yamrcraft.etlite.state.{KafkaOffsetsState, KafkaStateManager}
import yamrcraft.etlite.transformers.InboundMessage

/**
 * Batch entry point that creates a SparkContext, obtains Kafka offset state through a
 * KafkaStateManager, builds an RDD of InboundMessage with KafkaUtils.createRDD and
 * hands each partition to a PartitionProcessor.
 *
 * NOTE(review): this listing appears to have lost text during extraction -- closing braces
 * are absent, several string literals are missing the call that should precede them, and
 * some argument lists / right-hand sides are empty. Restore from the upstream source
 * before compiling; the per-line notes below mark the visible gaps.
 */
object EtlProcessor {

  val logger = LoggerFactory.getLogger(this.getClass)

  /**
   * Runs one batch: reads the last state, asks the state manager for the next state,
   * creates the RDD for that state and processes it under the state's jobId.
   */
  def run(settings: Settings) = {
    val context = createContext(settings)

    val stateManager = new KafkaStateManager(settings.etl.state)

    // NOTE(review): the string literal fused onto the next statement presumably belonged to a
    // separate logging call (a `logger` is defined above) -- TODO confirm against upstream.
    val lastState = stateManager.readState"last persisted state: $lastState")

    // NOTE(review): same truncation pattern as above.
    val currState = stateManager.fetchNextState(lastState, settings)"batch working state: $currState")

    val rdd = createRDD(context, currState, settings)
    // NOTE(review): whatever followed the "committing state" message is not visible here.
    processRDD(rdd, currState.jobId, settings)"committing state")

  /** Builds the SparkContext from a fresh SparkConf. */
  private def createContext(settings: Settings) = {
    // NOTE(review): `settings` is unused in the visible code; configuration calls on the
    // SparkConf may have been dropped from this listing -- verify.
    val sparkConf = new SparkConf()

    new SparkContext(sparkConf)

  /**
   * Creates the Kafka-backed RDD, decoding keys and values as raw byte arrays and mapping
   * each record to an InboundMessage(topic, key, message).
   */
  private def createRDD(context: SparkContext, state: KafkaOffsetsState, settings: Settings): RDD[InboundMessage] = {
    // NOTE(review): `context`, `state` and `settings` are not passed in the visible argument
    // list, so leading arguments look to be missing -- TODO confirm against upstream.
    KafkaUtils.createRDD[Array[Byte], Array[Byte], DefaultDecoder, DefaultDecoder, InboundMessage](
      Map[TopicAndPartition, Broker](),
      (msgAndMeta: MessageAndMetadata[Array[Byte], Array[Byte]]) => { InboundMessage(msgAndMeta.topic, msgAndMeta.key(), msgAndMeta.message()) }

  /** Processes the RDD partition by partition, constructing a PartitionProcessor per partition. */
  private def processRDD(kafkaRDD: RDD[InboundMessage], jobId: Long, settings: Settings) = {
    // passed to remote workers
    // NOTE(review): the trailing string literal is missing its call, and `${}` is empty.
    val etlSettings = settings.etl"RDD processing started [rdd=${}, jobId=$jobId]")

    // NOTE(review): the right-hand side of this definition is missing from the listing.
    val rdd =

    rdd.foreachPartition { partition =>
        // executed at the worker
        // NOTE(review): `partition` is never used in the visible code; a call on the
        // PartitionProcessor was presumably dropped -- verify.
        new PartitionProcessor(jobId, TaskContext.get.partitionId(), etlSettings)
      }"RDD processing ended [rdd=${}, jobId=$jobId]")

Source File: YahooParser.scala    From spark-timeseries   with Apache License 2.0 5 votes vote down vote up
package com.cloudera.sparkts.parsers

import com.cloudera.sparkts.TimeSeries
import com.cloudera.sparkts.TimeSeries._
import java.time._
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

/**
 * Helpers that turn comma-separated text (one header line followed by data lines whose
 * first field is a date) into TimeSeries values.
 *
 * NOTE(review): parts of this listing appear to have been lost during extraction (closing
 * braces, the receivers of two expressions, and the value produced per sample line);
 * restore from the upstream source before compiling.
 */
object YahooParser {
  /**
   * Parses one text blob into a TimeSeries.
   *
   * @param text      newline-separated, comma-delimited content; the first line supplies labels
   * @param keyPrefix string combined with the labels (exact form not visible here -- see note)
   * @param zone      zone used to turn each parsed LocalDate into a start-of-day date-time
   * @return the result of timeSeriesFromIrregularSamples over the parsed samples and labels
   */
  def yahooStringToTimeSeries(
    text: String,
    keyPrefix: String = "",
    zone: ZoneId = ZoneId.systemDefault())
    : TimeSeries[String] = {
    val lines = text.split('\n')
    // NOTE(review): expression is truncated; `+ _)` looks like the tail of a lambda that
    // presumably prepends keyPrefix to each label -- TODO confirm against upstream.
    val labels = lines(0).split(',') + _)
    // NOTE(review): the collection being mapped over is missing before `{ line =>`.
    val samples = { line =>
      val tokens = line.split(',')
      // First token of each line is parsed as an ISO local date.
      val dt = LocalDate.parse(tokens.head).atStartOfDay(zone)
    // NOTE(review): the value returned per line and the end of the lambda are not visible.
    timeSeriesFromIrregularSamples(samples, labels, zone)

  /**
   * Reads every file under `dir` as a whole and parses each into a TimeSeries, passing the
   * last '/'-separated segment of the file path as the key prefix.
   *
   * @param dir  directory handed to SparkContext.wholeTextFiles
   * @param sc   Spark context used to read the files
   * @param zone zone forwarded to yahooStringToTimeSeries
   */
  def yahooFiles(
    dir: String,
    sc: SparkContext,
    zone: ZoneId = ZoneId.systemDefault())
    : RDD[TimeSeries[String]] = {
    sc.wholeTextFiles(dir).map { case (path, text) =>
      YahooParser.yahooStringToTimeSeries(text, path.split('/').last, zone)
Source File: DatasourceRDD.scala    From datasource-receiver   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming.datasource.receiver

import org.apache.spark.partial.{BoundedDouble, CountEvaluator, PartialResult}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.streaming.datasource.config.ParametersUtils
import org.apache.spark.streaming.datasource.models.{InputSentences, OffsetOperator}
import org.apache.spark.{Logging, Partition, TaskContext}

/**
 * RDD of Row with no parent dependencies that delegates its partitions, computation and
 * preferred locations to the RDD of a DataFrame built from `inputSentences`.
 *
 * NOTE(review): parts of this listing appear to have been lost during extraction (closing
 * braces, one branch body in `parseInitialQuery`, the first branch of `offsetValue`, and
 * the contents of an empty `${}`); restore from the upstream source before compiling.
 * `datasourceParams` is not referenced in the visible code.
 */
class DatasourceRDD(
                     @transient sqlContext: SQLContext,
                     inputSentences: InputSentences,
                     datasourceParams: Map[String, String]
                   ) extends RDD[Row](sqlContext.sparkContext, Nil) with Logging with ParametersUtils {

  // Cached row total consulted by isEmpty(); never assigned in the visible code.
  private var totalCalculated: Option[Long] = None

  private val InitTableName = "initTable"
  private val LimitedTableName = "limitedTable"
  private val TempInitQuery = s"select * from $InitTableName"

  // Without offset conditions the configured query runs unchanged; with them, the parsed
  // query is extended with condition, order and limit fragments before execution.
  val dataFrame = inputSentences.offsetConditions.fold(sqlContext.sql(inputSentences.query)) { case offset =>
    val parsedQuery = parseInitialQuery
    val conditionsSentence = offset.fromOffset.extractConditionSentence(parsedQuery)
    val orderSentence = offset.fromOffset.extractOrderSentence(parsedQuery, inverse = offset.limitRecords.isEmpty)
    val limitSentence = inputSentences.extractLimitSentence

    sqlContext.sql(parsedQuery + conditionsSentence + orderSentence + limitSentence)

  /**
   * Returns the query to extend with offset clauses; the configured query is returned
   * unchanged when the visible keyword checks do not match.
   */
  private def parseInitialQuery: String = {
    // NOTE(review): the condition ends in a dangling `||` and the true-branch body is empty,
    // so at least one check and the branch result are missing -- TODO confirm upstream.
    if (inputSentences.query.toUpperCase.contains("WHERE") ||
      inputSentences.query.toUpperCase.contains("ORDER") ||
    ) {
    } else inputSentences.query

  /**
   * Produces the InputSentences for the next run: when the DataFrame has rows and offset
   * conditions exist, returns a copy whose fromOffset carries a new value and the
   * progress operator; otherwise returns `inputSentences` unchanged.
   */
  def progressInputSentences: InputSentences = {
    if (!dataFrame.rdd.isEmpty()) {
      inputSentences.offsetConditions.fold(inputSentences) { case offset =>

        // NOTE(review): the value for the `limitRecords.isEmpty` branch is missing, and the
        // else branch stops after building `limitedQuery` -- verify against upstream.
        val offsetValue = if (offset.limitRecords.isEmpty)
        else {
          // NOTE(review): the `${}` placeholder is empty; presumably the offset field name.
          val limitedQuery = s"select * from $LimitedTableName order by ${} " +
            s"${OffsetOperator.toInverseOrderOperator(offset.fromOffset.operator)} limit 1"


        inputSentences.copy(offsetConditions = Option(offset.copy(fromOffset = offset.fromOffset.copy(
          value = Option(offsetValue),
          operator = OffsetOperator.toProgressOperator(offset.fromOffset.operator)))))
    } else inputSentences

  /**
   * Uses the cached total when present (empty iff it equals 0); otherwise reports empty
   * when there are no partitions or `take(1)` yields nothing.
   */
  override def isEmpty(): Boolean = {
    totalCalculated.fold {
      withScope {
        partitions.length == 0 || take(1).length == 0
    } { total => total == 0L }

  // The three overrides below forward directly to the underlying DataFrame's RDD.
  override def getPartitions: Array[Partition] = dataFrame.rdd.partitions

  override def compute(thePart: Partition, context: TaskContext): Iterator[Row] = dataFrame.rdd.compute(thePart, context)

  override def getPreferredLocations(thePart: Partition): Seq[String] = dataFrame.rdd.preferredLocations(thePart)
Source File: JsonInputStreamQuery.scala    From spark-cep   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.streaming.examples

import scala.collection.mutable.SynchronizedQueue

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.streaming.StreamSQLContext
import org.apache.spark.streaming.{Duration, StreamingContext}

/**
 * Example program: feeds the lines of a JSON file into a queue-backed DStream, infers a
 * schema from the same file, and prints the result of `SELECT * FROM jsonTable`.
 *
 * NOTE(review): parts of this listing appear to have been lost during extraction (closing
 * braces, the body of the `foreach` lambda, and the call that wraps the `jsonDStream(...)`
 * arguments); restore from the upstream source before compiling.
 */
object JsonInputStreamQuery {
  /** Program entry point; `args` is not read in the visible code. */
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext("local[10]", "test", Duration(3000))
    val sc = ssc.sparkContext
    val streamSqlContext = new StreamSQLContext(ssc, new SQLContext(sc))
    // Brings the context's members (e.g. `sql` used below) into scope.
    import streamSqlContext._
    // Here we read data line by line from a given file and then put it into a queue DStream.
    // You can replace any kind of String type DStream here including kafka DStream.
    val queue = new SynchronizedQueue[RDD[String]]()
    // NOTE(review): no import for `Source` is visible (presumably scala.io.Source), and the
    // lambda body that should enqueue each line is missing -- TODO confirm upstream.
    Source.fromFile("src/main/resources/student.json").getLines().foreach(msg =>
    val queueDStream = ssc.queueStream[String](queue)
    // We can infer the schema of json automatically by using inferJsonSchema
    val schema = streamSqlContext.inferJsonSchema("src/main/resources/student.json")
    // NOTE(review): the call receiving these two arguments is missing; presumably it
    // registers the JSON DStream under the name "jsonTable" -- verify.
      streamSqlContext.jsonDStream(queueDStream, schema), "jsonTable")
    sql("SELECT * FROM jsonTable").print()
    // NOTE(review): no `ssc.start()` is visible before awaiting termination -- verify.
    ssc.awaitTerminationOrTimeout(30 * 1000)
Example 200
Source File: ExistingDStream.scala    From spark-cep   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.streaming

import org.apache.spark.rdd.{EmptyRDD, RDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

case class PhysicalDStream(output: Seq[Attribute], @transient stream: DStream[InternalRow])
    extends SparkPlan with StreamPlan {

  def children = Nil

  override def doExecute() = {
    assert(validTime != null)
    Utils.invoke(classOf[DStream[InternalRow]], stream, "getOrCompute", (classOf[Time], validTime))
      .getOrElse(new EmptyRDD[InternalRow](sparkContext))