org.apache.spark.rdd.RDD Scala Examples

The following examples show how to use org.apache.spark.rdd.RDD. Each example is preceded by the name of its source file and the project it was taken from.
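Before diving into the project code, here is a minimal, self-contained sketch (not taken from any of the projects below; the object name is illustrative) of the basic RDD workflow the examples build on: create an RDD with parallelize, chain lazy transformations, and trigger execution with an action.

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object RddBasics {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("rdd-basics"))

    // Create an RDD from a local collection, transform it lazily, then trigger execution.
    val numbers: RDD[Int] = sc.parallelize(1 to 10)
    val squaresOfEven: RDD[Int] = numbers.filter(_ % 2 == 0).map(n => n * n)
    println(squaresOfEven.collect().mkString(", ")) // 4, 16, 36, 64, 100

    sc.stop()
  }
}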
Example 1
Source File: DeltaQA.scala    From spark-tools   with Apache License 2.0
package io.univalence.deltaqa.kpialgebra

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import shapeless.contrib.spire._
import spire.algebra._
import spire.implicits._

import scala.reflect.ClassTag

case class DeltaPart[T: AdditiveMonoid](
  count: Long,
  part: T
)

case class DeltaCommon[T: AdditiveMonoid](
  count: Long,
  countZero: Long,
  diff: T,
  error: T,
  left: T,
  right: T
)

case class Delta[L: AdditiveMonoid, R: AdditiveMonoid, C: AdditiveMonoid](
  left: DeltaPart[L],
  right: DeltaPart[R],
  common: DeltaCommon[C]
)

object KpiAlgebra {

  def computeCommon[LRC: AdditiveAbGroup: MultiplicativeSemigroup](left: LRC, right: LRC): DeltaCommon[LRC] = {
    val diff  = left - right
    val error = diff * diff
    DeltaCommon(
      count     = 1,
      countZero = if (diff == Monoid.additive[LRC].id) 1 else 0,
      diff      = diff,
      error     = error,
      left      = left,
      right     = right
    )
  }

  def monoid[LM: AdditiveMonoid, RM: AdditiveMonoid, LRC: AdditiveMonoid]: Monoid[Delta[LM, RM, LRC]] =
    Monoid.additive[Delta[LM, RM, LRC]]

  def compare[
    K: ClassTag,
    L: ClassTag,
    R: ClassTag,
    LM: AdditiveMonoid: ClassTag,
    RM: AdditiveMonoid: ClassTag,
    LRC: AdditiveAbGroup: MultiplicativeSemigroup: ClassTag
  ](
    left: RDD[(K, L)],
    right: RDD[(K, R)]
  )(flm: L => LM, frm: R => RM, flc: L => LRC, frc: R => LRC): Delta[LM, RM, LRC] = {

    val map: RDD[Delta[LM, RM, LRC]] = left
      .fullOuterJoin(right)
      .map({
        case (_, (Some(l), None)) =>
          monoid[LM, RM, LRC].id
            .copy(left = DeltaPart(count = 1, part = flm(l)))
        case (_, (None, Some(r))) =>
          monoid[LM, RM, LRC].id
            .copy(right = DeltaPart(count = 1, part = frm(r)))
        case (_, (Some(l), Some(r))) =>
          monoid[LM, RM, LRC].id.copy(common = computeCommon(flc(l), frc(r)))
      })

    map.reduce((x, y) => monoid[LM, RM, LRC].op(x, y))
  }
}

case class KpiLeaf(l1: Long, l2: Long, l3: Long)

object KpiAlgebraTest {

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("smoketest"))

    val parallelize: RDD[(Int, Int)] = sc.parallelize((1 to 4).zipWithIndex)

    println(KpiAlgebra.compare(parallelize, parallelize)(identity, identity, identity, identity))
    // Delta(DeltaPart(0,0),DeltaPart(0,0),DeltaCommon(4,4,0,0,6,6))

    val p2: RDD[(Int, KpiLeaf)] =
      sc.parallelize((1 to 4)).map(_ -> KpiLeaf(1, 2, 3))

    import spire.implicits._
    import shapeless.contrib.spire._

    // println(KpiAlgebra.compare(p2, p2)(identity, identity, identity, identity))

  }
} 
Example 2
Source File: Test1.scala    From BigData-News   with Apache License 2.0
package com.vita.spark.test

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Test1 {
  def main(args: Array[String]): Unit = {

    val conf: SparkConf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("TransformationOperator")
    val sc: SparkContext = new SparkContext(conf)
    val list: List[String] = List("张无忌", "赵敏", "周芷若")
    val rdd: RDD[String] = sc.parallelize(list)


    val list1: List[(Int, String)] = List((1, "东方不败"), (2, "令狐冲"), (3, "林平之"))
    val list2: List[(Int, Int)] = List((1, 99), (2, 98), (3, 97))

    val rdd1: RDD[(Int, String)] = sc.parallelize(list1)
    val rdd2: RDD[(Int, Int)] = sc.parallelize(list2)
    rdd1.join(rdd2).foreach(x => println("ID: " + x._1 + " name: " + x._2._1 + " score: " + x._2._2))

  }
} 
Example 3
Source File: SqlNetworkWordCount.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}


object SparkSessionSingleton {

  @transient  private var instance: SparkSession = _

  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder
        .config(sparkConf)
        .getOrCreate()
    }
    instance
  }
}
// scalastyle:on println 
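In the full SqlNetworkWordCount example, this singleton is consumed inside a streaming foreachRDD callback so that a single SparkSession is reused across batches. The following sketch of that usage relies on the imports shown above and assumes a text source on localhost:9999; it approximates the surrounding code that was trimmed from this excerpt rather than reproducing it verbatim.

case class Record(word: String)

object SqlNetworkWordCountSketch {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("SqlNetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))

    words.foreachRDD { (rdd: RDD[String], time: Time) =>
      // Reuse a single SparkSession across batches via the singleton defined above.
      val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
      import spark.implicits._

      val wordsDataFrame = rdd.map(w => Record(w)).toDF()
      wordsDataFrame.createOrReplaceTempView("words")
      val wordCountsDataFrame =
        spark.sql("select word, count(*) as total from words group by word")
      println(s"========= $time =========")
      wordCountsDataFrame.show()
    }

    ssc.start()
    ssc.awaitTermination()
  }
}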
Example 4
Source File: LocalTableScanExec.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.execution.metric.SQLMetrics



// Physical plan node for scanning rows from an in-memory local collection.
case class LocalTableScanExec(
    output: Seq[Attribute],
    rows: Seq[InternalRow]) extends LeafExecNode {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  private val unsafeRows: Array[InternalRow] = {
    if (rows.isEmpty) {
      Array.empty
    } else {
      val proj = UnsafeProjection.create(output, output)
      rows.map(r => proj(r).copy()).toArray
    }
  }

  private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1),
    sqlContext.sparkContext.defaultParallelism)

  private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism)

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    rdd.map { r =>
      numOutputRows += 1
      r
    }
  }

  override protected def stringArgs: Iterator[Any] = {
    if (rows.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def executeCollect(): Array[InternalRow] = {
    longMetric("numOutputRows").add(unsafeRows.size)
    unsafeRows
  }

  override def executeTake(limit: Int): Array[InternalRow] = {
    val taken = unsafeRows.take(limit)
    longMetric("numOutputRows").add(taken.size)
    taken
  }
} 
Example 5
Source File: GraphGeneration.scala    From Mastering-Machine-Learning-with-Spark-2.x   with MIT License
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.lib.TriangleCount
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}


object GraphGeneration extends App {

  val conf = new SparkConf()
    .setAppName("Graph generation")
    .setMaster("local[4]")
  val sc = new SparkContext(conf)

  val edgeListGraph = GraphLoader.edgeListFile(sc, "./edge_list.txt")

  val rawEdges: RDD[(VertexId, VertexId)] = sc.textFile("./edge_list.txt").map {
    line =>
      val field = line.split(" ")
      (field(0).toLong, field(1).toLong)
  }
  val edgeTupleGraph = Graph.fromEdgeTuples(
    rawEdges=rawEdges, defaultValue="")

  val gridGraph = GraphGenerators.gridGraph(sc, 5, 5)
  val starGraph = GraphGenerators.starGraph(sc, 11)
  val logNormalGraph  = GraphGenerators.logNormalGraph(
    sc, numVertices = 20, mu=1, sigma = 3
  )
  logNormalGraph.outDegrees.map(_._2).collect().sorted

  val actorGraph = GraphLoader.edgeListFile(
    sc, "./ca-hollywood-2009.txt", true
  ).partitionBy(PartitionStrategy.RandomVertexCut)
  actorGraph.edges.count()

  val actorComponents = actorGraph.connectedComponents().cache
  actorComponents.vertices.map(_._2).distinct().count

  val clusterSizes = actorComponents.vertices.map(
    v => (v._2, 1)).reduceByKey(_ + _)
  clusterSizes.map(_._2).max
  clusterSizes.map(_._2).min

  val smallActorGraph = GraphLoader.edgeListFile(sc, "./ca-hollywood-2009.txt")
  val strongComponents = smallActorGraph.stronglyConnectedComponents(numIter = 5)
  strongComponents.vertices.map(_._2).distinct().count

  val canonicalGraph = actorGraph.mapEdges(e => 1).removeSelfEdges().convertToCanonicalEdges()
  val partitionedGraph = canonicalGraph.partitionBy(PartitionStrategy.RandomVertexCut)

  actorGraph.triangleCount()
  val triangles = TriangleCount.runPreCanonicalized(partitionedGraph)

  actorGraph.staticPageRank(10)
  val actorPrGraph: Graph[Double, Double] = actorGraph.pageRank(0.0001)
  actorPrGraph.vertices.reduce((v1, v2) => {
    if (v1._2 > v2._2) v1 else v2
  })

  actorPrGraph.inDegrees.filter(v => v._1 == 33024L).collect.foreach(println)

  actorPrGraph.inDegrees.map(_._2).collect().sorted.takeRight(10)

  actorPrGraph.inDegrees.map(_._2).filter(_ >= 62).count

} 
Example 6
Source File: PipePrintSampleCorpus.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.reading.corpus

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.PipeSampler
import de.unihamburg.vsis.sddf.visualisation.Table
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable


class PipePrintSampleCorpus(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping)
  extends PipeElementPassthrough[RDD[Tuple]] with PipeSampler {

  def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: CorpusContext => {
        val sample: Array[Tuple] = pc.corpus.takeSample(false, count)
        val table: Seq[Seq[String]] = createTupleTable(sample)
        log.info("Corpus sample of " + sample.size + " tuples: ")
        Table.printTable(table)
      }
    }
  }

}

object PipePrintSampleCorpus {

  def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) = {
    new PipePrintSampleCorpus(count)
  }

} 
Example 7
Source File: PipeContextReadCorpus.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.reading.corpus

import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.pipe.PipeElement
import scala.reflect.ClassTag

class PipeContextReadCorpus[A: ClassTag] extends PipeElement[RDD[A], RDD[Tuple]] {

  def step(input: RDD[A])(implicit pipeContext: AbstractPipeContext): RDD[Tuple] = {
    pipeContext match {
      case pc: CorpusContext => pc.corpus
    }
  }
}

object PipeContextReadCorpus {

  def apply[A: ClassTag]() = new PipeContextReadCorpus[A]()

} 
Example 8
Source File: PipeAnalyseCorpus.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.reading.corpus

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.IdConverter
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.reading.TupleArray
import de.unihamburg.vsis.sddf.visualisation.model.ReadingModel
import de.unihamburg.vsis.sddf.pipe.context.ResultContext

class PipeAnalyseCorpus
  extends PipeElementPassthrough[RDD[Tuple]]
  with Serializable {

  override val _analysable = new ReadingModel

  def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = {
    _analysable.tuples_=(input)
    pipeContext match {
      case pc: ResultContext => {
        pc.readingModel = Some(_analysable)
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }

}

object PipeAnalyseCorpus {
  def apply() = {
    new PipeAnalyseCorpus()
  }
} 
Example 9
Source File: PipeStoreInContextGoldstandard.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

class PipeStoreInContextGoldstandard extends PipeElementPassthrough[RDD[SymPair[Tuple]]] {
  
  def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: GoldstandardContext => pc.goldstandard = input
    }
  }
}

object PipeStoreInContextGoldstandard {
  
  def apply() = new PipeStoreInContextGoldstandard()

} 
Example 10
Source File: PipeReaderGoldstandardIdsPairs.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.IdConverter
import de.unihamburg.vsis.sddf.reading.IdConverterBasic
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable


class PipeReaderGoldstandardIdsPairs(
    separator: Char = ',',
    idIndex1: Int = 0,
    idIndex2: Int = 1,
    idConverter: IdConverter = IdConverterBasic)
  extends PipeElement[RDD[String], RDD[SymPair[Long]]] {

  override def step(inputRdd: RDD[String])(implicit pipeContext: AbstractPipeContext): RDD[SymPair[Long]] = {
    inputRdd.map(line => {
      val parts = line.split(separator)
      val tupleId1 = idConverter.convert(parts(idIndex1).replaceAll("[^0-9]",""))
      val tupleId2 = idConverter.convert(parts(idIndex2).replaceAll("[^0-9]",""))
      new SymPair(tupleId1, tupleId2)
    })
  }

}

object PipeReaderGoldstandardIdsPairs {
  
  def apply(
      separator: Char = ',',
      idIndex1: Int = 0,
      idIndex2: Int = 1,
      idConverter: IdConverter = IdConverterBasic) = {
    new PipeReaderGoldstandardIdsPairs(separator, idIndex1, idIndex2, idConverter)
  }

} 
Example 11
Source File: PipeReaderGoldstandard.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.Pipeline
import de.unihamburg.vsis.sddf.reading.IdConverter
import de.unihamburg.vsis.sddf.reading.IdConverterBasic
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

object PipeReaderGoldstandardPairs {

  def apply(
    separator: Char = ',',
    idIndex1: Int = 0,
    idIndex2: Int = 1,
    idConverter: IdConverter = IdConverterBasic): Pipeline[RDD[String], RDD[SymPair[Tuple]]] = {
    PipeReaderGoldstandardIdsPairs(separator, idIndex1, idIndex2, idConverter)
      .append(PipeReaderGoldstandardIdToTuple())
  }

}

object PipeReaderGoldstandardCluster {

  def apply(
      separator: Char = ',',
      clusterIdIndex: Int = 0,
      tupleIdIndex: Int = 1,
      idConverter: IdConverter = IdConverterBasic): Pipeline[RDD[String], RDD[SymPair[Tuple]]] = {
    PipeReaderGoldstandardIdsCluster(separator, clusterIdIndex, tupleIdIndex, idConverter)
      .append(PipeReaderGoldstandardIdToTuple())
  }

} 
Example 12
Source File: PipeAnalyseGoldstandardCluster.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.ResultContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.visualisation.model.GoldstandardClusterModel

class PipeAnalyseGoldstandardCluster extends PipeElementPassthrough[RDD[Seq[Long]]] {

  override val _analysable = new GoldstandardClusterModel

  def substep(input: RDD[Seq[Long]])(implicit pipeContext: AbstractPipeContext): Unit = {
    _analysable.goldstandard = input
    pipeContext match {
      case pc: ResultContext => {
        pc.goldstandardModelCluster = Some(_analysable)
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }

}

object PipeAnalyseGoldstandardCluster {

  def apply() = new PipeAnalyseGoldstandardCluster()

} 
Example 13
Source File: PipePrintSampleGoldstandard.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.PipeSampler
import de.unihamburg.vsis.sddf.visualisation.Table
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

class PipePrintSampleGoldstandard(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping)
  extends PipeElementPassthrough[RDD[Tuple]] with PipeSampler {

  def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: GoldstandardContext => {
        val sample: Array[SymPair[Tuple]] = pc.goldstandard.takeSample(false, count)
        val table: Seq[Seq[String]] = createSymPairTable(sample)
        
        log.info("Goldstandard sample of " + sample.size + " tuples: ")
        Table.printTable(table)
      }
    }
  }

}

object PipePrintSampleGoldstandard {
  
  def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) = {
    new PipePrintSampleGoldstandard(count)
  }

} 
Example 14
Source File: PipeReaderGoldstandardClusterOutput.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.reading.goldstandard

import java.util.regex.PatternSyntaxException

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.SddfContext.rddToRdd
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.IdConverter
import de.unihamburg.vsis.sddf.reading.IdConverterBasic
import de.unihamburg.vsis.sddf.reading.SymPair


class PipeReaderGoldstandardClusterOutput(
  separator: Char = ',',
  clusterIdIndex: Int = 0,
  tupleIdIndex: Int = 1,
  idConverter: IdConverter = IdConverterBasic)
  extends PipeElement[RDD[String], RDD[Seq[Long]]] {

  override def step(inputRdd: RDD[String])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Long]] = {
    // parse tuple ids
    val clusterIdTupleIdRdd = inputRdd.map(line => {
      val parts = line.split(separator)
      val tupleId = idConverter.convert(parts(tupleIdIndex).replaceAll("[^0-9]",""))
      val clusterId = idConverter.convert(parts(clusterIdIndex).replaceAll("[^0-9]",""))
      (clusterId, tupleId)
    })
    clusterIdTupleIdRdd.groupByKey().map(_._2.toSeq)
  }

}

object PipeReaderGoldstandardClusterOutput {
  
  def apply(
      separator: Char = ',',
      clusterIdIndex: Int = 0,
      tupleIdIndex: Int = 1,
      idConverter: IdConverter = IdConverterBasic) = {
    new PipeReaderGoldstandardClusterOutput(separator, clusterIdIndex, tupleIdIndex, idConverter)
  }

} 
Example 15
Source File: PipeAnalyseGoldstandard.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.ResultContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.GoldstandardModel

class PipeAnalyseGoldstandard extends PipeElementPassthrough[RDD[SymPair[Tuple]]] {

  override val _analysable = new GoldstandardModel

  def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    _analysable.goldstandard = input
    pipeContext match {
      case pc: ResultContext => {
        pc.goldstandardModel = Some(_analysable)
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }

}

object PipeAnalyseGoldstandard {

  def apply() = new PipeAnalyseGoldstandard()

} 
Example 16
Source File: PipePrintHeadGoldstandard.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.PipeSampler
import de.unihamburg.vsis.sddf.visualisation.Table
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable


class PipePrintHeadGoldstandard(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping)
  extends PipeElementPassthrough[RDD[SymPair[Tuple]]] with PipeSampler {

  def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: GoldstandardContext => {
        val sample: Array[SymPair[Tuple]] = pc.goldstandard.take(count)
        val table: Seq[Seq[String]] = createSymPairTable(sample)
        
        log.info("Goldstandard sample of " + sample.size + " tuples: ")
        Table.printTable(table)
      }
    }
  }

}

object PipePrintHeadGoldstandard {
  
  def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) = {
    new PipePrintHeadGoldstandard(count)
  }

} 
Example 17
Source File: PipePrintHeadTuple.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.print

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.PipeSampler
import de.unihamburg.vsis.sddf.visualisation.Table


class PipePrintHeadTuple(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping)
  extends PipeElementPassthrough[RDD[Tuple]] with PipeSampler {

  def substep(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): Unit = {
    val sample: Array[Tuple] = input.take(count)
    val table: Seq[Seq[String]] = createTupleTable(sample)
    log.info("Sample of " + sample.size + " tuples: ")
    Table.printTable(table)
  }

}

object PipePrintHeadTuple {

  def apply(count: Int = 10)(implicit fIdNameM: FeatureIdNameMapping) = {
    new PipePrintHeadTuple(count)
  }

} 
Example 18
Source File: PipeWordcount.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.examples

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._
import com.rockymadden.stringmetric.StringMetric
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

class PipeWordcount()
  extends PipeElement[RDD[String], RDD[(String, Int)]] {

  def step(input: RDD[String])(implicit pipeContext: AbstractPipeContext): RDD[(String, Int)] = {
    // flatten the collection of word arrays
    val words = input.flatMap(line => line.split(" "))
    // initialize the counter of each word with one
    val wordsWithCounter = words.map(word => (word, 1))
    // add up all counters of the same word
    wordsWithCounter.reduceByKey(_ + _)
  }

}

// companion object for a better usability
object PipeWordcount {
  def apply() = new PipeWordcount()
} 
Example 19
Source File: AbstractPipeClusteringGraph.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.clustering

import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.VertexId
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.similarity.aggregator.Mean
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable


abstract class AbstractPipeClusteringGraph
  extends PipeElement[RDD[(SymPair[Tuple], Array[Double])], RDD[Set[Tuple]]]
  with Serializable {
  
  def cluster(graph: Graph[Tuple, Double]): RDD[Set[Tuple]]

  def step(input: RDD[(SymPair[Tuple], Array[Double])])(implicit pipeContext: AbstractPipeContext): RDD[Set[Tuple]] = {
    
    val duplicatePairsWithSimilarity = input.map(
      pair => (pair._1, Mean.agrSimilarity(pair._2))
    )
    
    val edges: RDD[Edge[Double]] = duplicatePairsWithSimilarity.map(
      pair => { Edge(pair._1._1.id, pair._1._2.id, pair._2) }
    )

    // TODO optimize: it would be nice to build the graph only by using edge triplets
    // but as far as I know that's not possible
    val verticesNotUnique: RDD[(VertexId, Tuple)] = duplicatePairsWithSimilarity.map(_._1).flatMap(
      tuplePair => Seq(tuplePair._1, tuplePair._2)
    ).map(tuple => (tuple.id, tuple))

    // delete all duplicate vertices
    val vertices = verticesNotUnique.distinct()

    // null serves as the default vertex attribute; every vertex is supplied explicitly above
    val graph: Graph[Tuple, Double] = Graph.apply(vertices, edges, null)
    
    cluster(graph)
  }

} 
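A concrete subclass only has to supply the cluster method. As a hypothetical illustration (a sketch, not one of sddf's actual implementations; the class name is made up), tuples could be grouped by the connected component they belong to in the similarity graph:

import org.apache.spark.graphx.Graph
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.reading.Tuple

// Hypothetical sketch: cluster tuples by connected component of the similarity graph.
class PipeClusteringConnectedComponentsSketch extends AbstractPipeClusteringGraph {

  def cluster(graph: Graph[Tuple, Double]): RDD[Set[Tuple]] = {
    // (vertexId, componentId) for every vertex
    val componentIds = graph.connectedComponents().vertices
    graph.vertices
      .join(componentIds)                                    // (vertexId, (tuple, componentId))
      .map { case (_, (tuple, componentId)) => (componentId, tuple) }
      .groupByKey()
      .map(_._2.toSet)
  }
}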
Example 20
Source File: PipeAnalyseClustering.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.clustering

import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.ClusterModel
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.pipe.context.ResultContext

class PipeAnalyseClustering extends PipeElementPassthrough[RDD[Set[Tuple]]] {

  override val _analysable = new ClusterModel

  def substep(input: RDD[Set[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: GoldstandardContext with ResultContext => {
        _analysable.clusters = input
        _analysable.goldstandard = pc.goldstandard
        pc.clusterModel = Some(_analysable)
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }

}

object PipeAnalyseClustering {
  
  def apply() = {
    new PipeAnalyseClustering()
  }
  
} 
Example 21
Source File: PipeWriterTupleCluster.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.writing

import java.io.File

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

class PipeWriterTupleCluster(file: File, separator: Char = ',')
  extends PipeElementPassthrough[RDD[Set[Tuple]]] {

  def substep(input: RDD[Set[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    val writer = new TupleWriterFile(file, separator)
    // TODO write tuples to hdfs in parallel and merge them afterwards
    val collected = input.collect()
    collected.foreach(set => {
      set.foreach(tuple => {
        writer.writeTuple(tuple)
      })
      writer.blankLine()
    })
    writer.close()
  }

}

object PipeWriterTupleCluster {

  def apply(file: File, separator: Char = ',') = {
    new PipeWriterTupleCluster(file, separator)
  }

} 
Example 22
Source File: PipeWriterTuplePairs.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.writing

import java.io.File
import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

class PipeWriterTuplePairs(file: File, separator: Char = ',') extends PipeElementPassthrough[RDD[SymPair[Tuple]]] {

  def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    val writer = new TupleWriterFile(file, separator)
    val collected = input.collect()
    collected.foreach(pair => {
      writer.writeTuple(pair._1)
      writer.writeTuple(pair._2)
      writer.blankLine()
    })
    writer.close()
  }

}

object PipeWriterTuplePairs {
  
  def apply(file: File, separator: Char = ',') = {
    new PipeWriterTuplePairs(file, separator)
  }

} 
Example 23
Source File: ClusterWriterCsvFile.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.writing

import java.io.File
import java.io.FileWriter

import org.apache.spark.rdd.RDD

import com.opencsv.CSVWriter

import de.unihamburg.vsis.sddf.reading.Tuple

class ClusterWriterCsvFile(file: File, separator: Char = ',') {

  // create folders
  file.getParentFile().mkdirs()

  def this(path: String) = {
    this(new File(path))
  }

  def this(folder: String, file: String) = {
    this(new File(folder, file))
  }

  def write(clusterRdd: RDD[Set[Tuple]]): Unit = {
    val collectedClusters = clusterRdd.collect()
    val writer = new CSVWriter(new FileWriter(file), separator);
    // feed in your array (or convert your data to an array)
    collectedClusters.foreach(set => {
      val tupleIdSet: Set[String] = set.map(tuple => tuple.id.toString())
      val tupleIdArray: Array[String] = tupleIdSet.toArray
      writer.writeNext(tupleIdArray)
    })
    writer.close()
  }
  
} 
Example 24
Source File: TupleWriterFile.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.writing

import java.io.File
import java.io.FileWriter

import org.apache.spark.rdd.RDD

import com.opencsv.CSVWriter

import de.unihamburg.vsis.sddf.reading.Tuple


class TupleWriterFile(file: File, separator: Char = ',') {

  val writer = new CSVWriter(new FileWriter(file), separator);

  def writeTuple[A <: Tuple](tuple: A): Unit = {
    writer.writeNext(tuple.id.toString +: tuple.toSeq.map(_._2).toArray)
  }

  def close() = {
    writer.close()
  }
  
  def blankLine() = {
    writer.writeNext(Array())
  }
  
  def writeTuple[A <: Tuple](tuples: Traversable[A]): Unit = {
    tuples.foreach(tuple => {
      writer.writeNext(tuple.id.toString +: tuple.toSeq.map(_._2).toArray)
    })
  }

  def writeTuple[A <: Tuple](tuples: RDD[A]): Unit = {
    val collectedTuples = tuples.collect()
    collectedTuples.foreach(tuple => {
      writer.writeNext(tuple.id.toString +: tuple.toSeq.map(_._2).toArray)
    })
  }
} 
Example 25
Source File: DummyIndexer.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.indexing

import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable


class PipeIndexerDummy extends IndexingPipe {

  override val name = "DummyIndexer"
  
  def step(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): RDD[SymPair[Tuple]] = {
    val cartesian = input.cartesian(input).map(new SymPair(_))
    // filter identities like (a,a) and symmetric duplicates like (a,b) && (b,a)
    cartesian.filter(pair => pair._1 != pair._2).distinct()
  }
  
}

object PipeIndexerDummy {
  def apply() = {
    new PipeIndexerDummy()
  }
} 
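Because the cartesian product grows quadratically, n tuples yield n·(n−1)/2 candidate pairs after identities and symmetric duplicates are removed, so this indexer is only practical for very small corpora. The same filtering idea on a plain RDD of integers, in a standalone sketch (names are illustrative; an ordering test replaces the SymPair-plus-distinct trick used above):

import org.apache.spark.{SparkConf, SparkContext}

object CartesianPairsDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("cartesian-pairs-demo"))

    val ids = sc.parallelize(1 to 4)
    // Keep each unordered pair exactly once: the ordering test drops identities (a,a)
    // and symmetric duplicates (b,a).
    val pairs = ids.cartesian(ids).filter { case (a, b) => a < b }
    pairs.collect().sorted.foreach(println) // 4 * 3 / 2 = 6 pairs

    sc.stop()
  }
}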
Example 26
Source File: PipeAnalyseIndexer.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.indexing

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.pipe.context.ResultContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.IndexingModel

class PipeAnalyseIndexer extends PipeElementPassthrough[RDD[SymPair[Tuple]]] {

  override val _analysable: IndexingModel = new IndexingModel

  def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: CorpusContext with ResultContext => {
        _analysable.pairs = input
        _analysable.corpus = pc.corpus
        pc.indexingModel = Some(_analysable)
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }

}

object PipeAnalyseIndexer {
  
  def apply() = new PipeAnalyseIndexer
  
} 
Example 27
Source File: PipeIndexerSortedNeighborhood.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.indexing

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.indexing.blocking.PipeBlockerSortedNeighborhood
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

object PipeIndexerSortedNeighborhood {

  def apply(windowSize: Int = 10)(implicit bkvBuilder: BlockingKeyBuilder) = {
    PipeBlockerSortedNeighborhood(windowSize)
      .append(SortedNeighborhoodIndexer())
  }

  // Number of candidate pairs produced for `elementCount` tuples and a sliding window of
  // size `windowSize`: the first window contributes all of its pairs, every later window
  // adds (windowSize - 1) new ones.
  def calcPairCount(elementCount: Int, windowSize: Int): Int = {
    val windowCount = elementCount - windowSize + 1
    val firstWindowPairs = (windowSize * (windowSize - 1)) / 2
    val lastWindowPairs = (windowCount - 1) * (windowSize - 1)
    firstWindowPairs + lastWindowPairs
  }
}
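A quick arithmetic check of calcPairCount (the values are chosen here purely for illustration): with 5 elements and a window of 3 there are 3 windows; the first window contributes 3 · 2 / 2 = 3 pairs and each of the remaining 2 windows adds windowSize − 1 = 2 new pairs, so the method returns 3 + 4 = 7.

// Sanity checks, to be run in a scope where calcPairCount is visible:
assert(calcPairCount(elementCount = 5, windowSize = 3) == 7)
assert(calcPairCount(elementCount = 10, windowSize = 10) == 45) // a single window of 10: 10 * 9 / 2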
Example 28
Source File: PipeAnalyseIndexerExtended.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.indexing

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.pipe.context.ResultContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.IndexingModelExtended

class PipeAnalyseIndexerExtended extends PipeElementPassthrough[RDD[SymPair[Tuple]]] {

  override val _analysable: IndexingModelExtended = new IndexingModelExtended

  def substep(input: RDD[SymPair[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: GoldstandardContext with CorpusContext with ResultContext => {
        _analysable.pairs = input
        _analysable.goldstandard = pc.goldstandard
        _analysable.corpus = pc.corpus
        pc.indexingModel = Some(_analysable)
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }

}

object PipeAnalyseIndexerExtended {
  
  def apply() = new PipeAnalyseIndexerExtended
  
} 
Example 29
Source File: PipeAnalyseBlocker.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.visualisation.model.IndexingModel
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.ResultContext
import de.unihamburg.vsis.sddf.visualisation.model.BlockingModel

class PipeAnalyseBlocker extends PipeElementPassthrough[RDD[Seq[Tuple]]] {

  override val _analysable: BlockingModel = new BlockingModel

  def substep(input: RDD[Seq[Tuple]])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: GoldstandardContext with CorpusContext with ResultContext => {
        _analysable.blocks = input
        pc.blockingModel = Some(_analysable)
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }

}

object PipeAnalyseBlocker {
  
  def apply() = new PipeAnalyseBlocker
  
} 
Example 30
Source File: PipeBlockerStandard.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable


class PipeBlockerStandard(implicit bkvBuilder: BlockingKeyBuilder)
  extends BlockingPipe
  with Parameterized
  with Logging {

  def step(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Tuple]] = {
    val bkvTuplePairs: RDD[(String, Tuple)] = input.map(t => (bkvBuilder.buildBlockingKey(t), t))
    val keyBlocks: RDD[(String, Iterable[Tuple])] = bkvTuplePairs.groupByKey
    keyBlocks.map(_._2.toSeq).filter(_.size > 1)
  }

  @transient override val _analysable = new AlgoAnalysable
  _analysable.algo = this
  _analysable.name = this.name
  override val name = "StandardBlocker"
  override val paramMap = Map("BlockingKeyBuilder" -> bkvBuilder)

}

object PipeBlockerStandard {

  def apply(implicit bkvBuilder: BlockingKeyBuilder) = {
    new PipeBlockerStandard()
  }

} 
Example 31
Source File: PipeBlockerSortedNeighborhood.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.mllib.rdd.RDDFunctions.fromRDD
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

class PipeBlockerSortedNeighborhood(windowSize: Int = 10)(implicit bkvBuilder: BlockingKeyBuilder)
    extends BlockingPipe
    with Parameterized {

  def step(tuples: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Tuple]] = {
    val bkvTuplePairs: RDD[(String, Tuple)] = tuples.map(t => (bkvBuilder.buildBlockingKey(t), t))
    val sortedPairs = bkvTuplePairs.sortByKey().map(_._2)
    sortedPairs.sliding(windowSize).map(_.toSeq)
  }

  @transient override val _analysable = new AlgoAnalysable
  _analysable.algo = this
  _analysable.name = this.name
  override val name = "SortedNeighborhoodBlocker"
  override val paramMap = Map("windowSize" -> windowSize,
    "BlockingKeyBuilder" -> bkvBuilder)

}

object PipeBlockerSortedNeighborhood {

  def apply(windowSize: Int = 10)(implicit bkvBuilder: BlockingKeyBuilder) = {
    new PipeBlockerSortedNeighborhood(windowSize)
  }

} 
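The sliding(windowSize) call comes from org.apache.spark.mllib.rdd.RDDFunctions (imported above via fromRDD). Below is a standalone sketch of the sort-then-slide idea on plain strings, independent of the sddf Tuple and BlockingKeyBuilder types; the object name and the toy blocking key are illustrative only.

import org.apache.spark.mllib.rdd.RDDFunctions.fromRDD
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.{SparkConf, SparkContext}

object SlidingWindowDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("sliding-demo"))

    // Sort records by their blocking key, then emit overlapping windows of size 3.
    val keyed = sc.parallelize(Seq("ada", "bob", "adb", "bea", "adc"))
      .map(name => (name.take(2), name))   // toy blocking key: first two characters
    val windows = keyed.sortByKey().map(_._2).sliding(3).map(_.toSeq)
    windows.collect().foreach(println)

    sc.stop()
  }
}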
Example 32
Source File: PipeBlockerSuffixArray.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable


class PipeBlockerSuffixArray(
    minimumSuffixLength: Int = 6,
    maximumBlockSize: Int = 12)(
    implicit bkvBuilder: BlockingKeyBuilder)
  extends BlockingPipe
  with Parameterized
  with Logging {

  // (the suffix-array key generation and the blocking step are not shown in this excerpt)

  def filterBlocks(suffixTuplePair: (String, Seq[Tuple])): Boolean = {
    val tupleCount = suffixTuplePair._2.length
    if (tupleCount > maximumBlockSize) {
      false
    } else if (tupleCount < 2) {
      false
    } else {
      true
    }
  }
}

object PipeBlockerSuffixArray {

  def apply(minimumSuffixLength: Int = 6, maximumBlockSize: Int = 12)(
    implicit bkvBuilder: BlockingKeyBuilder) = {
    new PipeBlockerSuffixArray(minimumSuffixLength, maximumBlockSize)
  }

} 
Example 33
Source File: SddfPipeContext.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.pipe.context

import org.apache.spark.rdd.RDD
import org.joda.time.Period

import de.unihamburg.vsis.sddf.visualisation.ModelRouter
import de.unihamburg.vsis.sddf.visualisation.logger.ModelRouterLogging

class SddfPipeContext(
    val name: String = "Unnamed Pipeline",
    modelRouter: ModelRouter = ModelRouterLogging)
  extends AbstractPipeContext(modelRouter)
  with CorpusContext
  with GoldstandardContext
  with ResultContext {
  
  var runtime: Option[Period] = None
  var filepath: Option[String] = None
      
  val persistedRDDs = new scala.collection.mutable.HashMap[String, RDD[_]]()
  
} 
Example 34
Source File: PipeOptimizeUnpersist.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.pipe.optimize

import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext

class PipeOptimizeUnpersist[A](rddname: String) extends PipeElementPassthrough[RDD[A]] {

  def substep(input: RDD[A])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: SddfPipeContext => {
        val rddOption = pc.persistedRDDs.get(rddname)
        if (rddOption.isDefined) {
          rddOption.get.unpersist()
          analysable.values += ("RDD unpersisted" -> rddname)
        } else {
          log.warn("Can't unpersist RDD with the name " + rddname)
        }
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }
}

object PipeOptimizeUnpersist {

  def apply[A](rddname: String) = {
    new PipeOptimizeUnpersist[A](rddname)
  }

} 
Example 35
Source File: PipeOptimizePersistAndName.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.pipe.optimize

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext

class PipeOptimizePersistAndName[A](rddname: String = null, newLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends PipeElementPassthrough[RDD[A]] {
  
  def substep(input: RDD[A])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: SddfPipeContext => {
        input.persist(newLevel)
        if(rddname != null){
          input.name = rddname
          pc.persistedRDDs += (rddname -> input)
          analysable.values += ("name" -> rddname)
        }
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }
}

object PipeOptimizePersistAndName {
  
  def apply[A](rddname: String = null, newLevel: StorageLevel = StorageLevel.MEMORY_ONLY) = {
    new PipeOptimizePersistAndName[A](rddname, newLevel)
  }

} 
Example 36
Source File: RddUtils.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.sparkextensions

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD

object RddUtils {

  
  def securlyZipRdds[A, B: ClassTag](rdd1: RDD[A], rdd2: RDD[B]): RDD[(A, B)] = {
    val rdd1Repartitioned = rdd1.repartition(1)
    val rdd2Repartitioned = rdd2.repartition(1)
    val (rdd1Balanced, rdd2Balanced) = balanceRddSizes(rdd1Repartitioned, rdd2Repartitioned)
    rdd1Balanced.zip(rdd2Balanced)
  }

  def balanceRddSizes[A, B](rdd1: RDD[A], rdd2: RDD[B]): (RDD[A], RDD[B]) = {
    val rdd1count = rdd1.count()
    val rdd2count = rdd2.count()
    val difference = math.abs(rdd1count - rdd2count).toInt
    if (rdd1count > rdd2count) {
      (removeRandomElements(rdd1, difference), rdd2)
    } else if (rdd2count > rdd1count) {
      (rdd1, removeRandomElements(rdd2, difference))
    } else {
      (rdd1, rdd2)
    }
  }

  def removeRandomElements[A](rdd: RDD[A], numberOfElements: Int): RDD[A] = {
    val sample: Array[A] = rdd.takeSample(false, numberOfElements)
    val set: Set[A] = Set(sample: _*)
    rdd.filter(x => !set.contains(x))
  }

} 
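A short sketch of how securlyZipRdds behaves when the two inputs differ in length (a standalone example; the object name is illustrative): the longer RDD is randomly trimmed until the counts match, and the result is a pairwise zip.

import org.apache.spark.{SparkConf, SparkContext}

object RddUtilsDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("rdd-utils-demo"))

    val letters = sc.parallelize(Seq("a", "b", "c", "d", "e"))
    val numbers = sc.parallelize(1 to 3)

    // Three pairs come back; two randomly chosen elements of `letters` are dropped first.
    val zipped = RddUtils.securlyZipRdds(letters, numbers)
    zipped.collect().foreach(println)

    sc.stop()
  }
}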
Example 37
Source File: PipePrintHeadFalsePositives.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.rdd.RDD

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

class PipePrintHeadFalsePositives(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])])
  extends AbstractPipePrintFalseTuples(count) {

  def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = {
    input.subtract(goldstandard)
  }

  def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = {
    falseTuplesWithSimilarity.take(count)
  }
  
  def logMessage(count: Int): String = {
    "Printing the first " + count + " false positives (pairs classified as duplicates although they are not in the gold standard)."
  }

}

object PipePrintHeadFalsePositives {
  
  def apply(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])]) = {
    new PipePrintHeadFalsePositives(count)
  }

} 
Example 38
Source File: PipeClassificationNaiveBayes.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.classification

import scala.beans.BeanInfo
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import org.apache.spark.mllib.classification.NaiveBayesModel


class PipeClassificationNaiveBayes(lambda: Double = 1.0) extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("lambda", lambda))

  def trainModelAndClassify(
      trainingData: RDD[LabeledPoint],
      symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {

    val model = NaiveBayes.train(trainingData, lambda)

    log.debug("Classification Model:" + model)
    log.debug("Classification Model labels :" + model.labels.mkString(" "))
    log.debug("Classification Model pi:     " + model.pi.mkString(" "))
    log.debug("Classification Model theta:  " + model.theta.foreach(_.mkString(" ")))

    // Marking Missing Values as Not Equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }

}

object PipeClassificationNaiveBayes {
  def apply(lambda: Double = 1.0) = {
    new PipeClassificationNaiveBayes(lambda)
  }
} 
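Stripped of the sddf pipeline types, the underlying MLlib call is simply NaiveBayes.train on an RDD[LabeledPoint] followed by predict. A minimal, self-contained sketch with toy similarity vectors (not taken from the project; names and data are illustrative):

import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.{SparkConf, SparkContext}

object NaiveBayesDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("naive-bayes-demo"))

    // Label 1.0 = duplicate, 0.0 = non-duplicate; features are similarity scores in [0, 1].
    val trainingData = sc.parallelize(Seq(
      LabeledPoint(1.0, Vectors.dense(0.9, 0.8)),
      LabeledPoint(1.0, Vectors.dense(0.95, 0.7)),
      LabeledPoint(0.0, Vectors.dense(0.1, 0.2)),
      LabeledPoint(0.0, Vectors.dense(0.05, 0.3))
    ))

    val model = NaiveBayes.train(trainingData, lambda = 1.0)
    println(model.predict(Vectors.dense(0.85, 0.75))) // likely 1.0 (duplicate-like similarities)
    println(model.predict(Vectors.dense(0.15, 0.25))) // likely 0.0

    sc.stop()
  }
}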
Example 39
Source File: PipeClassificationTrainingDataGenerator.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.classification

import scala.compat.Platform

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.similarity.SimilarityCalculator
import de.unihamburg.vsis.sddf.sparkextensions.RddUtils.securlyZipRdds
import de.unihamburg.vsis.sddf.visualisation.model.TrainingSetModel
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

class PipeClassificationTrainingDataGenerator(
  truePositiveCount: Int = 500,
  trueNegativeCount: Int = 500)(
  implicit featureMeasures: Array[(Int, StringMetric[Double])])
  extends PipeElement[SymPairSim, (SymPairSim, RDD[LabeledPoint])]
  with Logging {

  override def step(input: SymPairSim)(implicit pipeContext: AbstractPipeContext) = {
    pipeContext match {
      case pc: GoldstandardContext with CorpusContext => {
        var truePositiveFraction = truePositiveCount / pc.goldstandard.count.toDouble
        var trueNegativeFraction = trueNegativeCount / pc.corpus.count.toDouble
        log.debug("True positive pair fraction taken from the gold standard for training purposes: " + truePositiveFraction)
        log.debug("True negative pair fraction taken from the corpus for training purposes: " + trueNegativeFraction)
        if (truePositiveFraction > 1.0) {
          truePositiveFraction = 1.0
          log.debug("True positive pair fraction limited to 1.0")
        }
        if (trueNegativeFraction > 1.0) {
          trueNegativeFraction = 1.0
          log.debug("True negative pair fraction limited to 1.0")
        }
        val result = generateTrainingData(pc.corpus, pc.goldstandard,
          truePositiveFraction, trueNegativeFraction)
        (input, result)
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }

  // generateTrainingData(...) is defined in the full source file but omitted from this excerpt.
}

object PipeClassificationTrainingDataGenerator {

  val All = -1
  
  def apply(
      truePositiveCount: Int = 500,
      trueNegativeCount: Int = 500)(
      implicit featureMeasures: Array[(Int, StringMetric[Double])]) = {
    new PipeClassificationTrainingDataGenerator(truePositiveCount, trueNegativeCount)
  }

} 
Example 40
Source File: PipeClassificationDecisionTree.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable
import de.unihamburg.vsis.sddf.Parameterized
import org.apache.spark.mllib.classification.ClassificationModel

class PipeClassificationDecisionTree(
    impurity: String = "gini",
    maxDepth: Int = 5,
    maxBins: Int = 32)
  extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("impurity", impurity), ("maxDepth", maxDepth), ("maxBins", maxBins))

  def trainModelAndClassify(
    trainingData: RDD[LabeledPoint],
    symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {
    val model = DecisionTree.trainClassifier(trainingData, numClasses = 2,
      categoricalFeaturesInfo = Map[Int, Int](), impurity, maxDepth, maxBins)

    log.debug("Decision Tree Model:" + model)
    log.debug("Decision Tree:" + model.toDebugString)

    // Marking Missing Values as Not Equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }

}

object PipeClassificationDecisionTree {
  def apply(
    impurity: String = "gini",
    maxDepth: Int = 5,
    maxBins: Int = 32) = {
    new PipeClassificationDecisionTree(impurity, maxDepth, maxBins)
  }
} 
Example 41
Source File: PipeClassificationSvm.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.classification

import scala.beans.BeanInfo
import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import org.apache.spark.mllib.classification.SVMWithSGD

class PipeClassificationSvm(numIterations: Int = 100) extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("numIterations", numIterations))

  def trainModelAndClassify(
      trainingData: RDD[LabeledPoint],
      symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {

    val model = SVMWithSGD.train(trainingData, numIterations)

    log.debug("Classification Model:" + model)

    // Marking Missing Values as Not Equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }

}

object PipeClassificationSvm {
  def apply(numIterations: Int = 100) = {
    new PipeClassificationSvm(numIterations)
  }
} 
Example 42
Source File: PipePrintHeadFalseNegatives.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.rdd.RDD

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

class PipePrintHeadFalseNegatives(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])])
  extends AbstractPipePrintFalseTuples(count) {

  def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = {
    goldstandard.subtract(input)
  }

  def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = {
    falseTuplesWithSimilarity.take(count)
  }
  
  def logMessage(count: Int): String = {
    "Printing the first " + count + " false negatives (duplicate pairs from the gold standard which were not detected)"
  }

}

object PipePrintHeadFalseNegatives {
  
  def apply(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])]) = {
    new PipePrintHeadFalseNegatives(count)
  }

} 
Example 43
Source File: PipePrintSampleFalseNegatives.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.rdd.RDD

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

class PipePrintSampleFalseNegatives(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])])
  extends AbstractPipePrintFalseTuples(count) {

  def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = {
    goldstandard.subtract(input)
  }

  def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = {
    falseTuplesWithSimilarity.takeSample(false, count)
  }

  def logMessage(count: Int): String = {
    "Sampling " + count + " false negatives (duplicate pairs from the gold standard which were not detected)"
  }

}

object PipePrintSampleFalseNegatives {
  
  def apply(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping, 
    featureMeasures: Array[(Int, StringMetric[Double])]) = {
    new PipePrintSampleFalseNegatives(count)
  }

} 
Example 44
Source File: PipeAnalyseClassificationTraining.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.ResultContext
import de.unihamburg.vsis.sddf.visualisation.model.TrainingSetModel

class PipeAnalyseClassificationTraining
  extends PipeElementPassthrough[(SymPairSim, RDD[LabeledPoint])] {

  override val _analysable: TrainingSetModel = new TrainingSetModel

  def substep(
      input: (SymPairSim, RDD[LabeledPoint]))(
      implicit pipeContext: AbstractPipeContext): Unit = {
    _analysable.trainingsSetLabeled = input._2
    pipeContext match {
      case pc: ResultContext => {
        pc.trainingSetModel = Some(_analysable)
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }

}


object PipeAnalyseClassificationTraining {

  def apply() = new PipeAnalyseClassificationTraining

} 
Example 45
Source File: PipePrintSampleFalsePositives.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.rdd.RDD

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

class PipePrintSampleFalsePositives(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])])
  extends AbstractPipePrintFalseTuples(count) {
  
  def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]) = {
    input.subtract(goldstandard)
  }

  def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]) = {
    falseTuplesWithSimilarity.takeSample(false, count)
  }
  
  def logMessage(count: Int): String = {
    "Sampling " + count + " false positives (pairs classified as duplicates which are not in the gold standard)"
  }

}

object PipePrintSampleFalsePositives {
  
  def apply(
    count: Int = 10)(
    implicit featureIdNameMapping: FeatureIdNameMapping, 
    featureMeasures: Array[(Int, StringMetric[Double])]) = {
    new PipePrintSampleFalsePositives(count)
  }

} 
Example 46
Source File: AbstractPipeClassification.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.CorpusContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

abstract class AbstractPipeClassification()
  extends PipeElement[(SymPairSim, RDD[LabeledPoint]), SymPairSim]
  with Parameterized {

  override val _analysable = new AlgoAnalysable
  _analysable.algo = this

  
  def trainModelAndClassify(
    trainingData: RDD[LabeledPoint],
    symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)]

  def step(input: (SymPairSim, RDD[LabeledPoint]))(implicit pipeContext: AbstractPipeContext): SymPairSim = {
    pipeContext match {
      case pc: CorpusContext with GoldstandardContext => {

        val symPairSim = input._1
        val trainingsSet = input._2

        val prediction = trainModelAndClassify(trainingsSet, symPairSim)

        val duplicatePairs = prediction.filter(_._3 == Duplicate).map(tri => (tri._1, tri._2))

        duplicatePairs
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }

} 
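AbstractPipeClassification is a small template-method base: step handles the pipe-context plumbing and filters the predictions down to duplicate pairs, while subclasses only supply trainModelAndClassify. As a hedged illustration of how thin such a subclass is (the project's real PipeClassificationNaiveBayes, referenced in Example 53, may differ in its details), a Naive Bayes variant could look roughly like this:

package de.unihamburg.vsis.sddf.classification

import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

// Sketch only: the shipped PipeClassificationNaiveBayes may use different parameters.
class PipeClassificationNaiveBayesSketch(lambda: Double = 1.0) extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("lambda", lambda))

  def trainModelAndClassify(
    trainingData: RDD[LabeledPoint],
    symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {
    // Train an MLlib Naive Bayes model and predict a label for every candidate pair.
    val model = NaiveBayes.train(trainingData, lambda)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }
}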
Example 47
Source File: AbstractPipePrintFalseTuples.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.classification

import org.apache.spark.SparkContext.rddToPairRDDFunctions
import org.apache.spark.rdd.RDD

import com.rockymadden.stringmetric.StringMetric

import de.unihamburg.vsis.sddf.pipe.PipeElementPassthrough
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.PipeSampler
import de.unihamburg.vsis.sddf.visualisation.Table
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable

abstract class AbstractPipePrintFalseTuples(
  count: Int)(
    implicit featureIdNameMapping: FeatureIdNameMapping,
    featureMeasures: Array[(Int, StringMetric[Double])])
  extends PipeElementPassthrough[RDD[(SymPair[Tuple], Array[Double])]]
  with PipeSampler {

  def selectFalseTuples(goldstandard: RDD[SymPair[Tuple]], input: RDD[SymPair[Tuple]]): RDD[SymPair[Tuple]]

  def filterFalseTuplesForOutput(falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])]): Array[(SymPair[Tuple], Array[Double])]

  def logMessage(count: Int): String

  def substep(input: RDD[(SymPair[Tuple], Array[Double])])(implicit pipeContext: AbstractPipeContext): Unit = {
    pipeContext match {
      case pc: GoldstandardContext => {

        val falseTuples = selectFalseTuples(pc.goldstandard, input.map(_._1))

        if (falseTuples.count > 0) {
          val dummyValue: RDD[(SymPair[Tuple], Int)] = falseTuples.map((_, 1))
          val join: RDD[(SymPair[Tuple], (Int, Option[Array[Double]]))] = dummyValue.leftOuterJoin(input)
          val falseTuplesWithSimilarity: RDD[(SymPair[Tuple], Array[Double])] = join.map(pair => {
            (pair._1, pair._2._2.getOrElse(Array()))
          })

          val falseTuplesSample = filterFalseTuplesForOutput(falseTuplesWithSimilarity)

          val table = createSymPairSimVectorTable(falseTuplesSample)
          log.info(logMessage(count))
          Table.printTable(table)
        } else {
          log.info(logMessage(0))
        }
      }
      case _ => {
        throw new Exception("Wrong AbstractPipeContext type.")
      }
    }
  }

} 
Example 48
Source File: ExactDuplicateFilter.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.tools

import java.io.File

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Id
import de.unihamburg.vsis.sddf.reading.corpus.PipeStoreInContextCorpus
import de.unihamburg.vsis.sddf.reading.corpus.PipePrintSampleCorpus
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.reading.corpus.PipeReaderTupleCsv
import de.unihamburg.vsis.sddf.writing.TupleWriterFile


object ExactDuplicateFilter extends App with Logging {

  if (args.size == 1 && (new File(args(0))).exists()) {
    val conf = new SparkConf().setAppName("ExactDuplicateFilter")
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    implicit val pipeContext = new SddfPipeContext
      
    val Content: (Int, String) = (0, "content")

    val featureMapping: Map[Int, String] = Map(Content)

    implicit val featureIdNameMapper = new FeatureIdNameMapping(featureMapping)

    val inputFileKey = "musicbrainz"

    // Parse Tuples
    val allFields: Seq[Int] = Seq(Content._1)
    val allFieldsWithId: Seq[Int] = Id +: allFields

    val parserPipe = new PipeTupleParserCsvIdContent(allFieldsWithId)
    val pipe = parserPipe.append(PipeStoreInContextCorpus()).append(PipePrintSampleCorpus())
    pipe.start(sc.textFile(args(0)))
    val result: RDD[Tuple] = parserPipe.output.get
    val resultCount = result.count
    log.info("Lines parsed: " + resultCount)
    
    val distinct = result.distinct()
    val distinctCount = distinct.count
    log.info("Distinct Lines Count: " + distinctCount)
    log.info("Lines removed: " + (resultCount - distinctCount))
    
    val tupleWriter = new TupleWriterFile(new File(args(0) + ".distinct"))
    tupleWriter.writeTuple(distinct)

  } else {
    println("Please provide a valid file path.")
  }

}

class PipeTupleParserCsvIdContent(featureIds: Seq[Int]) extends PipeReaderTupleCsv(featureIds) {
  override def extractValues(line: String): Seq[String] = {
    val splitted = parser.parseLine(line)
    Seq(splitted.head, splitted.tail.mkString(","))
  }
} 
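The overridden extractValues above keeps the first CSV column as the tuple id and re-joins all remaining columns into a single content field. Purely as an illustration (assuming the project's CSV parser behaves like a plain comma split for this made-up sample line):

val splitted = "42,Some Title,3:41,Some Artist".split(",").toSeq
val idAndContent = Seq(splitted.head, splitted.tail.mkString(","))
// idAndContent == Seq("42", "Some Title,3:41,Some Artist")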
Example 49
Source File: PipeGoldstandardReaderClusterTest.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.test.reading.goldstandard

import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite

import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.reading.goldstandard.PipeReaderGoldstandardIdToTuple
import de.unihamburg.vsis.sddf.reading.goldstandard.PipeReaderGoldstandardIdsCluster
import de.unihamburg.vsis.sddf.test.util.FixtureHelper
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext
import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext

class PipeReaderGoldstandardClusterTest
  extends FunSuite
  with LocalSparkContext
  with TestSddfPipeContext
  with FixtureHelper {

  test("test goldstandard tuple reading in cluster format") {
    // format clusterId, tupleId
    val input: RDD[String] = sc.parallelize(Seq("1,1", "2,2", "2,3"))
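    // Cluster 1 has only a single member, so presumably only cluster 2 ({2, 3}) contributes a
    // duplicate pair, which is why exactly one pair is expected below.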
    val gsReaderPipe = PipeReaderGoldstandardIdsCluster()
    gsReaderPipe.start(input)
    val gsIds = gsReaderPipe.output.get
    assert(gsIds.count() === 1)

    val tuples: Seq[Tuple] = initializeTuples(1, 3)
    pc.corpus = sc.parallelize(tuples)
    val gsconverterPipe = new PipeReaderGoldstandardIdToTuple
    gsconverterPipe.start(gsIds)
    val gsTuple = gsconverterPipe.output.get
    assert(gsTuple.count() === 1)
  }

  test("test goldstandard id reading in cluster format") {
    // format clusterId, tupleId
    val input: RDD[String] = sc.parallelize(Seq("1,1", "2,2", "2,3"))
    val gsReaderPipe = PipeReaderGoldstandardIdsCluster()
    gsReaderPipe.start(input)
    val result = gsReaderPipe.output.get
    assert(result.count() === 1)
  }

  test("test goldstandard cluster reader from file") {
    val input = sc.textFile("src/test/resources/musicbrainz-1000.csv.dup")
    val gsReaderPipe = PipeReaderGoldstandardIdsCluster()
    gsReaderPipe.start(input)
    val result = gsReaderPipe.output.get
    assert(result.collect().size === 13)
  }

} 
Example 50
Source File: StrongestPathClusteringTest.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.test.clustering

import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite

import de.unihamburg.vsis.sddf.clustering.PipeClusteringStrongestPath
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.test.util.FixtureHelper
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext
import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext

class StrongestPathClusteringTest
  extends FunSuite
  with LocalSparkContext
  with TestSddfPipeContext
  with FixtureHelper {

  test("simple cluster test") {
    val pair1 = (createTuplePair(1, 2), Array(0.4, 0.6))
    val pair2 = (createTuplePair(2, 4), Array(0.1, 0.2))
    val pair3 = (createTuplePair(4, 3), Array(0.6, 0.8))
    val pair4 = (createTuplePair(3, 1), Array(0.0, 0.2))

    val pairs: RDD[(SymPair[Tuple], Array[Double])] = sc.parallelize(Seq(pair1, pair2, pair3, pair4))
    val clusterer = new PipeClusteringStrongestPath
    clusterer.start(pairs)
    val clusterResult: Array[Set[Tuple]] = clusterer.output.get.collect()

    val expectedResult = Array(Set(pair1._1._1, pair1._1._2), Set(pair3._1._1, pair3._1._2))
    assert(clusterResult === expectedResult)
  }

} 
Example 51
Source File: ClusterAnalyserTest.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.test.evaluation

import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite

import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.test.util.FixtureHelper
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext
import de.unihamburg.vsis.sddf.visualisation.model.ClusterModel

class ClusterAnalyserTest extends FunSuite with LocalSparkContext with FixtureHelper {
  test("Precision and recall test") {

    val analyser = new ClusterModel
    analyser.clusters = buildClusters()
    analyser.goldstandard = buildGoldstandard()

    assert(analyser.precision === 0.2857142857142857) // should be 2/7
    assert(analyser.recall === 0.6666666666666666) // should be 2/3

  }

  def buildClusters(): RDD[Set[Tuple]] = {
    val cluster1 = initializeTuples(0, 2).toSet
    val cluster2 = initializeTuples(3, 4).toSet
    val cluster3 = initializeTuples(5, 7).toSet

    sc.parallelize(Seq(cluster1, cluster2, cluster3))
  }

  def buildGoldstandard(): RDD[SymPair[Tuple]] = {
    val pair1 = createTuplePair(0, 1)
    val pair2 = createTuplePair(4, 7)
    val pair3 = createTuplePair(6, 7)

    sc.parallelize(Seq(pair1, pair2, pair3))
  }
  
} 
Example 52
Source File: SparkApiTest.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.test

import org.apache.spark.rdd.RDD
import org.scalatest.Finders
import org.scalatest.FunSuite
import de.unihamburg.vsis.sddf.SddfContext.pairToInt
import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorRemoveRegex
import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorTrim
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Id
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Ignore
import de.unihamburg.vsis.sddf.reading.corpus.PipeReaderTupleCsv
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext
import de.unihamburg.vsis.sddf.test.util.MusicbrainzSchema

class SparkApiTest extends FunSuite with LocalSparkContext with MusicbrainzSchema {

  test("test rdd subtraction") {

    val file1 = sc.textFile("src/test/resources/musicbrainz-10.csv.dup")
    val file2 = sc.textFile("src/test/resources/musicbrainz-10.csv.dup")
    
    val data1 = parseTuples(file1)
    assert(data1.count() === 10)
    val data2 = parseTuples(file2)
    assert(data2.count() === 10)
    val subtraction = data1.subtract(data2)
    assert(subtraction.count() === 0)
  }

} 
Example 53
Source File: PipeDecisionTest.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.test.classification

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.scalatest.BeforeAndAfterAll
import org.scalatest.FunSuite

import de.unihamburg.vsis.sddf.SddfContext.Duplicate
import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate
import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.classification.PipeClassificationDecisionTree
import de.unihamburg.vsis.sddf.classification.PipeClassificationNaiveBayes
import de.unihamburg.vsis.sddf.classification.PipeClassificationSvm
import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext

class PipeClassificationTest extends FunSuite with LocalSparkContext with BeforeAndAfterAll {
  
  var input: (SymPairSim, RDD[LabeledPoint]) = _
  
  override def beforeAll() {
    super.beforeAll()
    val tuple1 = Tuple("test1","test1","test1")
    tuple1.id = 1
    val tuple2 = Tuple("test2","test2","test2")
    tuple2.id = 2
    val tuple3 = Tuple("hans","franz","wurst")
    tuple3.id = 3
    
    val symPairSim: SymPairSim = sc.parallelize(Seq(
      (new SymPair(tuple1, tuple2), Array(1D,1D,0D))
      ,(new SymPair(tuple2, tuple3), Array(0D,0D,1D))
    ))
    
    val trainingData: RDD[LabeledPoint] = sc.parallelize(Seq(
      LabeledPoint(label = Duplicate, features = Vectors.dense(Array(0.99,1.0,0.0)))
      ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.0)))
      ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.875,0.0)))
      ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.1)))
      ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.89,0.0)))
      
      ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.1,0.0,1.0)))
      ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.0,0.2,1.0)))
      ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.06,0.0,0.89)))
      ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.21,0.19,0.91)))
    ))
    
    input = (symPairSim, trainingData)
  }

  override def afterAll() {
    super.afterAll()
  }

  test("naive bayes classification test") {
    val classificationPipe = new PipeClassificationNaiveBayes()
    implicit val pipeContext = new SddfPipeContext()
    val result = classificationPipe.run(input)
    assert(result.count === 1)
  }
  
  test("svm classification test") {
    val classificationPipe = new PipeClassificationSvm()
    implicit val pipeContext = new SddfPipeContext()
    val result = classificationPipe.run(input)
    assert(result.count === 1)
  }

  test("decision tree classification test") {
    val classificationPipe = new PipeClassificationDecisionTree()
    implicit val pipeContext = new SddfPipeContext()
    val result = classificationPipe.run(input)
    assert(result.count === 1)
  }

} 
Example 54
Source File: MusicbrainzSchema.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.test.util

import org.apache.spark.rdd.RDD
import org.scalatest.Suite

import de.unihamburg.vsis.sddf.SddfContext.pairToInt
import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorRemoveRegex
import de.unihamburg.vsis.sddf.preprocessing.PipePreprocessorTrim
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Id
import de.unihamburg.vsis.sddf.reading.FeatureIdNameMapping.Ignore
import de.unihamburg.vsis.sddf.reading.corpus.PipeReaderTupleCsv

trait MusicbrainzSchema extends TestSddfPipeContext { self: Suite =>

  val Number = (0, "number")
  val Title = (1, "title")
  val Length = (2, "length")
  val Artist = (3, "artist")
  val Album = (4, "album")
  val Year = (5, "year")
  val Language = (6, "language")

  val featureIdNameMapping = Map(Number, Title, Length, Artist, Album, Year, Language)

  implicit val featureIdNameMapper = new FeatureIdNameMapping(featureIdNameMapping)

  def parseTuples(input: RDD[String]) = {
    // Parse Tuples
    val allFields: Seq[Int] = Seq(Number, Title, Length, Artist, Album, Year, Language)
    val allFieldsWithId: Seq[Int] = Ignore +: Id +: Ignore +: allFields

    val pipe = PipeReaderTupleCsv(allFieldsWithId)
      .append(PipePreprocessorTrim(allFields: _*))
      .append(PipePreprocessorRemoveRegex("[^0-9]", Number, Year, Length))

    pipe.run(input)

  }

} 
Example 55
Source File: SortedNeighbourhoodBlockerTest.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.test.blocking

import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite
import org.scalatest.Matchers
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilderBasic
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.reading.TupleArray
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext
import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext
import de.unihamburg.vsis.sddf.indexing.PipeIndexerSortedNeighborhood

class SortedNeighborhoodIndexingTest
  extends FunSuite
  with LocalSparkContext
  with TestSddfPipeContext
  with Matchers {

  test("testing whole Sorted Neighborhood Indexer") {
    val featureId = 1
    implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 6))

    val tuple1: Tuple = new TupleArray(1)
    tuple1.addFeature(0, "blubluba")
    tuple1.id = 1
    val tuple2: Tuple = new TupleArray(1)
    tuple2.addFeature(0, "blubluba")
    tuple2.id = 2
    val tuple3: Tuple = new TupleArray(1)
    tuple3.addFeature(0, "blubluba")
    tuple3.id = 3
    val tuple4: Tuple = new TupleArray(1)
    tuple4.addFeature(0, "blubluba")
    tuple4.id = 4
    val tuple5: Tuple = new TupleArray(1)
    tuple5.addFeature(0, "blubluba")
    tuple5.id = 5
    val tuples = sc.parallelize(Seq(tuple1, tuple2, tuple3, tuple4, tuple5))

    val indexer = PipeIndexerSortedNeighborhood(windowSize = 3)
    val blockingResult: RDD[SymPair[Tuple]] = indexer.run(tuples)
    assert(blockingResult.count === 7)

    val resultArray = blockingResult.collect()
    resultArray.foreach(println(_))
    val expectedResult = Seq(
      new SymPair(tuple1, tuple2), new SymPair(tuple1, tuple3), new SymPair(tuple2, tuple3), new SymPair(tuple2, tuple4), new SymPair(tuple3, tuple4), new SymPair(tuple3, tuple5), new SymPair(tuple4, tuple5)
    )

    resultArray should contain theSameElementsAs expectedResult
  }

} 
Example 56
Source File: SuffixArrayBlockingTest.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.test.blocking

import org.apache.spark.rdd.RDD
import org.scalatest.Finders
import org.scalatest.FunSuite

import de.unihamburg.vsis.sddf.indexing.PipeIndexerSuffixArray
import de.unihamburg.vsis.sddf.indexing.blocking.PipeBlockerSuffixArray
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilderBasic
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.reading.TupleArray
import de.unihamburg.vsis.sddf.test.util.LocalSparkContext
import de.unihamburg.vsis.sddf.test.util.TestSddfPipeContext

class SuffixArrayIndexingTest extends FunSuite with LocalSparkContext with TestSddfPipeContext {

  test("testing suffix calculation") {
    val featureId = 0
    implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 2))

    val tuple1: Tuple = new TupleArray(1)
    tuple1.addFeature(0, "blockingkeyvalue")
    tuple1.id = 1
    val tuples: RDD[Tuple] = sc.parallelize(Seq(tuple1))

    val sab = PipeBlockerSuffixArray(minimumSuffixLength = 4, maximumBlockSize = 12)

    val suffixTuplePairs: Seq[(String, Tuple)] = sab.calcSuffixes(("blockingkeyvalue", tuple1))

    //    println(suffixTuplePairs.map(_._1).mkString("\n"))

    assert(suffixTuplePairs.length === 13)

  }

  test("testing filter blocks") {
    val featureId = 0
    implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 2))

    val tuple1: Tuple = new TupleArray(1)
    tuple1.addFeature(0, "blockingkeyvalue")
    tuple1.id = 1
    val tuples = sc.parallelize(Seq(tuple1))

    val sab = new PipeBlockerSuffixArray(minimumSuffixLength = 4, maximumBlockSize = 4)

    val suffixTuplePair = ("bla", Seq(tuple1, tuple1, tuple1, tuple1, tuple1))
    assert(sab.filterBlocks(suffixTuplePair) === false)

    val suffixTuplePair2 = ("bla", Seq(tuple1, tuple1, tuple1, tuple1))
    assert(sab.filterBlocks(suffixTuplePair2) === true)

    val suffixTuplePair3 = ("bla", Seq(tuple1))
    assert(sab.filterBlocks(suffixTuplePair3) === false)

    val suffixTuplePair4 = ("bla", Seq(tuple1, tuple1))
    assert(sab.filterBlocks(suffixTuplePair4) === true)
  }

  test("testing whole SAB") {
    val featureId = 0
    implicit val bkvBuilder = new BlockingKeyBuilderBasic((featureId, 0 to 6))

    val tuple1: Tuple = new TupleArray(1)
    tuple1.addFeature(0, "blubluba")
    tuple1.id = 1
    val tuple2: Tuple = new TupleArray(1)
    tuple2.addFeature(0, "blubluba")
    tuple2.id = 2
    val tuple3: Tuple = new TupleArray(1)
    tuple3.addFeature(0, "blubluba")
    tuple3.id = 3
    val tuples = sc.parallelize(Seq(tuple1, tuple2, tuple3))

    val sab = PipeIndexerSuffixArray(minimumSuffixLength = 4, maximumBlockSize = 12)
    val blockingResult: RDD[SymPair[Tuple]] = sab.run(tuples)
    // print(blockingResult.collect().map(symPair => (symPair._1.id, symPair._2.id)).mkString("\n"))
    assert(blockingResult.count === 3)
  }

} 
Example 57
Source File: BisectingKMeansModel.scala    From bisecting-kmeans   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.bisectingkmeans

import breeze.linalg.{Vector => BV, norm => breezeNorm}

import org.apache.spark.Logging
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD


  def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = {
    val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]()
    this.node.toLinkageMatrix.foreach {x =>
      val row = new java.util.ArrayList[java.lang.Double]()
      row.add(x._1.toDouble)
      row.add(x._2.toDouble)
      row.add(x._3.toDouble)
      row.add(x._4.toDouble)
      javaList.add(row)
    }
    javaList
  }
} 
Example 58
Source File: TestFFM.scala    From spark-ffm   with Apache License 2.0 5 votes vote down vote up
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification._
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.rdd.RDD


object TestFFM extends App {

  override def main(args: Array[String]): Unit = {

    val sc = new SparkContext(new SparkConf().setAppName("TESTFFM").setMaster("local[4]"))

    if (args.length != 8) {
      println("testFFM <train_file> <k> <n_iters> <eta> <lambda1> <lambda2> <normal> <random>")
      return
    }

    val data= sc.textFile(args(0)).map(_.split("\\s")).map(x => {
      val y = if(x(0).toInt > 0 ) 1.0 else -1.0
      val nodeArray: Array[(Int, Int, Double)] = x.drop(1).map(_.split(":")).map(x => {
        (x(0).toInt, x(1).toInt, x(2).toDouble)
      })
      (y, nodeArray)
    }).repartition(4)
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (training: RDD[(Double, Array[(Int, Int, Double)])], testing) = (splits(0), splits(1))

    // Sometimes the maximum feature/field number differs between the training and testing datasets,
    // so use the whole dataset to determine the maximum feature and field numbers.
    val m = data.flatMap(x=>x._2).map(_._1).collect.reduceLeft(_ max _) //+ 1
    val n = data.flatMap(x=>x._2).map(_._2).collect.reduceLeft(_ max _) //+ 1

    val ffm: FFMModel = FFMWithAdag.train(training, m, n, dim = (args(6).toBoolean, args(7).toBoolean, args(1).toInt), n_iters = args(2).toInt,
      eta = args(3).toDouble, regParam = (args(4).toDouble, args(5).toDouble), normalization = false, false, "adagrad")

    val scores: RDD[(Double, Double)] = testing.map(x => {
      val p = ffm.predict(x._2)
      val ret = if (p >= 0.5) 1.0 else -1.0
      (ret, x._1)
    })

    val metrics = new BinaryClassificationMetrics(scores)
    val auROC = metrics.areaUnderROC
    val auPRC = metrics.areaUnderPR
    val accuracy = scores.filter(x => x._1 == x._2).count().toDouble / scores.count()
    println(s"accuracy = $accuracy, Area under ROC = $auROC, Area under precision-recall curve = $auPRC")
  }
} 
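The parsing above expects one record per line in the usual libffm-style layout: a label followed by whitespace-separated colon triples. Purely as an illustration of that map (the sample line is made up for this sketch):

val x = "1 0:3:1.0 1:7:0.5".split("\\s")
val y = if (x(0).toInt > 0) 1.0 else -1.0
val nodeArray = x.drop(1).map(_.split(":")).map(t => (t(0).toInt, t(1).toInt, t(2).toDouble))
// y == 1.0, nodeArray.toSeq == Seq((0, 3, 1.0), (1, 7, 0.5))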
Example 59
Source File: InferSchema.scala    From Linkis   with Apache License 2.0 5 votes vote down vote up
package com.webank.wedatasphere.spark.excel

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._


private[excel] object InferSchema {

  type CellType = Int

  
  private[excel] def inferField(typeSoFar: DataType, field: DataType): DataType = {
    // Defining a function to return the StringType constant is necessary in order to work around
    // a Scala compiler issue which leads to runtime incompatibilities with certain Spark versions;
    // see issue #128 for more details.
    def stringType(): DataType = {
      StringType
    }

    if (field == NullType) {
      typeSoFar
    } else {
      (typeSoFar, field) match {
        case (NullType, ct) => ct
        case (DoubleType, DoubleType) => DoubleType
        case (BooleanType, BooleanType) => BooleanType
        case (TimestampType, TimestampType) => TimestampType
        case (StringType, _) => stringType()
        case (_, _) => stringType()
      }
    }
  }


  private val numericPrecedence: IndexedSeq[DataType] =
    IndexedSeq[DataType](ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType, TimestampType)


  val findTightestCommonType: (DataType, DataType) => Option[DataType] = {
    case (t1, t2) if t1 == t2 => Some(t1)
    case (NullType, t1) => Some(t1)
    case (t1, NullType) => Some(t1)
    case (StringType, t2) => Some(StringType)
    case (t1, StringType) => Some(StringType)

    // Promote numeric types to the highest of the two and all numeric types to unlimited decimal
    case (t1, t2) if Seq(t1, t2).forall(numericPrecedence.contains) =>
      val index = numericPrecedence.lastIndexWhere(t => t == t1 || t == t2)
      Some(numericPrecedence(index))

    case _ => None
  }
} 
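Since InferSchema and findTightestCommonType are private[excel], the following sketch has to live in the same package; it only exercises the promotion rules defined above (the demo object name is made up):

package com.webank.wedatasphere.spark.excel

import org.apache.spark.sql.types._

object InferSchemaDemo extends App {
  // Equal types are kept as they are.
  println(InferSchema.findTightestCommonType(LongType, LongType))      // Some(LongType)
  // NullType defers to the other side.
  println(InferSchema.findTightestCommonType(NullType, DoubleType))    // Some(DoubleType)
  // Numeric types are promoted to the higher entry in numericPrecedence.
  println(InferSchema.findTightestCommonType(IntegerType, DoubleType)) // Some(DoubleType)
  // Anything mixed with StringType collapses to StringType.
  println(InferSchema.findTightestCommonType(StringType, DoubleType))  // Some(StringType)
}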
Example 60
Source File: DatabaseInteraction.scala    From reactive-machine-learning-systems   with MIT License 5 votes vote down vote up
package com.reactivemachinelearning

import com.couchbase.client.java.document.JsonDocument
import com.couchbase.client.java.view.ViewQuery
import com.couchbase.spark._
import com.reactivemachinelearning.FeatureGeneration.{IntFeature, BooleanFeature, Feature}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object DatabaseInteraction extends App {

  // Configure Spark
  val conf = new SparkConf()
    .setAppName("couchbaseQuickstart")
    .setMaster("local[*]")
    .set("com.couchbase.bucket.default", "")

  // Generate The Context
  val sc = new SparkContext(conf)

  val rawSquawks: RDD[JsonDocument] = sc.couchbaseView(
    ViewQuery.from("squawks", "by_squawk_id"))
    .map(_.id)
    .couchbaseGet[JsonDocument]()

  rawSquawks.foreach(println)


  def extract(rawSquawks: RDD[JsonDocument]): RDD[IntFeature] = {
    ???
  }

  def transform(inputFeatures: RDD[IntFeature]): RDD[BooleanFeature] = {
    ???
  }

  val trainableFeatures = transform(extract(rawSquawks))
} 
Example 61
Source File: TestableQueueInputDStream.scala    From SparkUnitTestingExamples   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.streaming

import java.io.{ObjectInputStream, ObjectOutputStream}

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.dstream.InputDStream

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

class TestableQueueInputDStream[T: ClassTag](
                                              ssc: StreamingContext,
                                              val queue: Queue[RDD[T]],
                                              oneAtATime: Boolean,
                                              defaultRDD: RDD[T]
                                              ) extends InputDStream[T](ssc) {

  override def start() { }

  override def stop() { }

  private def readObject(in: ObjectInputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        buffer ++= queue
        queue.clear()
      }
    }
    if (buffer.nonEmpty) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        Some(new UnionRDD(context.sc, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }

} 
Example 62
Source File: StreamingUnitTest.scala    From SparkUnitTestingExamples   with Apache License 2.0 5 votes vote down vote up
package com.cloudera.sa.spark.unittest.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}

import scala.collection.mutable.Queue

class StreamingUnitTest extends FunSuite
  with BeforeAndAfterEach with BeforeAndAfterAll {

  @transient var sc: SparkContext = null
  @transient var ssc: StreamingContext = null

  override def beforeAll(): Unit = {

    val envMap = Map[String,String](("Xmx", "512m"))

    val sparkConfig = new SparkConf()
    sparkConfig.set("spark.broadcast.compress", "false")
    sparkConfig.set("spark.shuffle.compress", "false")
    sparkConfig.set("spark.shuffle.spill.compress", "false")
    sparkConfig.set("spark.io.compression.codec", "lzf")
    sc = new SparkContext("local[2]", "unit test", sparkConfig)
    ssc = new StreamingContext(sc, Milliseconds(200))
  }

  override def afterAll(): Unit = {
    sc.stop()
  }

  test("Streaming word count") {

    val firstBatchRDD = sc.parallelize(Seq("a", "b", "c"))
    val secondBatchRDD = sc.parallelize(Seq("a", "e"))
    val thirdBatchRDD = sc.parallelize(Seq("b", "c", "e", "f"))
    val forthBatchRDD = sc.parallelize(Seq("a", "e"))

    val queue = new Queue[RDD[String]]

    queue.+=(firstBatchRDD)
    queue.+=(secondBatchRDD)
    queue.+=(thirdBatchRDD)
    queue.+=(forthBatchRDD)

    println(queue)

    val startTime = System.currentTimeMillis()

    val dstream = new TestableQueueInputDStream(ssc, queue, true, sc.makeRDD(Seq[String](), 1))
    //ssc.queueStream(queue)

    dstream.checkpoint(Seconds(100))

    val batchTotals:DStream[(String, Int)] = dstream.map(r => (r, 1)).reduceByKey(_ + _)

    val streamTotals = batchTotals.updateStateByKey(
      (seq:Seq[Int], opt:Option[Int]) => {
        if (!seq.isEmpty) {
          val totalCountForNew = seq.reduce(_ + _)
          if (opt.isEmpty) {
            Option(totalCountForNew)
          } else {
            Option(opt.get + totalCountForNew)
          }
        } else {
          opt
        }
    })

    streamTotals.foreachRDD(rdd => {

    })

    ssc.checkpoint("./tmp")
    ssc.start()
    ssc.awaitTerminationOrTimeout(2000)

    val endTime = System.currentTimeMillis()

    val rddList = streamTotals.slice(new Time(startTime), new Time(endTime))

    rddList(0).collect().foreach(println)
    assert(rddList(0).collect().filter(r => r._1.equals("a"))(0)._2 == 1)
    rddList(1).collect().foreach(println)
    assert(rddList(1).collect().filter(r => r._1.equals("a"))(0)._2  == 2)
    rddList(2).collect().foreach(println)
    assert(rddList(2).collect().filter(r => r._1.equals("a"))(0)._2  == 2)
    rddList(3).collect().foreach(println)
    assert(rddList(3).collect().filter(r => r._1.equals("a"))(0)._2  == 3)
  }
} 
Example 63
Source File: SparkCassRDDFunctions.scala    From Spark2Cassandra   with Apache License 2.0 5 votes vote down vote up
package com.github.jparkie.spark.cassandra.rdd

import com.datastax.spark.connector.cql.CassandraConnector
import com.datastax.spark.connector.mapper.ColumnMapper
import com.datastax.spark.connector.writer.{ DefaultRowWriter, RowWriterFactory }
import com.datastax.spark.connector.{ AllColumns, ColumnSelector }
import com.github.jparkie.spark.cassandra.SparkCassBulkWriter
import com.github.jparkie.spark.cassandra.conf.{ SparkCassServerConf, SparkCassWriteConf }
import org.apache.spark.rdd.RDD

import scala.reflect.runtime.universe._


  def bulkLoadToCass(
    keyspaceName:        String,
    tableName:           String,
    columns:             ColumnSelector      = AllColumns,
    sparkCassWriteConf:  SparkCassWriteConf  = SparkCassWriteConf.fromSparkConf(internalSparkContext.getConf),
    sparkCassServerConf: SparkCassServerConf = SparkCassServerConf.fromSparkConf(internalSparkContext.getConf)
  )(implicit
    connector: CassandraConnector = CassandraConnector(internalSparkContext.getConf),
    rwf: RowWriterFactory[T] = DefaultRowWriter.factory[T]): Unit = {
    val sparkCassBulkWriter = SparkCassBulkWriter(
      connector,
      keyspaceName,
      tableName,
      columns,
      sparkCassWriteConf,
      sparkCassServerConf
    )

    internalSparkContext.runJob(rdd, sparkCassBulkWriter.write _)
  }
} 
Example 64
Source File: PointCloudRelation.scala    From geotrellis-pointcloud   with Apache License 2.0 5 votes vote down vote up
package geotrellis.pointcloud.spark.datasource

import geotrellis.pointcloud.spark.store.hadoop._
import geotrellis.pointcloud.spark.store.hadoop.HadoopPointCloudRDD.{Options => HadoopOptions}
import geotrellis.pointcloud.util.Filesystem
import geotrellis.proj4.CRS
import geotrellis.store.hadoop.util.HdfsUtils
import geotrellis.vector.Extent

import cats.implicits._
import io.pdal._
import io.circe.syntax._
import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SQLContext}

import java.io.File

import scala.collection.JavaConverters._

// This class has to be serializable since it is shipped over the network.
class PointCloudRelation(
  val sqlContext: SQLContext,
  path: String,
  options: HadoopOptions
) extends BaseRelation with TableScan with Serializable {

  @transient implicit lazy val sc: SparkContext = sqlContext.sparkContext

  // TODO: switch between HadoopPointCloudRDD and S3PointCloudRDD
  lazy val isS3: Boolean = path.startsWith("s3")

  override def schema: StructType = {
    lazy val (local, fixedPath) =
      if(path.startsWith("s3") || path.startsWith("hdfs")) {
        val tmpDir = Filesystem.createDirectory()
        val remotePath = new Path(path)
        // copy remote file into local tmp dir
        val localPath = new File(tmpDir, remotePath.getName)
        HdfsUtils.copyPath(remotePath, new Path(s"file:///${localPath.getAbsolutePath}"), sc.hadoopConfiguration)
        (true, localPath.toString)
      } else (false, path)

    val localPipeline =
      options.pipeline
        .hcursor
        .downField("pipeline").downArray
        .downField("filename").withFocus(_ => fixedPath.asJson)
        .top.fold(options.pipeline)(identity)

    val pl = Pipeline(localPipeline.noSpaces)
    if (pl.validate()) pl.execute()
    val pointCloud = try {
      pl.getPointViews().next().getPointCloud(0)
    } finally {
      pl.close()
      if(local) println(new File(fixedPath).delete)
    }

    val rdd = HadoopPointCloudRDD(new Path(path), options)

    val md: (Option[Extent], Option[CRS]) =
      rdd
        .map { case (header, _) => (header.projectedExtent3D.map(_.extent3d.toExtent), header.crs) }
        .reduce { case ((e1, c), (e2, _)) => ((e1, e2).mapN(_ combine _), c) }

    val metadata = new MetadataBuilder().putString("metadata", md.asJson.noSpaces).build

    pointCloud.deriveSchema(metadata)
  }

  override def buildScan(): RDD[Row] = {
    val rdd = HadoopPointCloudRDD(new Path(path), options)
    rdd.flatMap { _._2.flatMap { pc => pc.readAll.toList.map { k => Row(k: _*) } } }
  }
} 
Example 65
Source File: PointCloudToDem.scala    From geotrellis-pointcloud   with Apache License 2.0 5 votes vote down vote up
package geotrellis.pointcloud.spark.dem

import io.pdal._
import geotrellis.layer._
import geotrellis.raster._
import geotrellis.spark._
import geotrellis.util._
import geotrellis.vector._

import org.apache.spark.rdd.RDD

object PointCloudToDem {
  def apply[M: GetComponent[*, LayoutDefinition]](rdd: RDD[(SpatialKey, PointCloud)] with Metadata[M], tileDimensions: (Int, Int), options: PointToGrid.Options): RDD[(SpatialKey, Tile)] with Metadata[LayoutDefinition] =
    apply[M](rdd, options) { e => RasterExtent(e, tileDimensions._1, tileDimensions._2) }

  def apply[M: GetComponent[*, LayoutDefinition]](rdd: RDD[(SpatialKey, PointCloud)] with Metadata[M], cellSize: CellSize, options: PointToGrid.Options): RDD[(SpatialKey, Tile)] with Metadata[LayoutDefinition] =
   apply[M](rdd, options) { e => RasterExtent(e, cellSize) }

  def apply[M: GetComponent[*, LayoutDefinition]](rdd: RDD[(SpatialKey, PointCloud)] with Metadata[M], options: PointToGrid.Options)(createRE: Extent => RasterExtent): RDD[(SpatialKey, Tile)] with Metadata[LayoutDefinition] = {
    val layoutDefinition = rdd.metadata.getComponent[LayoutDefinition]
    val mapTransform = layoutDefinition.mapTransform

    val result =
      rdd
        .collectNeighbors
        .mapPartitions({ partition =>
          partition.map { case (key, neighbors) =>
            val extent = mapTransform(key)
            val raster =
              PointToGrid.createRaster(neighbors.map(_._2._2), createRE(extent), options)
            (key, raster.tile)
          }
        }, preservesPartitioning = true)

    ContextRDD(result, layoutDefinition)
  }
} 
Example 66
Source File: BufferUnionable.scala    From geotrellis-pointcloud   with Apache License 2.0 5 votes vote down vote up
package geotrellis.pointcloud.spark.buffer

import geotrellis.layer._

import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

object BufferUnionable {

  
  def apply[
    K: SpatialComponent,
    X <: { def union(other: Any): V },
    V: (? => X) : ClassTag
  ](rdd: RDD[(K, V)]): RDD[(K, V)] = {
    rdd
      .flatMap({ case (key, data) =>
        val SpatialKey(col, row) = key

        for (deltaX <- -1 to +1; deltaY <- -1 to +1) yield {
          if (deltaX == 0 && deltaY == 0)
            (SpatialKey(col + deltaX, row + deltaY), (key, data, true))
          else
            (SpatialKey(col + deltaX, row + deltaY), (key, data, false))
        }
      })
      .groupByKey
      .filter({ case (_, seq) => seq.exists { case (_, _, center) => center } })
      .map({ case (sortKey, seq) =>
        val resultKey = seq.filter({ case (_, _, center) => center }).head._1
        val resultValue = seq.map({ case (_, data, _) => data }).reduce(_ union _)

        (resultKey, resultValue)
      })
  }

} 
Example 67
Source File: HadoopPointCloudRDD.scala    From geotrellis-pointcloud   with Apache License 2.0 5 votes vote down vote up
package geotrellis.pointcloud.spark.store.hadoop

import geotrellis.pointcloud.spark.store.hadoop.formats._
import geotrellis.store.hadoop._
import geotrellis.vector.Extent

import io.circe.Json
import io.pdal._
import io.pdal.pipeline._
import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD


  def apply(path: Path, options: Options = Options.DEFAULT)(implicit sc: SparkContext): RDD[(HadoopPointCloudHeader, List[PointCloud])] = {
    val conf = sc.hadoopConfiguration.withInputDirectory(path, options.filesExtensions)

    options.tmpDir.foreach(PointCloudInputFormat.setTmpDir(conf, _))
    options.dimTypes.foreach(PointCloudInputFormat.setDimTypes(conf, _))
    PointCloudInputFormat.setPipeline(conf, options.pipeline)

    options.filterExtent match {
      case Some(filterExtent) =>
        PointCloudInputFormat.setFilterExtent(conf, filterExtent)

        sc.newAPIHadoopRDD(
          conf,
          classOf[PointCloudInputFormat],
          classOf[HadoopPointCloudHeader],
          classOf[List[PointCloud]]
        ).filter { case (header, _) =>
          header.extent3D.map(_.toExtent.intersects(filterExtent)).getOrElse(false)
        }
      case None =>
        sc.newAPIHadoopRDD(
          conf,
          classOf[PointCloudInputFormat],
          classOf[HadoopPointCloudHeader],
          classOf[List[PointCloud]]
        )
    }
  }
} 
Example 68
Source File: S3PointCloudRDD.scala    From geotrellis-pointcloud   with Apache License 2.0 5 votes vote down vote up
package geotrellis.pointcloud.spark.store.s3

import geotrellis.pointcloud.spark.store.hadoop.formats.PointCloudInputFormat
import geotrellis.spark.store.s3._
import geotrellis.store.s3.S3ClientProducer
import geotrellis.vector.Extent
import io.circe._
import io.pdal._
import io.pdal.pipeline._
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import software.amazon.awssdk.services.s3.S3Client


  def apply(bucket: String, prefix: String, options: Options = Options.DEFAULT)(implicit sc: SparkContext): RDD[(S3PointCloudHeader, List[PointCloud])] = {
    val conf = sc.hadoopConfiguration

    S3InputFormat.setBucket(conf, bucket)
    S3InputFormat.setPrefix(conf, prefix)
    S3InputFormat.setExtensions(conf, options.filesExtensions)
    S3InputFormat.setCreateS3Client(conf, options.getClient)
    options.numPartitions.foreach(S3InputFormat.setPartitionCount(conf, _))
    options.partitionBytes.foreach(S3InputFormat.setPartitionBytes(conf, _))

    options.tmpDir.foreach(PointCloudInputFormat.setTmpDir(conf, _))
    options.dimTypes.foreach(PointCloudInputFormat.setDimTypes(conf, _))
    PointCloudInputFormat.setPipeline(conf, options.pipeline)

    options.filterExtent match {
      case Some(filterExtent) =>
        PointCloudInputFormat.setFilterExtent(conf, filterExtent)

        sc.newAPIHadoopRDD(
          conf,
          classOf[S3PointCloudInputFormat],
          classOf[S3PointCloudHeader],
          classOf[List[PointCloud]]
        ).filter { case (header, _) => header.extent3D.exists(_.toExtent.intersects(filterExtent)) }
      case None =>
        sc.newAPIHadoopRDD(
          conf,
          classOf[S3PointCloudInputFormat],
          classOf[S3PointCloudHeader],
          classOf[List[PointCloud]]
        )
    }
  }
} 
Example 69
Source File: MlLibOnKudu.scala    From Taxi360   with Apache License 2.0 5 votes vote down vote up
package com.hadooparchitecturebook.taxi360.etl.machinelearning.kudu

import com.hadooparchitecturebook.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MlLibOnKudu {
  def main(args: Array[String]): Unit = {

    if (args.length == 0) {
      println("Args: <runLocal> " +
        "<kuduMaster> " +
        "<taxiTable> " +
        "<numOfCenters> " +
        "<numOfIterations> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val taxiTable = args(2)
    val numOfCenters = args(3).toInt
    val numOfIterations = args(4).toInt

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val sqlContext = new SQLContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> taxiTable,
      "kudu.master" -> kuduMaster)

    sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load.
      registerTempTable("ny_taxi_trip_tmp")

    //Vector
    val vectorRDD:RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => {
      val taxiTrip = NyTaxiYellowTripBuilder.build(r)
      generateVectorOnly(taxiTrip)
    })

    println("--Running KMeans")
    val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations)
    println(" > vector centers:")
    clusters.clusterCenters.foreach(v => println(" >> " + v))

    println("--Running corr")
    val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson")
    println(" > corr: " + correlMatrix.toString)

    println("--Running colStats")
    val colStats = Statistics.colStats(vectorRDD)
    println(" > max: " + colStats.max)
    println(" > count: " + colStats.count)
    println(" > mean: " + colStats.mean)
    println(" > min: " + colStats.min)
    println(" > normL1: " + colStats.normL1)
    println(" > normL2: " + colStats.normL2)
    println(" > numNonZeros: " + colStats.numNonzeros)
    println(" > variance: " + colStats.variance)

    //Labeled Points
    
} 
Example 70
Source File: SolRSupport.scala    From Taxi360   with Apache License 2.0 5 votes vote down vote up
package com.hadooparchitecturebook.taxi360.streaming.ingestion.solr

import java.net.{ConnectException, SocketException}
import java.util

import org.apache.solr.client.solrj.impl.CloudSolrServer
import org.apache.solr.client.solrj.request.UpdateRequest
import org.apache.solr.common.{SolrException, SolrInputDocument}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream


object SolRSupport {
  def indexDStreamOfDocs(zkHost:String,
                         collection:String,
                         batchSize:Int,
                         docDStream:DStream[SolrInputDocument]): Unit ={
    docDStream.foreachRDD(docRdd => {
      indexDoc(zkHost, collection, batchSize, docRdd)
    })
  }

  def indexDoc(zkHost:String,
               collection:String,
               batchSize:Int,
               docRdd:RDD[SolrInputDocument]): Unit = {
    docRdd.foreachPartition(it => {
      val solrServer = CloudSolRServerBuilder.build(zkHost)

      val batch = new util.ArrayList[SolrInputDocument]()

      while (it.hasNext) {
        val inputDoc = it.next()
        batch.add(inputDoc)
        if (batch.size() >= batchSize)
          sendBatchToSolr(solrServer, collection, batch)
      }
      if (!batch.isEmpty())
        sendBatchToSolr(solrServer, collection, batch)
    })
  }

  def sendBatchToSolr( solrServer: CloudSolrServer,
                       collection:String,
                       batch:util.Collection[SolrInputDocument]) {
    val req = new UpdateRequest()
    req.setParam("collection", collection)

    req.add(batch)
    try {
      solrServer.request(req)
    } catch  {
      case e:Exception => {
        if (shouldRetry(e)) {
          try {
            Thread.sleep(2000)
          } catch {
            case e1: InterruptedException => {
              Thread.interrupted()
            }
          }

          try {
            solrServer.request(req)
          } catch {
            case e1: Exception => {

              if (e1.isInstanceOf[RuntimeException]) {
                throw e1.asInstanceOf[RuntimeException]
              } else {
                throw new RuntimeException(e1)
              }
            }
          }
        } else {
          if (e.isInstanceOf[RuntimeException]) {
            throw e.asInstanceOf[RuntimeException]
          } else {
            throw new RuntimeException(e)
          }
        }
      }
    } finally {
      batch.clear()
    }
  }

  def shouldRetry( exc:Exception): Boolean = {
    val rootCause = SolrException.getRootCause(exc)
    rootCause.isInstanceOf[ConnectException] ||
      rootCause.isInstanceOf[SocketException]
  }
} 
Example 71
Source File: HBaseSQLTableScan.scala    From Backup-Repo   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hbase.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical.RangePartitioning
import org.apache.spark.sql.execution.LeafNode
import org.apache.spark.sql.hbase._


@DeveloperApi
case class HBaseSQLTableScan(
                              relation: HBaseRelation,
                              output: Seq[Attribute],
                              result: RDD[Row]) extends LeafNode {
  override def outputPartitioning = {
    var ordering = List[SortOrder]()
    for (key <- relation.partitionKeys) {
      ordering = ordering :+ SortOrder(key, Ascending)
    }
    RangePartitioning(ordering.toSeq, relation.partitions.size)
  }

  override protected def doExecute(): RDD[Row] = result
} 
Example 72
Source File: HBaseShuffledRDD.scala    From Backup-Repo   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hbase

import org.apache.spark._
import org.apache.spark.rdd.{RDD, ShuffledRDD, ShuffledRDDPartition}

class HBaseShuffledRDD (
    prevRdd: RDD[(HBaseRawType, Array[HBaseRawType])],
    part: Partitioner,
    @transient hbPartitions: Seq[HBasePartition] = Nil) extends ShuffledRDD(prevRdd, part){

  override def getPartitions: Array[Partition] = {
    if (hbPartitions==null || hbPartitions.isEmpty) {
      Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
    } else {
      // only to be invoked by clients
      hbPartitions.toArray
    }
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    if (hbPartitions==null || hbPartitions.isEmpty) {
      Seq.empty
    } else {
      split.asInstanceOf[HBasePartition].server.map {
        identity[String]
      }.toSeq
    }
  }
} 
Example 73
Source File: RDFS11.scala    From SparkSRE   with Apache License 2.0 5 votes vote down vote up
package com.hj.examples

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object RDFS11 {
  def transitive(rdd:RDD[(String, String)]) = {
    var rddTuple = rdd
    val reverseTuple = rddTuple.map(x => (x._2, x._1))

    var cur = 0L
    var pre = rddTuple.count
    var flag = true
    while (flag) {
      val joined = reverseTuple.join(rddTuple)
      val res = joined.map(x => x._2)
      rddTuple = rddTuple.union(res).distinct
      cur = rddTuple.count
      if(pre == cur) flag = false
      pre = cur
    }
    rddTuple
  }

  def main(args: Array[String]): Unit = {
    if(args.length != 2) {
      System.out.println("Arguments are invalid! \nExample: <input_path> <output_path>")
      System.exit(1)
    }
    val inputPath = args(0)
    val outputPath = args(1)

    val conf = new SparkConf().setAppName("RDFS11").setMaster("local[2]")
    val sc = new SparkContext(conf)

    val lines = sc.textFile(inputPath)

    val triples = lines.map(x => {
      val arr = x.split(" ")
      (arr(0), arr(1), arr(2))
    })

    

    var subClass = triples.filter(x => x._2.equals("rdfs:subClassOf")).map(x => (x._1, x._3))
    subClass = transitive(subClass)

    subClass.foreach(x => println(x))
    subClass.saveAsTextFile(outputPath)
  }
} 
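For reference, a minimal local run of the transitive helper above on a hypothetical two-pair input (the data and the active SparkContext sc are illustrative, not part of the original example):

// Hypothetical toy input: (A subClassOf B) and (B subClassOf C).
val toy = sc.parallelize(Seq(("A", "B"), ("B", "C")))
val closed = RDFS11.transitive(toy)
// The fixpoint loop derives the extra pair (A, C) and then stops,
// because a further join adds no new pairs.
closed.collect().foreach(println) // (A,B), (B,C), (A,C) in some order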
Example 74
Source File: RDFS5.scala    From SparkSRE   with Apache License 2.0 5 votes vote down vote up
package com.hj.examples

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object RDFS5 {
  def transitive(rdd:RDD[(String, String)]) = {
    var rddTuple = rdd
    val reverseTuple = rddTuple.map(x => (x._2, x._1))

    var cur = 0L
    var pre = rddTuple.count
    var flag = true
    while (flag) {
      val joined = reverseTuple.join(rddTuple)
      val res = joined.map(x => x._2)
      rddTuple = rddTuple.union(res).distinct
      cur = rddTuple.count
      if(pre == cur) flag = false
      pre = cur
    }
    rddTuple
  }

  def main(args: Array[String]): Unit = {
    if(args.length != 2) {
      System.out.println("Arguments are invalid! \nExample: <input_path> <output_path>")
      System.exit(1)
    }
    val inputPath = args(0)
    val outputPath = args(1)

    val conf = new SparkConf().setAppName("RDFS5").setMaster("local[2]")
    val sc = new SparkContext(conf)

    val lines = sc.textFile(inputPath)

    val triples = lines.map(x => {
      val arr = x.split(" ")
      (arr(0), arr(1), arr(2))
    })

    

    var subProp = triples.filter(x => x._2.equals("rdfs:subPropertyOf")).map(x => (x._1, x._3))
    subProp = transitive(subProp)

    subProp.foreach(x => println(x))
    subProp.saveAsTextFile(outputPath)
  }
} 
Example 75
Source File: DFConverter.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql

import com.twosigma.flint.rdd.OrderedRDD
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.StructType


object DFConverter {

  def newDataFrame(df: DataFrame): DataFrame = {
    new DataFrame(df.sparkSession, df.logicalPlan, RowEncoder(df.schema))
  }

  def toDataFrame(rdd: OrderedRDD[Long, InternalRow], schema: StructType): DataFrame = {
    val spark = SparkSession.builder().getOrCreate()
    val internalRows = rdd.values
    spark.internalCreateDataFrame(internalRows, schema)
  }

  def toDataFrame(rdd: RDD[InternalRow], schema: StructType): DataFrame = {
    val spark = SparkSession.builder().getOrCreate()
    spark.internalCreateDataFrame(rdd, schema)
  }

} 
Example 76
Source File: WeightedLabeledPoint.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.math.stats.regression

import breeze.linalg.DenseVector
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext

case class WeightedLabeledPoint(label: Double, weight: Double, features: DenseVector[Double]) {
  
  def generateSampleData(sc: SparkContext, weights: DenseVector[Double], intercept: Double,
    numRows: Long = 100L, numPartitions: Int = 4, errorScalar: Double = 1.0,
    seed: Long = 1L): RDD[WeightedLabeledPoint] = {
    val len = weights.length + 2
    // The last entry will serve as the weight of point and the second last entry will serve
    // as noisy of the label.
    val data = RandomRDDs.normalVectorRDD(sc, numRows, len, numPartitions, seed)
    data.map { d =>
      val fw = d.toArray
      val x = new DenseVector(fw.dropRight(2))
      WeightedLabeledPoint(
        weights.dot(x) + intercept + errorScalar * fw(len - 2),
        Math.abs(fw(len - 1)) + 0.5, x
      )
    }
  }
} 
Example 77
Source File: OLSMultipleLinearRegression.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.math.stats.regression

import org.apache.spark.rdd.RDD
import breeze.linalg.{ DenseMatrix, DenseVector }

object OLSMultipleLinearRegression {

  
  def regression(input: RDD[WeightedLabeledPoint], intercept: Boolean = true): LinearRegressionModel = {
    // Try to get the number of columns
    val nCols = if (intercept) {
      input.first.features.length + 1
    } else {
      input.first.features.length
    }

    val (xx, xy, swx, srwsl, ssrw, wsl, sw, n, lw) = input.treeAggregate((
      new DenseMatrix[Double](nCols, nCols), // 1. Calculate a k-by-k matrix X^TX.
      new DenseVector[Double](nCols), // 2. Calculate a k-dimension vector X^Ty.
      new DenseVector[Double](nCols), // 3. Calculate a k-dimension vector of weighted sum of X.
      0.0, // 4. Calculate the square root weighted sum of labels.
      0.0, // 5. Calculate the sum of square root of weights.
      0.0, // 6. Calculate the weighted sum of labels.
      0.0, // 7. Calculate the sum of weights.
      0: Long, // 8. Calculate the length of input.
      0.0 // 9. Calculate sum of log weights
    ))(
      // U is a pair of matrix and vector and v is a WeightedLabeledPoint.
      seqOp = (U, v) => {
      // Append 1.0 at the head for calculating intercept.
      val x = if (intercept) {
        DenseVector.vertcat(DenseVector(1.0), v.features)
      } else {
        v.features
      }
      val wx = x * v.weight
      val sqrtW = math.sqrt(v.weight)
      // Unfortunately, breeze.linalg.DenseVector does not support tensor product.
      (U._1 += wx.asDenseMatrix.t * x.asDenseMatrix,
        U._2 += wx * v.label,
        U._3 += wx,
        U._4 + v.label * sqrtW,
        U._5 + sqrtW,
        U._6 + v.label * v.weight,
        U._7 + v.weight,
        U._8 + 1,
        U._9 + math.log(v.weight))
    }, combOp = (U1, U2) => (
      U1._1 += U2._1,
      U1._2 += U2._2,
      U1._3 += U2._3,
      U1._4 + U2._4,
      U1._5 + U2._5,
      U1._6 + U2._6,
      U1._7 + U2._7,
      U1._8 + U2._8,
      U1._9 + U2._9
    )
    )
    LinearRegressionModel(input, intercept, n, (xx + xx.t) :/ 2.0, xy, swx, srwsl, ssrw, wsl, sw, lw)
  }
} 
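A short usage sketch for the regression entry point above; the points and the SparkContext sc are hypothetical, and LinearRegressionModel is the project's own result type:

// Hypothetical input, assuming an active SparkContext `sc`.
import breeze.linalg.DenseVector
val points = sc.parallelize(Seq(
  WeightedLabeledPoint(label = 3.1, weight = 1.0, features = DenseVector(1.0, 2.0)),
  WeightedLabeledPoint(label = 5.2, weight = 2.0, features = DenseVector(2.0, 3.0)),
  WeightedLabeledPoint(label = 6.9, weight = 0.5, features = DenseVector(3.0, 4.0))
))
// A single treeAggregate pass collects X^TX, X^Ty and the weight statistics.
val model = OLSMultipleLinearRegression.regression(points, intercept = true)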
Example 78
Source File: PartitionsIterator.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.rdd

import grizzled.slf4j.Logger

import org.apache.spark.rdd.RDD
import org.apache.spark.{ Partition, TaskContext }

protected[flint] object PartitionsIterator {
  val logger = Logger(PartitionsIterator.getClass)

  def apply[T](
    rdd: RDD[T],
    partitions: Seq[Partition],
    context: TaskContext,
    preservesPartitionsOrdering: Boolean = false // FIXME: This is a band-aid which should be fixed.
  ): PartitionsIterator[T] = new PartitionsIterator(rdd, partitions, context, preservesPartitionsOrdering)
}

// The class declaration and body are omitted in this listing; below is a minimal
// reconstruction (an assumption, not the project's exact code) consistent with the
// factory above: it iterates lazily over the rows of the given partitions, in order,
// and exposes the index of the partition holding the next element.
class PartitionsIterator[T](
  rdd: RDD[T],
  partitions: Seq[Partition],
  context: TaskContext,
  val preservesPartitionsOrdering: Boolean = false
) extends Iterator[T] {
  private var partIdx = 0
  private def curPart: Partition = partitions(partIdx)
  private var curIter: Iterator[T] =
    if (partitions.isEmpty) Iterator.empty else rdd.iterator(curPart, context)

  // Advance to the next non-empty partition, if any.
  private def advance(): Unit =
    while (!curIter.hasNext && partIdx < partitions.length - 1) {
      partIdx += 1
      curIter = rdd.iterator(curPart, context)
    }

  override def hasNext: Boolean = { advance(); curIter.hasNext }

  override def next(): T = { advance(); curIter.next() }

  def headPartitionIndex: Int = curPart.index
}
Example 79
Source File: TreeReduce.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.rdd.function.summarize

import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

object TreeReduce {

  
  def apply[T: ClassTag](
    rdd: RDD[T]
  )(
    f: (T, T) => T,
    depth: Int = 2
  ): T = {
    require(depth >= 1, s"Depth must be greater than or equal to 1 but got $depth.")

    val reducePartition: Iterator[T] => Option[T] = iter => {
      if (iter.hasNext) {
        Some(iter.reduceLeft(f))
      } else {
        None
      }
    }

    val partiallyReduced = rdd.mapPartitions(it => Iterator(reducePartition(it)))

    val op: (Option[T], Option[T]) => Option[T] = (c, x) => {
      if (c.isDefined && x.isDefined) {
        Some(f(c.get, x.get))
      } else if (c.isDefined) {
        c
      } else if (x.isDefined) {
        x
      } else {
        None
      }
    }

    TreeAggregate(partiallyReduced)(Option.empty[T], op, op, depth).getOrElse(
      sys.error("Empty collection.")
    )
  }

} 
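A brief usage sketch of the reducer above (the RDD and sc are hypothetical); it behaves like RDD.reduce but combines the per-partition results in a tree of the given depth:

// Hypothetical usage, assuming an active SparkContext `sc`.
val xs = sc.parallelize(1 to 1000, numSlices = 16)
val total = TreeReduce(xs)(_ + _, depth = 3) // same value as xs.reduce(_ + _), i.e. 500500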
Example 80
Source File: PythonUtils.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.rdd

import com.twosigma.flint.timeseries.{ TimeSeriesRDD, TimeSeriesRDDImpl }
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types._
import org.apache.spark.sql.{ CatalystTypeConvertersWrapper, Row }

private[rdd] case class SchemaColumnInfo(idx: Int, clazz: Class[_ <: Ordered[_]], dataType: DataType)

case class TimeSeriesRDDWithSchema(rdd: TimeSeriesRDDImpl, schema: StructType)

object PythonUtils {
  
  def fromUnsortedRDD(
    sc: SparkContext,
    rdd: RDD[Row],
    schema: StructType,
    keyColumn: String
  ): TimeSeriesRDDImpl = {
    val orderedRdd = OrderedRDD.fromRDD(formatRDD[Long](rdd, schema, keyColumn), KeyPartitioningType.UnSorted)
    TimeSeriesRDD.fromOrderedRDD(orderedRdd, schema).asInstanceOf[TimeSeriesRDDImpl]
  }

  def toOrderedRDD(
    rdd: RDD[Row],
    schema: StructType,
    keyColumn: String,
    ranges: Seq[CloseOpen[Long]]
  ): OrderedRDD[Long, InternalRow] = {
    val keyIdx = schema.fieldIndex(keyColumn)
    val converter = CatalystTypeConvertersWrapper.toCatalystRowConverter(schema)
    OrderedRDD.fromRDD(rdd.map(row => (row.getAs[Long](keyIdx), converter(row))), ranges)
  }
} 
Example 81
Source File: TimeSeriesRDDConversionSpec.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.timeseries

import java.util.concurrent.TimeUnit

import com.twosigma.flint.timeseries.row.Schema
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{ SQLContext, DataFrame, Row }
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.expressions.{ GenericRowWithSchema => ExternalRow }
import org.scalatest.tagobjects.Slow

class TimeSeriesRDDConversionSpec extends TimeSeriesSuite {

  // The largest prime < 100
  override val defaultPartitionParallelism = 97

  // The 10000-th prime.
  private val defaultNumRows = 104729

  private def createDataFrame(isSorted: Boolean = true)(implicit sqlContext: SQLContext): DataFrame = {
    val n = defaultNumRows
    val schema = Schema("value" -> DoubleType)
    val rdd: RDD[Row] = sqlContext.sparkContext.parallelize(1 to n, defaultPartitionParallelism).map { i =>
      val data: Array[Any] = if (isSorted) {
        Array((i / 100).toLong, i.toDouble)
      } else {
        Array(((i + 1 - n) / 100).toLong, i.toDouble)
      }
      new ExternalRow(data, schema)
    }
    sqlContext.createDataFrame(rdd, schema)
  }

  "TimeSeriesRDD" should "convert from a sorted DataFrame correctly" taggedAs (Slow) in {
    implicit val _sqlContext = sqlContext
    (1 to 10).foreach {
      i =>
        val tsRdd = TimeSeriesRDD.fromDF(createDataFrame(isSorted = true))(isSorted = true, TimeUnit.NANOSECONDS)
        assert(tsRdd.count() == defaultNumRows)
    }
    (1 to 10).foreach {
      i =>
        val tsRdd = TimeSeriesRDD.fromDF(createDataFrame(isSorted = true))(isSorted = false, TimeUnit.NANOSECONDS)
        assert(tsRdd.count() == defaultNumRows)
    }
    (1 to 10).foreach {
      i =>
        val tsRdd = TimeSeriesRDD.fromDF(createDataFrame(isSorted = false))(isSorted = false, TimeUnit.NANOSECONDS)
        assert(tsRdd.count() == defaultNumRows)
    }
    (1 to 10).foreach {
      i =>
        val tsRdd = TimeSeriesRDD.fromDF(
          createDataFrame(isSorted = false).sort("time")
        )(
            isSorted = true, TimeUnit.NANOSECONDS
          )
        assert(tsRdd.count() == defaultNumRows)
    }
  }
} 
Example 82
Source File: ParallelCollectionRDD.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.rdd

import org.apache.spark.rdd.RDD
import org.apache.spark.{ Partition, SparkContext, TaskContext }

import scala.reflect.ClassTag



case class ParallelCollectionRDDPartition[T: ClassTag](
  override val index: Int,
  values: Seq[T]
) extends Partition

class ParallelCollectionRDD[T: ClassTag](
  sc: SparkContext,
  @transient data: Seq[Seq[T]]
) extends RDD[T](sc, Nil) {
  override def compute(split: Partition, context: TaskContext): Iterator[T] =
    split.asInstanceOf[ParallelCollectionRDDPartition[T]].values.iterator

  override protected def getPartitions: Array[Partition] =
    data.zipWithIndex.map {
      case (d, index) =>
        ParallelCollectionRDDPartition(index, d)
    }.toArray
} 
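A short usage sketch (hypothetical data, assuming an active SparkContext sc); unlike sc.parallelize, each inner sequence here becomes exactly one partition:

val rdd = new ParallelCollectionRDD(sc, Seq(Seq(1, 2), Seq(3), Seq(4, 5, 6)))
rdd.getNumPartitions // 3, one per inner Seq
rdd.collect()        // Array(1, 2, 3, 4, 5, 6)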
Example 83
Source File: OverlappedOrderedRDDSpec.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.rdd

import com.twosigma.flint.SharedSparkContext
import org.apache.spark.rdd.RDD
import org.scalatest.FlatSpec

class OverlappedOrderedRDDSpec extends FlatSpec with SharedSparkContext {

  val numSlices: Int = 3

  val sliceLength: Int = 4

  var rdd: RDD[(Int, Int)] = _

  var orderedRdd: OrderedRDD[Int, Int] = _

  var overlappedOrderedRdd: OverlappedOrderedRDD[Int, Int] = _

  private def window(t: Int): (Int, Int) = (t - 2, t)

  override def beforeAll() {
    super.beforeAll()
    val s = sliceLength
    rdd = sc.parallelize(0 until numSlices, numSlices).flatMap {
      i => (1 to s).map { j => i * s + j }
    }.map { x => (x, x) }
    orderedRdd = OrderedRDD.fromRDD(rdd, KeyPartitioningType.Sorted)
    overlappedOrderedRdd = OverlappedOrderedRDD(orderedRdd, window)
  }

  "The OverlappedOrderedRDD" should "be constructed from `OrderedRDD` correctly" in {
    assert(overlappedOrderedRdd.rangeSplits.deep == orderedRdd.rangeSplits.deep)
    val benchmark = Array(1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 8, 9, 10, 11, 12).map { x => (x, x) }
    assert(overlappedOrderedRdd.collect().deep == benchmark.deep)
  }

  it should "be able to remove overlapped rows to get an `OrderedRDD` correctly" in {
    assert(overlappedOrderedRdd.rangeSplits.deep == orderedRdd.rangeSplits.deep)
    assert(overlappedOrderedRdd.nonOverlapped().collect().deep == orderedRdd.collect().deep)
  }

  it should "`mapPartitionsWithIndexOverlapped` correctly" in {
    val mapped = overlappedOrderedRdd.mapPartitionsWithIndexOverlapped(
      (index, iterator) => iterator.map { case (k, v) => (k, v * 2) }
    )
    val benchmark = Array(1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9, 8, 9, 10, 11, 12).map { x => (x, 2 * x) }
    assert(mapped.collect().deep == benchmark.deep)
  }
} 
Example 84
Source File: RDDKafkaWriter.scala    From spark-kafka-writer   with Apache License 2.0 5 votes vote down vote up
package com.github.benfradet.spark.kafka.writer

import org.apache.kafka.clients.producer.{Callback, ProducerRecord}
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

// The class declaration is omitted in this listing; a minimal wrapper consistent
// with the imports and the override below (KafkaWriter[T] is the library's abstract
// base class) would look like this:
class RDDKafkaWriter[T: ClassTag](@transient private val rdd: RDD[T])
    extends KafkaWriter[T] with Serializable {

  override def writeToKafka[K, V](
    producerConfig: Map[String, Object],
    transformFunc: T => ProducerRecord[K, V],
    callback: Option[Callback] = None
  ): Unit =
    rdd.foreachPartition { partition =>
      val producer = KafkaProducerCache.getProducer[K, V](producerConfig)
      partition
        .map(transformFunc)
        .foreach(record => producer.send(record, callback.orNull))
    }
} 
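A hedged usage sketch mirroring the DStream spec in the next example; the topic name and producer settings are illustrative, and the implicit conversion onto the RDD is assumed to come from the library's KafkaWriter companion import:

// Hypothetical usage, assuming an active SparkContext `sc` and a reachable broker.
import com.github.benfradet.spark.kafka.writer.KafkaWriter._
import org.apache.kafka.clients.producer.ProducerRecord
val producerConfig = Map(
  "bootstrap.servers" -> "localhost:9092",
  "key.serializer" -> "org.apache.kafka.common.serialization.StringSerializer",
  "value.serializer" -> "org.apache.kafka.common.serialization.StringSerializer"
)
sc.parallelize(Seq("a", "b", "c")).writeToKafka(
  producerConfig,
  s => new ProducerRecord[String, String]("my-topic", s)
)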
Example 85
Source File: DStreamKafkaWriterSpec.scala    From spark-kafka-writer   with Apache License 2.0 5 votes vote down vote up
package com.github.benfradet.spark.kafka.writer

import org.apache.kafka.clients.producer._
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream

import scala.collection.mutable
import scala.concurrent.duration._

class DStreamKafkaWriterSpec extends SKRSpec {

  "a DStreamKafkaWriter" when {
    "given a dstream" should {
      "write its content to Kafka" in {
        val localTopic = topic
        val msgs = (1 to 10).map(_.toString)
        val stream = createDStream(msgs)
        stream.writeToKafka(
          producerConfig,
          s => new ProducerRecord[String, String](localTopic, s)
        )

        val results = collect(ssc, localTopic)

        ssc.start()
        eventually(timeout(30.seconds), interval(1.second)) {
          results shouldBe msgs
        }
      }

      "trigger a given callback for every write to Kafka" in {
        val localTopic = topic
        val msgs = (1 to 10).map(_.toString)
        val stream = createDStream(msgs)
        stream.writeToKafka(
          producerConfig,
          s => new ProducerRecord[String, String](localTopic, s),
          Some(new Callback with Serializable {
            override def onCompletion(metadata: RecordMetadata, exception: Exception): Unit = {
              SKRSpec.callbackTriggerCount.incrementAndGet()
            }
          })
        )

        ssc.start()
        eventually(timeout(30.seconds), interval(1.second)) {
          SKRSpec.callbackTriggerCount.get() shouldBe msgs.size
        }
      }
    }
  }

  private def createDStream(seq: Seq[String]): DStream[String] = {
    val q = mutable.Queue.empty[RDD[String]]
    q.enqueue(ssc.sparkContext.makeRDD(seq))
    ssc.queueStream(q)
  }
} 
Example 86
Source File: StreamingExample.scala    From reactiveinflux-spark   with Apache License 2.0 5 votes vote down vote up
package com.pygmalios.reactiveinflux.spark.examples

import com.pygmalios.reactiveinflux._
import com.pygmalios.reactiveinflux.spark._
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.joda.time.DateTime

import scala.concurrent.duration._


object StreamingExample extends App {
  val conf = new SparkConf()
    .setMaster("local[*]")
    .setAppName("Example")
  val ssc = new StreamingContext(conf, Seconds(1))

  val point1 = Point(
    time        = DateTime.now(),
    measurement = "measurement1",
    tags        = Map(
      "tagKey1" -> "tagValue1",
      "tagKey2" -> "tagValue2"),
    fields      = Map(
      "fieldKey1" -> "fieldValue1",
      "fieldKey2" -> 10.7)
  )

  // Provide settings for reactiveinflux
  implicit val params = ReactiveInfluxDbName("example")
  implicit val awaitAtMost = 1.second

  // Create DStream of Influx points
  val queue = new scala.collection.mutable.Queue[RDD[Point]]
  val queueStream: DStream[Point] = ssc.queueStream(queue)

  // Add single RDD with a single Influx point to the DStream
  queue.enqueue(ssc.sparkContext.parallelize(Seq(point1)))

  // Save DStream to Influx
  queueStream.saveToInflux()

  // Start Spark streaming
  ssc.start()
  ssc.awaitTermination()
} 
Example 87
Source File: Example.scala    From reactiveinflux-spark   with Apache License 2.0 5 votes vote down vote up
package com.pygmalios.reactiveinflux.spark.examples

import com.pygmalios.reactiveinflux._
import com.pygmalios.reactiveinflux.spark._
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.joda.time.DateTime

import scala.concurrent.duration._


object Example extends App {
  val conf = new SparkConf()
    .setMaster("local[*]")
    .setAppName("Example")
  val sc = new SparkContext(conf)

  val point1 = Point(
    time        = DateTime.now(),
    measurement = "measurement1",
    tags        = Map(
      "tagKey1" -> "tagValue1",
      "tagKey2" -> "tagValue2"),
    fields      = Map(
      "fieldKey1" -> "fieldValue1",
      "fieldKey2" -> 10.7)
  )

  // Provide settings for reactiveinflux
  implicit val params = ReactiveInfluxDbName("example")
  implicit val awaitAtMost = 1.second

  // Create RDD with Influx point
  val rdd: RDD[Point] = sc.parallelize(Seq(point1))

  // Save RDD to Influx
  rdd.saveToInflux()

  // Stop Spark context
  sc.stop()
} 
Example 88
Source File: PointRDDExtensions.scala    From reactiveinflux-spark   with Apache License 2.0 5 votes vote down vote up
package com.pygmalios.reactiveinflux.spark.extensions

import com.pygmalios.reactiveinflux.spark.config.ReactiveInfluxSparkConfig
import com.pygmalios.reactiveinflux.spark.{RDDExtensions, _}
import com.pygmalios.reactiveinflux.{PointNoTime, ReactiveInfluxDbName}
import org.apache.spark.rdd.RDD
import org.slf4j.LoggerFactory

import scala.concurrent.duration.Duration

private[spark] class PointRDDExtensions[+T <: PointNoTime](rdd: RDD[T]) extends RDDExtensions[T] {
  import PointRDDExtensions._

  override def saveToInflux()(implicit reactiveInfluxDbName: ReactiveInfluxDbName,
                              awaitAtMost: Duration): Unit = {
    // Process each partition separately
    totalBatchCount = 0
    totalPointCount = 0
    rdd.foreachPartition { partition =>
      withInflux { db =>
        val batchSize = ReactiveInfluxSparkConfig(db.config).sparkBatchSize

        // Write points in batches
        var batchCount = 0
        var pointCount = 0
        partition.sliding(batchSize, batchSize).foreach { batch =>
          // Write single batch
          db.write(batch)

          // Statistics for logging
          batchCount += 1
          pointCount += batch.size
        }

        totalBatchCount += batchCount
        totalPointCount += pointCount

        log.debug(s"Partition with $pointCount points written to Influx in $batchCount batches.")
      }
    }
    log.info(s"RDD with ${rdd.partitions.size} partitions and $totalPointCount points written to Influx in $totalBatchCount batches.")
  }
}

object PointRDDExtensions {
  private val log = LoggerFactory.getLogger(classOf[PointRDDExtensions[_]])

  // This makes sense for testing purposes only
  private[reactiveinflux] var totalBatchCount = 0
  private[reactiveinflux] var totalPointCount = 0
} 
Example 89
package com.github.maxpumperla.ml_spark.streaming

import org.apache.spark.mllib.fpm.PrefixSpan
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object MSNBCStreamingExample extends App {

    val conf = new SparkConf()
      .setAppName("MSNBC data initial streaming example")
      .setMaster("local[4]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, batchDuration = Seconds(10))

    val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line =>
      line.split(" ").map(_.toInt)
    }
    val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache()
    val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15)
    val psModel = prefixSpan.run(trainSequences)
    val freqSequences = psModel.freqSequences.map(_.sequence).collect()


    val rawSequences: DStream[String] = ssc.socketTextStream("localhost", 9999)

    val sequences: DStream[Array[Array[Int]]] = rawSequences
      .map(line => line.split(" ").map(_.toInt))
      .map(_.map(Array(_)))

    print(">>> Analysing new batch of data")
    sequences.foreachRDD(
      rdd => rdd.foreach(
        array => {
          println(">>> Sequence: ")
          println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]"))
          freqSequences.count(_.deep == array.deep) match {
            case count if count > 0 => println("is frequent!")
            case _ => println("is not frequent.")
          }
        }
      )
    )
    print(">>> done")

    ssc.start()
    ssc.awaitTermination()

} 
Example 90
package com.github.maxpumperla.ml_spark.streaming

import org.apache.spark.mllib.fpm.{FPGrowth, PrefixSpan}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}


object MSNBCPatternMining extends App {

    val conf = new SparkConf()
      .setAppName("MSNBC.com data pattern mining")
      .setMaster("local[4]")
    val sc = new SparkContext(conf)

    val transactionTest = sc.parallelize(Array(Array("A", "B", "C"), Array("B", "C", "A")))
    val fp = new FPGrowth().setMinSupport(0.8).setNumPartitions(5)
    fp.run(transactionTest)

    val transactions: RDD[Array[Int]] = sc.textFile("./msnbc990928.seq") map { line =>
      line.split(" ").map(_.toInt)
    }

    // NOTE: Caching data is recommended
    val uniqueTransactions: RDD[Array[Int]] = transactions.map(_.distinct).cache()


    val fpGrowth = new FPGrowth().setMinSupport(0.01)
    val model = fpGrowth.run(uniqueTransactions)
    val count = uniqueTransactions.count()

    model.freqItemsets.collect().foreach { itemset =>
      if (itemset.items.length >= 3)
        println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq / count.toDouble )
    }

    val rules = model.generateAssociationRules(confidence = 0.4)
    rules.collect().foreach { rule =>
      println("[" + rule.antecedent.mkString(",") + "=>"
        + rule.consequent.mkString(",") + "]," + (100 * rule.confidence).round / 100.0)
    }

    val frontPageConseqRules = rules.filter(_.consequent.head == 1)
    frontPageConseqRules.count
    frontPageConseqRules.filter(_.antecedent.contains(2)).count
    rules.filter(_.antecedent.contains(7)).count


    val sequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache()

    val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15)
    val psModel = prefixSpan.run(sequences)

    psModel.freqSequences.map(fs => (fs.sequence.length, 1))
      .reduceByKey(_ + _)
      .sortByKey()
      .collect()
      .foreach(fs => println(s"${fs._1}: ${fs._2}"))

    psModel.freqSequences
      .map(fs => (fs.sequence.length, fs))
      .groupByKey()
      .map(group => group._2.reduce((f1, f2) => if (f1.freq > f2.freq) f1 else f2))
      .map(_.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]"))
      .collect.foreach(println)


    psModel.freqSequences
      .map(fs => (fs.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]"), 1))
      .reduceByKey(_ + _)
      .reduce( (f1, f2) => if (f1._2 > f2._2) f1 else f2 )


    psModel.freqSequences.reduce( (f1, f2) => if (f1.freq > f2.freq) f1 else f2 )
    psModel.freqSequences.filter(_.sequence.length == 1).map(_.sequence.toString).collect.foreach(println)

    psModel.freqSequences.collect().foreach {
      freqSequence =>
        println(
          freqSequence.sequence.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]") + ", " + freqSequence.freq
        )
    }
} 
Example 91
package com.github.maxpumperla.ml_spark.streaming

import org.apache.spark.mllib.fpm.PrefixSpan
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object MSNBCStreamingAdvanced extends App {

    val conf = new SparkConf()
      .setAppName("MSNBC data initial streaming example")
      .setMaster("local[4]")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, batchDuration = Seconds(10))

    val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line =>
      line.split(" ").map(_.toInt)
    }
    val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache()
    val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15)
    val psModel = prefixSpan.run(trainSequences)
    val freqSequences = psModel.freqSequences.map(_.sequence).collect()


    val rawEvents: DStream[String] = ssc.socketTextStream("localhost", 9999)

    val events: DStream[(Int, String)] = rawEvents.map(line => line.split(": "))
        .map(kv => (kv(0).toInt, kv(1)))

    val countIds = events.map(e => (e._1, 1))
    val counts: DStream[(Int, Int)] = countIds.reduceByKey(_ + _)

    def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
      Some(runningCount.getOrElse(0) + newValues.sum)
    }
    val runningCounts = countIds.updateStateByKey[Int](updateFunction _)

    val duration = Seconds(20)
    val slide = Seconds(10)

    val rawSequences: DStream[(Int, String)] = events
      .reduceByKeyAndWindow((v1: String, v2: String) => v1 + " " + v2, duration, slide)

    val sequences: DStream[Array[Array[Int]]] = rawSequences.map(_._2)
      .map(line => line.split(" ").map(_.toInt))
      .map(_.map(Array(_)))


    print(">>> Analysing new batch of data")
    sequences.foreachRDD(
      rdd => rdd.foreach(
        array => {
          println(">>> Sequence: ")
          println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]"))
          freqSequences.count(_.deep == array.deep) match {
            case count if count > 0 => println("is frequent!")
            case _ => println("is not frequent.")
          }
        }
      )
    )
    print(">>> done")

    ssc.start()
    ssc.awaitTermination()
} 
Example 92
Source File: GraphFromRdd.scala    From Mastering-Machine-Learning-with-Spark-2.x   with MIT License 5 votes vote down vote up
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}



object GraphFromRdd extends App {

     val conf = new SparkConf()
       .setAppName("RDD graph")
       .setMaster("local[4]")
     val sc = new SparkContext(conf)

     val vertices: RDD[(VertexId, String)] = sc.parallelize(
       Array((1L, "Anne"),
         (2L, "Bernie"),
         (3L, "Chris"),
         (4L, "Don"),
         (5L, "Edgar")))

     val edges: RDD[Edge[String]] = sc.parallelize(
       Array(Edge(1L, 2L, "likes"),
         Edge(2L, 3L, "trusts"),
         Edge(3L, 4L, "believes"),
         Edge(4L, 5L, "worships"),
         Edge(1L, 3L, "loves"),
         Edge(4L, 1L, "dislikes")))

     val friendGraph: Graph[String, String] = Graph(vertices, edges)
     friendGraph.vertices.collect.foreach(println)

     friendGraph.edges.map( e => e.srcId > e.dstId ).count()

     val mappedEdgeGraph: Graph[String, Boolean] = friendGraph.mapEdges( e => e.srcId > e.dstId )

     val inDegVertexRdd: VertexRDD[Int] = friendGraph.aggregateMessages[Int](
       sendMsg = ec => ec.sendToDst(1),
       mergeMsg = (msg1, msg2) => msg1+msg2
     )
     assert(inDegVertexRdd.collect.deep == friendGraph.inDegrees.collect.deep)

     friendGraph.staticPageRank(numIter = 10).vertices.collect.foreach(println)
     friendGraph.pageRank(tol = 0.0001, resetProb = 0.15)

} 
Example 93
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
//import org.graphframes._


object GraphFramesExample extends App {

    val conf = new SparkConf()
      .setAppName("RDD graph")
      .setMaster("local[4]")
    val sc = new SparkContext(conf)


    val vertices: RDD[(VertexId, String)] = sc.parallelize(
      Array((1L, "Anne"),
        (2L, "Bernie"),
        (3L, "Chris"),
        (4L, "Don"),
        (5L, "Edgar")))

    val edges: RDD[Edge[String]] = sc.parallelize(
      Array(Edge(1L, 2L, "likes"),
        Edge(2L, 3L, "trusts"),
        Edge(3L, 4L, "believes"),
        Edge(4L, 5L, "worships"),
        Edge(1L, 3L, "loves"),
        Edge(4L, 1L, "dislikes")))

    val friendGraph: Graph[String, String] = Graph(vertices, edges)

//    val friendGraphFrame = GraphFrame.fromGraphX(friendGraph)
//
//    friendGraphFrame.find("(v1)-[e1]->(v2); (v2)-[e2]->(v3)").filter(
//      "e1.attr = 'trusts' OR v3.attr = 'Chris'"
//    ).collect.foreach(println)

} 
Example 94
Source File: GephiApp.scala    From Mastering-Machine-Learning-with-Spark-2.x   with MIT License 5 votes vote down vote up
package com.github.maxpumperla.ml_spark.graphs

import java.io.PrintWriter

import com.github.maxpumperla.ml_spark.utils.Gephi.toGexf
import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

object GephiApp extends App {

    val conf = new SparkConf()
      .setAppName("Gephi Test Writer")
      .setMaster("local[4]")
    val sc = new SparkContext(conf)

    val vertices: RDD[(VertexId, String)] = sc.parallelize(
      Array((1L, "Anne"),
        (2L, "Bernie"),
        (3L, "Chris"),
        (4L, "Don"),
        (5L, "Edgar")))

    val edges: RDD[Edge[String]] = sc.parallelize(
      Array(Edge(1L, 2L, "likes"),
        Edge(2L, 3L, "trusts"),
        Edge(3L, 4L, "believes"),
        Edge(4L, 5L, "worships"),
        Edge(1L, 3L, "loves"),
        Edge(4L, 1L, "dislikes")))

    val graph: Graph[String, String] = Graph(vertices, edges)

    val pw = new PrintWriter("./graph.gexf")
    pw.write(toGexf(graph))
    pw.close()
} 
Example 95
Source File: DCollectionGenProperties.scala    From kontextfrei   with Apache License 2.0 5 votes vote down vote up
package com.danielwestheide.kontextfrei.scalatest

import org.apache.spark.rdd.RDD
import org.scalatest.PropSpecLike
import org.scalatest.prop.GeneratorDrivenPropertyChecks

trait DCollectionGenProperties[DColl[_]]
    extends PropSpecLike
    with GeneratorDrivenPropertyChecks
    with DCollectionGen
    with KontextfreiSpec[DColl] {

  property("Can get arbitrary DCollections") {
    forAll { xs: DColl[String] =>
      ops.count(xs) === ops.collectAsArray(xs).length
    }
  }

}

class DCollectionGenStreamSpec
    extends DCollectionGenProperties[Stream]
    with StreamSpec
class DCollectionGenRDDSpec extends DCollectionGenProperties[RDD] with RDDSpec 
Example 96
Source File: CollectingInstancesProperties.scala    From kontextfrei   with Apache License 2.0 5 votes vote down vote up
package com.danielwestheide.kontextfrei.scalatest

import org.apache.spark.rdd.RDD
import org.scalatest.enablers.Collecting
import org.scalatest.{Inspectors, PropSpec, PropSpecLike}
import org.scalatest.prop.GeneratorDrivenPropertyChecks

trait CollectingInstancesProperties[DColl[_]]
    extends PropSpecLike
    with GeneratorDrivenPropertyChecks
    with KontextfreiSpec[DColl]
    with CollectingInstances {

  property("There is a Collecting instance for DCollection") {
    forAll { (xs: List[String]) =>
      val dcoll = ops.unit(xs)
      Inspectors.forAll(dcoll) { x =>
        assert(xs.contains(x))
      }
    }
  }

  property(
    "Collecting nature of DCollection returns the original size of the input sequence") {
    forAll { (xs: List[String]) =>
      val dcoll = ops.unit(xs)
      assert(
        implicitly[Collecting[String, DColl[String]]]
          .sizeOf(dcoll) === xs.size)
    }
  }

  property(
    "Collecting nature of DCollection returns the Some loneElement if input sequence has exactly one element") {
    forAll { (x: String) =>
      val dcoll = ops.unit(List(x))
      assert(
        implicitly[Collecting[String, DColl[String]]]
          .loneElementOf(dcoll) === Some(x))
    }
  }

  property(
    "Collecting nature of DCollection returns the None as loneElement if input sequence as more than one element") {
    forAll { (xs: List[String]) =>
      whenever(xs.size > 1) {
        val dcoll = ops.unit(xs)
        assert(
          implicitly[Collecting[String, DColl[String]]]
            .loneElementOf(dcoll)
            .isEmpty)
      }
    }
  }

  property(
    "Collecting nature of DCollection returns the None as loneElement if input sequence is empty") {
    val dcoll = ops.unit(List.empty[String])
    assert(
      implicitly[Collecting[String, DColl[String]]]
        .loneElementOf(dcoll)
        .isEmpty)
  }

}

class CollectionInstancesStreamSpec
    extends CollectingInstancesProperties[Stream]
    with StreamSpec

class CollectionInstancesRDDSpec
    extends CollectingInstancesProperties[RDD]
    with RDDSpec 
Example 97
Source File: RDDPairFunctions.scala    From kontextfrei   with Apache License 2.0 5 votes vote down vote up
package com.danielwestheide.kontextfrei.rdd

import com.danielwestheide.kontextfrei.DCollectionPairFunctions
import org.apache.spark.Partitioner
import org.apache.spark.rdd.RDD

import scala.collection.Map
import scala.reflect.ClassTag

private[kontextfrei] trait RDDPairFunctions
    extends DCollectionPairFunctions[RDD] { this: RDDBase =>

  override final def cogroup[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Iterable[B], Iterable[C]))] = withSite(x) {
    _.cogroup(y)
  }

  override final def values[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[B] = withSite(x) {
    _.values
  }

  override final def keys[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[A] = withSite(x) {
    _.keys
  }

  override final def leftOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (B, Option[C]))] = withSite(x) {
    _.leftOuterJoin(y)
  }

  override final def rightOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], C))] = withSite(x) {
    _.rightOuterJoin(y)
  }

  override final def fullOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], Option[C]))] = withSite(x) {
    _.fullOuterJoin(y)
  }

  override final def mapValues[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(f: B => C): RDD[(A, C)] = withSite(x) {
    _.mapValues(f)
  }

  override final def flatMapValues[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(f: B => TraversableOnce[C]): RDD[(A, C)] = withSite(x) {
    _.flatMapValues(f)
  }

  override final def reduceByKey[A: ClassTag, B: ClassTag](xs: RDD[(A, B)])(
      f: (B, B) => B): RDD[(A, B)] = withSite(xs) {
    _.reduceByKey(f)
  }

  override final def foldByKey[A: ClassTag, B: ClassTag](
      xs: RDD[(A, B)])(zeroValue: B, f: (B, B) => B): RDD[(A, B)] = withSite(xs) {
    _.foldByKey(zeroValue)(f)
  }

  override final def aggregateByKey[A: ClassTag, B: ClassTag, C: ClassTag](
      xs: RDD[(A, B)])(zeroValue: C)(seqOp: (C, B) => C,
                                     combOp: (C, C) => C): RDD[(A, C)] = withSite(xs) {
    _.aggregateByKey(zeroValue)(seqOp, combOp)
  }

  override final def combineByKey[A: ClassTag, B: ClassTag, C: ClassTag](
      xs: RDD[(A, B)])(createCombiner: B => C)(
      mergeValue: (C, B) => C,
      mergeCombiners: (C, C) => C): RDD[(A, C)] = withSite(xs) {
    _.combineByKey(createCombiner, mergeValue, mergeCombiners)
  }

  override final def countByKey[A: ClassTag, B: ClassTag](
      xs: RDD[(A, B)]): Map[A, Long] = withSite(xs) {
    _.countByKey()
  }

  override final def collectAsMap[A: ClassTag, B: ClassTag](
      xs: RDD[(A, B)]): Map[A, B] = withSite(xs) {
    _.collectAsMap()
  }

  override final def partitionBy[A: ClassTag, B: ClassTag](
      xs: RDD[(A, B)])(partitioner: Partitioner): RDD[(A, B)] = withSite(xs) {
    _.partitionBy(partitioner)
  }
} 
Example 98
Source File: RDDOrderedFunctions.scala    From kontextfrei   with Apache License 2.0 5 votes vote down vote up
package com.danielwestheide.kontextfrei.rdd
import com.danielwestheide.kontextfrei.DCollectionOrderedFunctions
import org.apache.spark.Partitioner
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

private[kontextfrei] trait RDDOrderedFunctions
    extends DCollectionOrderedFunctions[RDD] { this: RDDBase =>

  override final def sortByKey[A: ClassTag: Ordering, B: ClassTag](
      x: RDD[(A, B)])(ascending: Boolean): RDD[(A, B)] = withSite(x) {
    _.sortByKey(ascending)
  }

  override final def sortByKeyWithNumPartitions[A: ClassTag: Ordering,
                                                B: ClassTag](
      x: RDD[(A, B)])(ascending: Boolean, numPartitions: Int): RDD[(A, B)] = withSite(x) {
    _.sortByKey(ascending, numPartitions)
  }

  override final def filterByRange[A: ClassTag: Ordering, B: ClassTag](
      x: RDD[(A, B)])(lower: A, upper: A): RDD[(A, B)] = withSite(x) {
    _.filterByRange(lower, upper)
  }

  override def repartitionAndSortWithinPartitions[
      A: ClassTag: Ordering,
      B: ClassTag](
      x: RDD[(A, B)])(
      partitioner: Partitioner)
    : RDD[(A, B)] = withSite(x) {
    _.repartitionAndSortWithinPartitions(partitioner)
  }
} 
Example 99
Source File: RDDCollectionOpsSpec.scala    From kontextfrei   with Apache License 2.0 5 votes vote down vote up
package com.danielwestheide.kontextfrei

import com.danielwestheide.kontextfrei.rdd.RDDOpsSupport
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.scalatest.BeforeAndAfterAll

class RDDCollectionOpsSpec
    extends DCollectionOpsProperties[RDD]
    with BeforeAndAfterAll {
  implicit val sparkContext = new SparkContext("local[2]", "dcollection-spec")
  override implicit val ops: DCollectionOps[RDD] =
    RDDOpsSupport.rddCollectionOps
  override protected def afterAll(): Unit = {
    sparkContext.stop()
  }
} 
Example 100
Source File: TSNEHelper.scala    From spark-tsne   with Apache License 2.0 5 votes vote down vote up
package com.github.saurfang.spark.tsne

import breeze.linalg._
import breeze.stats._
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
import org.apache.spark.rdd.RDD

object TSNEHelper {
  // p_ij = (p_{i|j} + p_{j|i}) / 2n
  def computeP(p_ji: CoordinateMatrix, n: Int): RDD[(Int, Iterable[(Int, Double)])] = {
    p_ji.entries
      .flatMap(e => Seq(
      ((e.i.toInt, e.j.toInt), e.value),
      ((e.j.toInt, e.i.toInt), e.value)
    ))
      .reduceByKey(_ + _) // p + p'
      .map{case ((i, j), v) => (i, (j, math.max(v / 2 / n, 1e-12))) } // p / 2n
      .groupByKey()
  }

  
  def update(Y: DenseMatrix[Double],
             dY: DenseMatrix[Double],
             iY: DenseMatrix[Double],
             gains: DenseMatrix[Double],
             iteration: Int,
             param: TSNEParam): DenseMatrix[Double] = {
    import param._
    val momentum = if (iteration <= t_momentum) initial_momentum else final_momentum
    gains.foreachPair {
      case ((i, j), old_gain) =>
        val new_gain = math.max(min_gain,
          if ((dY(i, j) > 0.0) != (iY(i, j) > 0.0))
            old_gain + 0.2
          else
            old_gain * 0.8
        )
        gains.update(i, j, new_gain)

        val new_iY = momentum * iY(i, j) - eta * new_gain * dY(i, j)
        iY.update(i, j, new_iY)

        Y.update(i, j, Y(i, j) + new_iY) // Y += iY
    }
    val t_Y: DenseVector[Double] = mean(Y(::, *)).t
    val y_sub = Y(*, ::)
    Y := y_sub - t_Y
  }
} 
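A small sketch of the symmetrisation step above (hypothetical entries, assuming an active SparkContext sc): computeP turns the conditional affinities p_{j|i} into the joint p_ij = (p_{i|j} + p_{j|i}) / 2n, grouped by row index.

import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
// Hypothetical sparse conditional affinities for n = 3 points.
val entries = sc.parallelize(Seq(
  MatrixEntry(0, 1, 0.4), MatrixEntry(1, 0, 0.2), MatrixEntry(0, 2, 0.1)
))
val p = TSNEHelper.computeP(new CoordinateMatrix(entries), n = 3)
// p is an RDD[(Int, Iterable[(Int, Double)])]; e.g. row 0 holds (1, 0.1) and (2, ~0.0167).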
Example 101
Source File: LocalRunner.scala    From spark-betweenness   with Apache License 2.0 5 votes vote down vote up
package com.centrality.kBC

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.VertexId
import org.apache.spark.rdd.RDD

object MainRunner 
{
  def main(args: Array[String])
  {
    // Create spark context
    val appName="kBC"
    val sparkMode="local"
    val conf = new SparkConf().setAppName(appName).setMaster(sparkMode);
    val sc = new SparkContext(conf);
    
    // Create sample graph
    //
    // Create an RDD for vertices
    val users: RDD[(VertexId, (String, String))] =
    sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
                         (5L, ("franklin", "prof")), (2L, ("istoica", "prof"))))
    // Create an RDD for edges
    val relationships: RDD[Edge[String]] =
      sc.parallelize(Array(Edge(3L, 7L, "collab"),    Edge(5L, 3L, "advisor"),
                           Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi")))
    // Define a default user in case there are relationship with missing user
    val defaultUser = ("John Doe", "Missing")
    // Build the initial Graph
    val graph = Graph(users, relationships, defaultUser)
    
    val kBCGraph = 
      KBetweenness.run(graph, 3)
  }
} 
Example 102
Source File: TiRDD.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.tispark

import com.pingcap.tikv._
import com.pingcap.tikv.exception.TiInternalException
import com.pingcap.tikv.meta.TiDAGRequest
import com.pingcap.tikv.types.Converter
import com.pingcap.tikv.util.RangeSplitter
import com.pingcap.tikv.util.RangeSplitter.RegionTask
import com.pingcap.tispark.{TiPartition, TiTableReference}
import org.apache.spark.Partition
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow

import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.collection.mutable.ListBuffer

abstract class TiRDD(
    val dagRequest: TiDAGRequest,
    val physicalId: Long,
    val tiConf: TiConfiguration,
    val tableRef: TiTableReference,
    @transient private val session: TiSession,
    @transient private val sparkSession: SparkSession)
    extends RDD[InternalRow](sparkSession.sparkContext, Nil) {

  private lazy val partitionPerSplit = tiConf.getPartitionPerSplit

  protected def checkTimezone(): Unit = {
    if (!tiConf.getLocalTimeZone.equals(Converter.getLocalTimezone)) {
      throw new TiInternalException(
        "timezone are different! driver: " + tiConf.getLocalTimeZone + " executor:" + Converter.getLocalTimezone +
          " please set user.timezone in spark.driver.extraJavaOptions and spark.executor.extraJavaOptions")
    }
  }

  override protected def getPartitions: Array[Partition] = {
    val keyWithRegionTasks = RangeSplitter
      .newSplitter(session.getRegionManager)
      .splitRangeByRegion(dagRequest.getRangesByPhysicalId(physicalId), dagRequest.getStoreType)

    val hostTasksMap = new mutable.HashMap[String, mutable.Set[RegionTask]]
      with mutable.MultiMap[String, RegionTask]

    var index = 0
    val result = new ListBuffer[TiPartition]
    for (task <- keyWithRegionTasks) {
      hostTasksMap.addBinding(task.getHost, task)
      val tasks = hostTasksMap(task.getHost)
      if (tasks.size >= partitionPerSplit) {
        result.append(new TiPartition(index, tasks.toSeq, sparkContext.applicationId))
        index += 1
        hostTasksMap.remove(task.getHost)
      }

    }
    // add rest
    for (tasks <- hostTasksMap.values) {
      result.append(new TiPartition(index, tasks.toSeq, sparkContext.applicationId))
      index += 1
    }
    result.toArray
  }

  override protected def getPreferredLocations(split: Partition): Seq[String] =
    split.asInstanceOf[TiPartition].tasks.head.getHost :: Nil
} 
Example 103
Source File: BasicDataSourceSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.datasource

import com.pingcap.tikv.exception.TiBatchWriteException
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

class BasicDataSourceSuite extends BaseDataSourceTest("test_datasource_basic") {
  private val row1 = Row(null, "Hello")
  private val row2 = Row(2, "TiDB")
  private val row3 = Row(3, "Spark")
  private val row4 = Row(4, null)

  private val schema = StructType(
    List(StructField("i", IntegerType), StructField("s", StringType)))

  override def beforeAll(): Unit = {
    super.beforeAll()

    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(null, 'Hello'), (2, 'TiDB')")
  }

  test("Test Select") {
    if (!supportBatchWrite) {
      cancel
    }

    testTiDBSelect(Seq(row1, row2))
  }

  test("Test Write Append") {
    if (!supportBatchWrite) {
      cancel
    }

    val data: RDD[Row] = sc.makeRDD(List(row3, row4))
    val df = sqlContext.createDataFrame(data, schema)

    df.write
      .format("tidb")
      .options(tidbOptions)
      .option("database", database)
      .option("table", table)
      .mode("append")
      .save()

    testTiDBSelect(Seq(row1, row2, row3, row4))
  }

  test("Test Write Overwrite") {
    if (!supportBatchWrite) {
      cancel
    }

    val data: RDD[Row] = sc.makeRDD(List(row3, row4))
    val df = sqlContext.createDataFrame(data, schema)

    val caught = intercept[TiBatchWriteException] {
      df.write
        .format("tidb")
        .options(tidbOptions)
        .option("database", database)
        .option("table", table)
        .mode("overwrite")
        .save()
    }

    assert(
      caught.getMessage
        .equals("SaveMode: Overwrite is not supported. TiSpark only support SaveMode.Append."))
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
} 
Example 104
Source File: UpperCaseColumnNameSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class UpperCaseColumnNameSuite
    extends BaseDataSourceTest("test_datasource_uppser_case_column_name") {

  private val row1 = Row(1, 2)

  private val schema = StructType(
    List(StructField("O_ORDERKEY", IntegerType), StructField("O_CUSTKEY", IntegerType)))

  override def beforeAll(): Unit = {
    super.beforeAll()

    dropTable()
    jdbcUpdate(s"""
                  |CREATE TABLE $dbtable (O_ORDERKEY INTEGER NOT NULL,
                  |                       O_CUSTKEY INTEGER NOT NULL);
       """.stripMargin)
  }

  test("Test insert upper case column name") {
    if (!supportBatchWrite) {
      cancel
    }

    val data: RDD[Row] = sc.makeRDD(List(row1))
    val df = sqlContext.createDataFrame(data, schema)
    df.write
      .format("tidb")
      .options(tidbOptions)
      .option("database", database)
      .option("table", table)
      .mode("append")
      .save()
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
} 
Example 105
Source File: MissingParameterSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

class MissingParameterSuite extends BaseDataSourceTest("test_datasource_missing_parameter") {
  private val row1 = Row(null, "Hello")

  private val schema = StructType(
    List(StructField("i", IntegerType), StructField("s", StringType)))

  test("Missing parameter: database") {
    if (!supportBatchWrite) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")

    val caught = intercept[IllegalArgumentException] {
      val rows = row1 :: Nil
      val data: RDD[Row] = sc.makeRDD(rows)
      val df = sqlContext.createDataFrame(data, schema)
      df.write
        .format("tidb")
        .options(tidbOptions)
        .option("table", table)
        .mode("append")
        .save()
    }
    assert(
      caught.getMessage
        .equals("requirement failed: Option 'database' is required."))
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
} 
Example 106
Source File: OnlyOnePkSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class OnlyOnePkSuite extends BaseDataSourceTest("test_datasource_only_one_pk") {
  private val row3 = Row(3)
  private val row4 = Row(4)

  private val schema = StructType(List(StructField("i", IntegerType)))

  override def beforeAll(): Unit = {
    super.beforeAll()

    dropTable()
    jdbcUpdate(s"create table $dbtable(i int primary key)")
  }

  test("Test Write Append") {
    if (!supportBatchWrite) {
      cancel
    }

    val data: RDD[Row] = sc.makeRDD(List(row3, row4))
    val df = sqlContext.createDataFrame(data, schema)

    df.write
      .format("tidb")
      .options(tidbOptions)
      .option("database", database)
      .option("table", table)
      .mode("append")
      .save()

    testTiDBSelect(Seq(row3, row4))
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
} 
Example 107
Source File: WriteDDLConflictSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.concurrency

import com.pingcap.tikv.exception.TiBatchWriteException
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

class WriteDDLConflictSuite extends ConcurrencyTest {
  test("write ddl conflict using TableLock") {
    if (!supportBatchWrite) {
      cancel
    }

    if (!isEnableTableLock) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(4, 'null')")

    doBatchWriteInBackground(Map("useTableLock" -> "true"))

    Thread.sleep(sleepBeforeQuery)

    val caught = intercept[java.sql.SQLException] {
      jdbcUpdate(s"alter table $dbtable ADD Email varchar(255)")
    }
    assert(
      caught.getMessage
        .startsWith("Table 'test_concurrency_write_read' was locked in WRITE LOCAL by server"))
  }

  test("write ddl conflict using SchemaVersionCheck") {
    if (!supportBatchWrite) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(4, 'null')")

    new Thread(new Runnable {
      override def run(): Unit = {
        Thread.sleep(sleepBeforeQuery)
        jdbcUpdate(s"alter table $dbtable ADD Email varchar(255)")
      }
    }).start()

    val caught = intercept[TiBatchWriteException] {
      val data: RDD[Row] = sc.makeRDD(List(row1, row2, row3))
      val df = sqlContext.createDataFrame(data, schema)
      df.write
        .format("tidb")
        .options(tidbOptions)
        .option("database", database)
        .option("table", table)
        .option("sleepAfterPrewriteSecondaryKey", sleepBeforeQuery * 2)
        .option("useTableLock", "false")
        .mode("append")
        .save()
    }

    assert(caught.getMessage.equals("schema has changed during prewrite!"))
  }
} 
Example 108
Source File: WriteDDLNotConflictSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.concurrency

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

class WriteDDLNotConflictSuite extends ConcurrencyTest {
  test("ddl after GetCommitTS: add column") {
    doTest(s"alter table $dbtable ADD Email varchar(255)")
  }

  test("ddl after GetCommitTS: delete column") {
    doTest(s"alter table $dbtable drop column s")
  }

  test("ddl after GetCommitTS: rename column") {
    doTest(s"alter table $dbtable CHANGE s s2 varchar(128)")
  }

  test("ddl after GetCommitTS: change column type") {
    doTest(s"alter table $dbtable CHANGE i i BIGINT")
  }

  private def doTest(ddl: String): Unit = {
    if (!supportBatchWrite) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(4, 'null')")

    new Thread(new Runnable {
      override def run(): Unit = {
        Thread.sleep(sleepBeforeQuery)
        jdbcUpdate(ddl)
      }
    }).start()

    val data: RDD[Row] = sc.makeRDD(List(row1, row2, row3))
    val df = sqlContext.createDataFrame(data, schema)
    df.write
      .format("tidb")
      .options(tidbOptions)
      .option("database", database)
      .option("table", table)
      .option("sleepAfterGetCommitTS", sleepBeforeQuery * 2)
      .option("useTableLock", "false")
      .mode("append")
      .save()

    compareSelect()
  }
} 
Example 109
Source File: WriteWriteConflictSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.concurrency

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

class WriteWriteConflictSuite extends ConcurrencyTest {
  test("write write conflict using TableLock & jdbc") {
    if (!supportBatchWrite) {
      cancel
    }

    if (!isEnableTableLock) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(4, 'null')")

    doBatchWriteInBackground(Map("useTableLock" -> "true"))

    Thread.sleep(sleepBeforeQuery)

    val caught = intercept[java.sql.SQLException] {
      jdbcUpdate(s"insert into $dbtable values(5, 'test')")
    }
    assert(
      caught.getMessage
        .startsWith("Table 'test_concurrency_write_read' was locked in WRITE LOCAL by server"))
  }

  test("write write conflict using TableLock & tispark") {
    if (!supportBatchWrite) {
      cancel
    }

    if (!isEnableTableLock) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(4, 'null')")

    doBatchWriteInBackground(Map("useTableLock" -> "true"))

    Thread.sleep(sleepBeforeQuery)

    val caught = intercept[java.sql.SQLException] {
      val data: RDD[Row] = sc.makeRDD(List(row5))
      val df = sqlContext.createDataFrame(data, schema)
      df.write
        .format("tidb")
        .options(tidbOptions)
        .option("database", database)
        .option("table", table)
        .option("useTableLock", "true")
        .mode("append")
        .save()
    }
    assert(
      caught.getMessage
        .startsWith("Table 'test_concurrency_write_read' was locked in WRITE LOCAL by server"))
  }
} 
Example 110
Source File: LockTimeoutSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.ttl

import com.pingcap.tikv.TTLManager
import com.pingcap.tikv.exception.GrpcException
import com.pingcap.tispark.datasource.BaseDataSourceTest
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

class LockTimeoutSuite extends BaseDataSourceTest("test_lock_timeout") {
  private val row1 = Row(1, "Hello")

  private val schema = StructType(
    List(StructField("i", IntegerType), StructField("s", StringType)))

  override def beforeAll(): Unit = {
    super.beforeAll()
    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
  }

  test("Test Lock TTL Timeout") {
    if (!supportTTLUpdate) {
      cancel
    }

    val seconds = 1000
    val sleep1 = TTLManager.MANAGED_LOCK_TTL + 10 * seconds
    val sleep2 = TTLManager.MANAGED_LOCK_TTL + 15 * seconds

    val data: RDD[Row] = sc.makeRDD(List(row1))
    val df = sqlContext.createDataFrame(data, schema)

    new Thread(new Runnable {
      override def run(): Unit = {
        Thread.sleep(sleep1)
        queryTiDBViaJDBC(s"select * from $dbtable")
      }
    }).start()

    val grpcException = intercept[GrpcException] {
      df.write
        .format("tidb")
        .options(tidbOptions)
        .option("database", database)
        .option("table", table)
        .option("sleepAfterPrewritePrimaryKey", sleep2)
        .mode("append")
        .save()
    }

    assert(grpcException.getMessage.equals("retry is exhausted."))
    assert(grpcException.getCause.getMessage.startsWith("Txn commit primary key failed"))
    assert(
      grpcException.getCause.getCause.getMessage.startsWith(
        "Key exception occurred and the reason is retryable: \"Txn(Mvcc(TxnLockNotFound"))
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
} 
Example 111
Source File: EmployeeRelationship.scala    From spark-dev   with GNU General Public License v3.0 5 votes vote down vote up
package examples.graphx

import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.rdd.RDD
import org.apache.spark.graphx.{ Edge, Graph }


object EmployeeRelationship {
	def main(args: Array[String]): Unit = {
		// vertex format: vertex_id, data
		val vertexArray = Array(
			(1L, ("John", "Software Developer")),
			(2L, ("Robert", "Technical Leader")),
			(3L, ("Charlie", "Software Architect")),
			(4L, ("David", "Software Developer")),
			(5L, ("Edward", "Software Development Manager")),
			(6L, ("Francesca", "Software Development Manager")))

		// edge format: from_vertex_id, to_vertex_id, data
		val edgeArray = Array(
			Edge(2L, 1L, "Technical Mentor"),
			Edge(2L, 4L, "Technical Mentor"),
			Edge(3L, 2L, "Collaborator"),
			Edge(6L, 3L, "Team Member"),
			Edge(4L, 1L, "Peers"),
			Edge(5L, 2L, "Team Member"),
			Edge(5L, 3L, "Team Member"),
			Edge(5L, 6L, "Peers"))

		val sc = new SparkContext(new SparkConf().setAppName("EmployeeRelationshipJob"))

		val vertexRDD: RDD[(Long, (String, String))] = sc.parallelize(vertexArray)

		val edgeRDD: RDD[Edge[String]] = sc.parallelize(edgeArray)

		val graph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD)

		// Vanilla query
		println(">>> Showing the names of people who are Software Developers")
		graph.vertices.filter { case (id, (name, designation)) => designation.equals("Software Developer") }
			.collect()
			.foreach { case (id, (name, designation)) => println(s"... Name: $name, Designation: $designation") }

		// Connection analysis
		println(">>> People connected to Robert (Technical Leader) -> ")
		graph.triplets.filter(_.srcId == 2).collect()
			.foreach { item => println("... " + item.dstAttr._1 + ", " + item.dstAttr._2) }

		println(">>> Robert (Technical Leader) connected to -> ")
		graph.triplets.filter(_.dstId == 2).collect()
			.foreach { item => println("... " + item.srcAttr._1 + ", " + item.srcAttr._2) }

		println(">>> Technical Mentoring Analysis -> ")
		graph.triplets.filter(_.attr.equals("Technical Mentor")).collect()
			.foreach { item => println("... " + item.srcAttr._1 + " mentoring " + item.dstAttr._1) }
	}
} 
Example 112
Source File: PurchaseLogAnalysis.scala    From spark-dev   with GNU General Public License v3.0 5 votes vote down vote up
package examples

import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.rdd.RDD


object PurchaseLogAnalysis {
	def main(args: Array[String]): Unit = {

		val ctx = new SparkContext(new SparkConf().setAppName("PurchaseAnalysisJob"))

		val badPkts = ctx.accumulator(0, "Bad Packets")
		val zeroValueSales = ctx.accumulator(0, "Zero Value Sales")
		val missingFields = ctx.accumulator(0, "Missing Fields")
		val blankLines = ctx.accumulator(0, "Blank Lines")

		ctx.textFile("file:/media/linux-1/spark-dev/data/purchases.log", 4)
			.foreach { line =>

				if (line.length() == 0) blankLines += 1
				else if (line.contains("Bad data packet")) badPkts += 1
				else {
					val fields = line.split("\t")

					if (fields.length != 4) missingFields += 1
					else if (fields(3).toFloat == 0) zeroValueSales += 1
				}
			}

		println("Purchase Log Analysis Counters:")
		println(s"\tBad Data Packets=${badPkts.value}")
		println(s"\tZero Value Sales=${zeroValueSales.value}")
		println(s"\tMissing Fields=${missingFields.value}")
		println(s"\tBlank Lines=${blankLines.value}")
	}
} 
Example 113
Source File: TestBroadcastVariables.scala    From spark-dev   with GNU General Public License v3.0 5 votes vote down vote up
package examples

import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

import scala.io.Source
import scala.util.{ Try, Success, Failure }
import scala.collection.mutable.Map


// NOTE: the enclosing object declaration was collapsed in the original
// listing; it is restored here. The driver code that broadcasts the loaded
// country -> capital map is still omitted from this excerpt.
object TestBroadcastVariables {

	def loadCSVFile(filename: String): Option[Map[String, String]] = {
		val countries = Map[String, String]()

		Try {
			val bufferedSource = Source.fromFile(filename)

			for (line <- bufferedSource.getLines) {
				val Array(country, capital) = line.split(",").map(_.trim)
				countries += country -> capital
			}

			bufferedSource.close()
			countries
		}.toOption
	}
}
Example 114
Source File: TestAccumulators.scala    From spark-dev   with GNU General Public License v3.0 5 votes vote down vote up
package examples

import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.rdd.RDD


// NOTE: the enclosing object, the method signature, and the accumulator
// declarations were collapsed in the original listing; the object/method
// names and accumulator labels below are assumed reconstructions.
object TestAccumulators {

	def usingAccumulators(sc: SparkContext, rdd: RDD[String]): Unit = {
		val totalLines = sc.accumulator(0, "Total Lines")
		val errorLines = sc.accumulator(0, "Error Lines")
		val warnLines = sc.accumulator(0, "Warning Lines")
		val infoLines = sc.accumulator(0, "Info Lines")

		rdd.foreach { line =>
			if (line.length() > 0) totalLines += 1
			if (line.startsWith("error:")) errorLines += 1
			else if (line.startsWith("info:")) infoLines += 1
			else if (line.startsWith("warn:")) warnLines += 1
		}

		println(s">>> [Using Accumulators] Total: ${totalLines.value}, Error: ${errorLines.value}, Warnings: ${warnLines.value}, Info: ${infoLines.value}")
	}

	def usingRDDTransformations(sc: SparkContext, rdd: RDD[String]): Unit = {
		val errorLines = rdd.filter(_.startsWith("error:")).count()
		val infoLines = rdd.filter(_.startsWith("info:")).count()
		val warnLines = rdd.filter(_.startsWith("warn:")).count()

		println(s">>> [Using RDD Transformations] Error: $errorLines, Warnings: $warnLines, Info: $infoLines")
	}
} 
Example 115
Source File: TestJoins.scala    From spark-dev   with GNU General Public License v3.0 5 votes vote down vote up
package examples

import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner }
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import scala.Iterator



object TestJoins {
	def main(args: Array[String]): Unit = {
		val sc = new SparkContext(new SparkConf().setAppName("TestJoinJob"))

		val x = sc.parallelize(List((1, 2), (1, 3), (2, 3), (2, 4))).partitionBy(new HashPartitioner(2)).cache
		val y = sc.parallelize(List((2, 5), (2, 6))).partitionBy(new HashPartitioner(2)).cache

		inspectRDD(x)
		inspectRDD(y)

		println(">>> joining x with y")
		val joinRDD = x.join(y).cache
		joinRDD.collect().foreach(println)
		inspectRDD(joinRDD)

		println(">>> left outer join of x with y")
		val leftJoin = x.leftOuterJoin(y).cache
		leftJoin.collect().foreach(println)
		inspectRDD(leftJoin)

		println(">>> right outer join of x with y")
		val rightJoin = x.rightOuterJoin(y).cache
		rightJoin.collect().foreach(println)
		inspectRDD(rightJoin)
	}
	
	def inspectRDD[T](rdd: RDD[T]): Unit = {
		
		println(">>> Partition length...")
		rdd.mapPartitions(f => Iterator(f.length), true).foreach(println)
		
		println(">>> Partition data...")
		rdd.foreachPartition(f => f.foreach(println))
	}
} 
Example 116
Source File: RedisSourceRdd.scala    From spark-redis   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package org.apache.spark.sql.redis.stream

import com.redislabs.provider.redis.RedisConfig
import com.redislabs.provider.redis.util.ConnectionUtils.withConnection
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.redis.stream.RedisSourceTypes.StreamEntry
import org.apache.spark.{Partition, SparkContext, TaskContext}


class RedisSourceRdd(sc: SparkContext, redisConfig: RedisConfig,
                     offsetRanges: Seq[RedisSourceOffsetRange], autoAck: Boolean = true)
  extends RDD[StreamEntry](sc, Nil) {

  override def compute(split: Partition, context: TaskContext): Iterator[StreamEntry] = {
    val partition = split.asInstanceOf[RedisSourceRddPartition]
    val offsetRange = partition.offsetRange
    val streamReader = new RedisStreamReader(redisConfig)
    streamReader.unreadStreamEntries(offsetRange)
  }

  override protected def getPartitions: Array[Partition] = {
    offsetRanges.zipWithIndex.map { case (e, i) => RedisSourceRddPartition(i, e) }
      .toArray
  }
}

case class RedisSourceRddPartition(index: Int, offsetRange: RedisSourceOffsetRange)
  extends Partition 
Example 117
Source File: ManyValueBenchmarkSuite.scala    From spark-redis   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.redislabs.provider.redis.df.benchmark

import com.redislabs.provider.redis.env.RedisClusterEnv
import com.redislabs.provider.redis.util.Person
import org.apache.spark.rdd.RDD


trait ManyValueBenchmarkSuite extends DataframeBenchmarkSuite with RedisClusterEnv {

  private def num = 1000000

  override def suiteTags: String = s"${super.suiteTags}, Many:$num"

  override def rdd(): RDD[Person] = {
    val partitionsNum = 8
    val sectionLength = num / partitionsNum
    spark.sparkContext
      .parallelize(0 until partitionsNum, partitionsNum)
      .mapPartitions {
        _
          .flatMap { i =>
            val start = i * sectionLength
            val end = start + sectionLength + 1
            Stream.range(start, end)
          }
          .map { i =>
            Person(s"John-$i", 30, "60 Wall Street", 150.5)
          }
      }
  }
} 
Example 118
Source File: Dijkstra.scala    From graphx-algorithm   with GNU General Public License v2.0 5 votes vote down vote up
package org.apache.spark.graphx.iiot.shortestpath

import org.apache.spark.graphx.GraphLoaderPlus
import org.apache.spark._
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD


// NOTE: the object and main-method declarations were collapsed in the
// original listing; they are restored here so the example compiles.
object Dijkstra {
  def main(args: Array[String]): Unit = {
    if (args.length < 2) sys.error("Usage: inputFileName sourceId [outputFileDirectory]")

    val inputFile = args(0)
    val sourceId: VertexId = args(1).toInt

    val sc = new SparkContext(new SparkConf().setAppName("Dijkstra Algorithm"))

    val graph = GraphLoaderPlus.edgeListFile(sc, inputFile)

    // `mapEdges` sometimes may be needed such as
    // `g.mapEdges(e => (new scala.util.Random).nextInt(100))`
    val g = graph.mapVertices((id, _) =>
      if (id == sourceId) Array(0.0, id)
      else Array(Double.PositiveInfinity, id)
    )

    val sssp = g.pregel(Array(Double.PositiveInfinity, -1))(
      (id, dist, newDist) => {
        if (dist(0) < newDist(0)) dist
        else newDist
      },
      triplet => {
        if (triplet.srcAttr(0) + triplet.attr < triplet.dstAttr(0)) {
          Iterator((triplet.dstId, Array(triplet.srcAttr(0) + triplet.attr, triplet.srcId)))
        }
        else {
          Iterator.empty
        }
      },
      (a, b) => {
        if (a(0) < b(0)) a
        else b
      }
    )

    val format_sssp: RDD[String] = sssp.vertices.map(vertex =>
      "Vertex " + vertex._1 + ": distance is " + vertex._2(0) + ", previous node is Vertex " + vertex._2(1).toInt)
    format_sssp.collect.foreach(println(_))

    if (args.length > 2) {
      val outputFileDir = args(2)
      format_sssp.saveAsTextFile(outputFileDir)
    }
  }
} 
Example 119
Source File: ReplicatedVertexView.scala    From graphx-algorithm   with GNU General Public License v2.0 5 votes vote down vote up
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD

import org.apache.spark.graphx._


// NOTE: the class declaration was collapsed in the original listing; the
// header below follows the Spark GraphX source so that the references to
// `edges`, `hasSrcId` and `hasDstId` resolve.
private[impl]
class ReplicatedVertexView[VD: ClassTag, ED: ClassTag](
    var edges: EdgeRDDImpl[ED, VD],
    var hasSrcId: Boolean = false,
    var hasDstId: Boolean = false) {

  def updateVertices(updates: VertexRDD[VD]): ReplicatedVertexView[VD, ED] = {
    val shippedVerts = updates.shipVertexAttributes(hasSrcId, hasDstId)
      .setName("ReplicatedVertexView.updateVertices - shippedVerts %s %s (broadcast)".format(
        hasSrcId, hasDstId))
      .partitionBy(edges.partitioner.get)

    val newEdges = edges.withPartitionsRDD(edges.partitionsRDD.zipPartitions(shippedVerts) {
      (ePartIter, shippedVertsIter) => ePartIter.map {
        case (pid, edgePartition) =>
          (pid, edgePartition.updateVertices(shippedVertsIter.flatMap(_._2.iterator)))
      }
    })
    new ReplicatedVertexView(newEdges, hasSrcId, hasDstId)
  }
} 
Example 120
Source File: EdgeRDDImpl.scala    From graphx-algorithm   with GNU General Public License v2.0 5 votes vote down vote up
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.{OneToOneDependency, HashPartitioner, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

import org.apache.spark.graphx._

class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] (
    @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])],
    val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
  extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {

  override def setName(_name: String): this.type = {
    if (partitionsRDD.name != null) {
      partitionsRDD.setName(partitionsRDD.name + ", " + _name)
    } else {
      partitionsRDD.setName(_name)
    }
    this
  }
  setName("EdgeRDD")

  
  override def count(): Long = {
    partitionsRDD.map(_._2.size.toLong).reduce(_ + _)
  }

  override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] =
    mapEdgePartitions((pid, part) => part.map(f))

  override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse)

  def filter(
      epred: EdgeTriplet[VD, ED] => Boolean,
      vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = {
    mapEdgePartitions((pid, part) => part.filter(epred, vpred))
  }

  override def innerJoin[ED2: ClassTag, ED3: ClassTag]
      (other: EdgeRDD[ED2])
      (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = {
    val ed2Tag = classTag[ED2]
    val ed3Tag = classTag[ED3]
    this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) {
      (thisIter, otherIter) =>
        val (pid, thisEPart) = thisIter.next()
        val (_, otherEPart) = otherIter.next()
        Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag)))
    })
  }

  def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag](
      f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = {
    this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter =>
      if (iter.hasNext) {
        val (pid, ep) = iter.next()
        Iterator(Tuple2(pid, f(pid, ep)))
      } else {
        Iterator.empty
      }
    }, preservesPartitioning = true))
  }

  private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag](
      partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = {
    new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel)
  }

  override private[graphx] def withTargetStorageLevel(
      targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = {
    new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel)
  }

} 
Example 121
Source File: RoutingTablePartition.scala    From graphx-algorithm   with GNU General Public License v2.0 5 votes vote down vote up
package org.apache.spark.graphx.impl

import scala.reflect.ClassTag

import org.apache.spark.Partitioner
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.util.collection.{BitSet, PrimitiveVector}

import org.apache.spark.graphx._
import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap

import org.apache.spark.graphx.impl.RoutingTablePartition.RoutingTableMessage

private[graphx]
object RoutingTablePartition {
  // NOTE: most of this file is collapsed in the original listing. In the full
  // Spark source, the method below belongs to the companion
  // RoutingTablePartition class, whose `routingTable` field maps each edge
  // partition id to (candidate vertex ids, src-vertex bitset, dst-vertex
  // bitset); that is what the destructuring on its first line reads.
  def foreachWithinEdgePartition
      (pid: PartitionID, includeSrc: Boolean, includeDst: Boolean)
      (f: VertexId => Unit) {
    val (vidsCandidate, srcVids, dstVids) = routingTable(pid)
    val size = vidsCandidate.length
    if (includeSrc && includeDst) {
      // Avoid checks for performance
      vidsCandidate.iterator.foreach(f)
    } else if (!includeSrc && !includeDst) {
      // Do nothing
    } else {
      val relevantVids = if (includeSrc) srcVids else dstVids
      relevantVids.iterator.foreach { i => f(vidsCandidate(i)) }
    }
  }
} 
Example 122
Source File: SparkBatchAdapter.scala    From eventuate   with Apache License 2.0 5 votes vote down vote up
package com.rbmhtechnology.eventuate.adapter.spark

import akka.actor.ActorSystem
import akka.serialization.SerializationExtension

import com.datastax.spark.connector._
import com.datastax.spark.connector.types._
import com.rbmhtechnology.eventuate.DurableEvent
import com.rbmhtechnology.eventuate.log.cassandra.CassandraEventLogSettings
import com.typesafe.config._

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD


// NOTE: the class declaration and its fields were collapsed in the original
// listing; the header and the `cassandraSettings` field below are an assumed
// reconstruction so that `eventBatch` has its dependencies in scope.
class SparkBatchAdapter(val context: SparkContext, config: Config) {

  private val cassandraSettings = new CassandraEventLogSettings(config)

  def eventBatch(logId: String, fromSequenceNr: Long = 1L): RDD[DurableEvent] = {
    context.cassandraTable(cassandraSettings.keyspace, s"${cassandraSettings.tablePrefix}_$logId")
      .select("event").where(s"sequence_nr >= $fromSequenceNr").as((event: DurableEvent) => event)
  }
}

private class DurableEventConverter(config: Config) extends TypeConverter[DurableEvent] {
  import scala.reflect.runtime.universe._

  val converter = implicitly[TypeConverter[Array[Byte]]]

  // --------------------------------------
  //  FIXME: how to shutdown actor system?
  // --------------------------------------

  @transient lazy val system = ActorSystem("TypeConverter", config)
  @transient lazy val serial = SerializationExtension(system)

  def targetTypeTag = implicitly[TypeTag[DurableEvent]]
  def convertPF = {
    case obj => deserialize(converter.convert(obj))
  }

  def deserialize(bytes: Array[Byte]): DurableEvent =
    serial.deserialize(bytes, classOf[DurableEvent]).get
} 
Example 123
Source File: GenerateVerticesExample.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch08

// scalastyle:off println
import org.apache.log4j.{Level, Logger}

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.rdd.RDD


object GenerateVerticesExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 2) {
      throw new IllegalArgumentException("Invalid arguments")
    }
    // Set the log level to WARN
    Logger.getLogger("org").setLevel(Level.WARN)

    // Create the SparkContext
    val conf = new SparkConf().setAppName("GenerateVerticesExample")
    val sc = new SparkContext(conf)

    // Get configuration values from the command-line arguments
    val (numProducts, numUsers): (Int, Int) = (args(0).toInt, args(1).toInt)
    implicit val recOpts: RecommendLogOptions = RecommendLogOptions(numProducts, numUsers)

    run(sc)
    sc.stop()
  }

  def run(sc: SparkContext)
         (implicit recOpts: RecommendLogOptions)
  : Unit = {

    // Generate RDDs for the product list and the user list
    val products: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genProductList)
    val users: RDD[VertexProperty] = sc.parallelize(PurchaseLogGenerator.genUserList)

    // Show the first 20 products
    println("===================================")
    println("get top 20 products:")
    products.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))

    // Show the first 20 users
    println("===================================")
    println("get top 20 users:")
    users.take(20).foreach(x => println(s"id: ${x.id},\ttype: ${x.kind},\tname: ${x.name}"))

  }
}
// scalastyle:on println 
Example 124
Source File: gihyo_6_3_Transform.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{StreamingContext, Seconds}
import org.apache.spark.streaming.dstream.InputDStream

object gihyo_6_3_Transform {
  def main(args: Array[String]) {
    if (args.length != 2) {
      throw new IllegalArgumentException("Invalid arguments")
    }
    val targetHost = args(0)
    val targetHostPort = args(1).toInt

    val conf = new SparkConf().setAppName("NetworkWordCount")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(5))
    val lines = ssc.socketTextStream(targetHost, targetHostPort)
    val blackList = sc.parallelize(Array(("user002", "rockLogin"), ("user003", "rockPayment")))
    run(lines, blackList)

    ssc.start
    ssc.awaitTermination
  }

  def run(stream: InputDStream[String], blackList: RDD[(String, String)]) {
    val userList = stream.map(x => (x, "action:Login")).transform(rdd => {
      val tmpUserList = rdd.leftOuterJoin(blackList)
      tmpUserList.filter(user => (user._2._2 == None))
    })
    userList.print
  }
} 
Example 125
Source File: gihyo_6_3_JoinSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}
import scala.collection.mutable
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_JoinSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines1 = mutable.Queue[RDD[String]]()
    val ds1 = ssc.queueStream(lines1)
    val lines2 = mutable.Queue[RDD[String]]()
    val ds2 = ssc.queueStream(lines2)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_Join.run(ds1, ds2)
    ssc.start()

    lines1 += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
    lines2 += sc.makeRDD(Seq("key2", "key3", "key4")) // test data
    clock.advance(1000)
    Thread.sleep(1000)
  }
} 
Example 126
Source File: gihyo_6_3_CountByValueAndWindowSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}
import scala.collection.mutable
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper
import java.nio.file.Files

class gihyo_6_3_CountByValueAndWindowSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString
    ssc.checkpoint(checkpointDir)
    gihyo_6_3_countByValueAndWindow.run(ds, 2, 1)
    ssc.start()
    (1 to 3).foreach { case i =>
      lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
      clock.advance(1000)
      Thread.sleep(1000)
    }
  }
} 
Example 127
Source File: gihyo_6_3_MapSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_MapSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_Map.run(ds)
    ssc.start()
    lines += sc.makeRDD(Seq("key1", "key2", "key3", "key1")) // test data
    clock.advance(1000)
    Thread.sleep(1000)
  }
} 
Example 128
Source File: gihyo_6_3_TwitterStreamSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import java.nio.file.Files

import scala.collection.mutable
import scala.io.Source

import twitter4j.{Status, TwitterObjectFactory}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}


class gihyo_6_3_TwitterStreamSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[Status]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_TwitterStream.run(
      sc,
      ds,
      Files.createTempDirectory("TwitterTag").toString,
      Files.createTempDirectory("TwitterWords").toString)
    val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString
    ssc.checkpoint(checkpointDir)
    ssc.start()

    (1 to 2).foreach { case i =>
      // test data
      lines += sc.makeRDD(Seq(
        MockTweetGenerator.createMockStatusFromJson(),
        MockTweetGenerator.createMockStatusFromJson(),
        MockTweetGenerator.createMockStatusFromJson(),
        MockTweetGenerator.createMockStatusFromJson()))
      clock.advance(1000)
      Thread.sleep(1000)
    }
  }
}

object MockTweetGenerator {
  // Creates a tweet status from a JSON file
  def createMockStatusFromJson(): Status = {
    val jsonFile = getClass.getResource("/streaming/test-tweet.json").getPath
    TwitterObjectFactory.createStatus(Source.fromFile(jsonFile).getLines().mkString)
  }
} 
Example 129
Source File: gihyo_6_3_FilterSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_FilterSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_Filter.run(ds)
    ssc.start()
    lines += sc.makeRDD(Seq("lengthOver5", "les1", "les2")) // test data
    clock.advance(1000)
    Thread.sleep(1000)
  }
} 
Example 130
Source File: gihyo_6_3_FlatMapSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_FlatMapSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_flatMap.run(ds)
    ssc.start()
    // test data
    lines += sc.makeRDD(Seq("Apache Spark is a fast and general-purpose cluster computing system."))
    clock.advance(1000)
    Thread.sleep(1000)
  }
} 
Example 131
Source File: gihyo_6_3_CountSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_CountSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_Count.run(ds, 2, 1)
    ssc.start()
    (1 to 2).foreach { case i =>
      lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
      clock.advance(1000)
      Thread.sleep(1000)
    }
  }
} 
Example 132
Source File: gihyo_6_3_UnionSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_UnionSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = (1 to 3).map(x => mutable.Queue[RDD[(String, String)]]())
    val dss = lines.map(x => ssc.queueStream(x))
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_Union.run(ssc, dss)
    ssc.start()
    lines.map(x => x += sc.makeRDD(Seq(("", "key1"), ("", "key2"), ("", "key3")))) //test data
    clock.advance(1000)
    Thread.sleep(1000)
  }
} 
Example 133
Source File: gihyo_6_3_ReduceByKeyAndWindowSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_ReduceByKeyAndWindowSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_reduceByKeyAndWindow.run(ds, 2, 1)
    ssc.start()
    (1 to 3).foreach { case i =>
      lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
      clock.advance(1000)
      Thread.sleep(1000)
    }
  }
} 
Example 134
Source File: gihyo_6_3_ReduceByKeySuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_ReduceByKeySuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_reduceByKey.run(ds)
    ssc.start()
    lines += sc.makeRDD(Seq("key1", "key2", "key3", "key1")) // test data
    clock.advance(1000)
    Thread.sleep(1000)
  }
} 
Example 135
Source File: gihyo_6_3_CountByWindowSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import java.nio.file.Files

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_CountByWindowSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString
    ssc.checkpoint(checkpointDir)
    gihyo_6_3_countByWindow.run(ds, 2, 1)
    ssc.start()
    (1 to 3).foreach { case i =>
      lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
      clock.advance(1000)
      Thread.sleep(1000)
    }
  }
} 
Example 136
Source File: gihyo_6_3_UpdateStateByKeySuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import java.nio.file.Files

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_UpdateStateByKeySuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_updateStateByKey.run(ds)
    val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString
    ssc.checkpoint(checkpointDir)
    ssc.start()
    lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
    clock.advance(1000)
    Thread.sleep(1000)
  }
} 
Example 137
Source File: gihyo_6_3_RepartitionSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_RepartitionSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_Repartition.run(ds)
    ssc.start()
    lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
    clock.advance(1000)
    Thread.sleep(1000)
  }
} 
Example 138
package jp.gihyo.spark.ch06

import java.nio.file.Files

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_ReduceByKeyAndWindowEfficientSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_reduceByKeyAndWindow_efficient.run(ds, 2, 1)
    val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString
    ssc.checkpoint(checkpointDir)
    ssc.start()
    (1 to 2).foreach { case i =>
      lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
      clock.advance(1000)
      Thread.sleep(1000)
    }
  }
} 
Example 139
Source File: gihyo_6_3_KafkaStreamSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable
import java.nio.file.Files

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_KafkaStreamSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[(String, String)]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_KafkaStream.run(ds, Files.createTempDirectory("KafkaStreamSuite").toString, 2, 1)
    val checkpointDir = Files.createTempDirectory("StreamingUnitTest").toString
    ssc.checkpoint(checkpointDir)
    ssc.start()
    (1 to 2).foreach { case i =>
      lines += sc.makeRDD(Seq(("", "userid:userid001,action:view,pageid:value1"),
        ("", "userid:userid002,action:click,pageid:value2"),
        ("", "userid:userid003,action:view,pageid:value3"),
        ("", "userid:userid001,action:view,pageid:value4"))) // test data
      clock.advance(1000)
      Thread.sleep(1000)
    }
  }
} 
Example 140
Source File: gihyo_6_3_WiindowSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}
import scala.collection.mutable
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_WindowSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_Window.run(ds, 2, 1)
    ssc.start()
    (1 to 3).foreach {
      case i => {
        lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
        clock.advance(1000)
        Thread.sleep(1000)
      }
    }
  }
} 
Example 141
Source File: gihyo_6_3_CogroupSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_CogroupSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val lines2 = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val ds2 = ssc.queueStream(lines2)
    val clock = new StreamingContextWrapper(ssc).manualClock
    lines += sc.makeRDD(Seq("key1", "key2", "key3")) // test data
    lines2 += sc.makeRDD(Seq("key2", "key3", "key4")) // test data
    gihyo_6_3_Cogroup.run(ds, ds2)
    ssc.start()
    clock.advance(1000)
    Thread.sleep(1000)
  }
} 
Example 142
Source File: gihyo_6_2_1_SampleSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_2_1_SampleSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    lines += sc.makeRDD(Seq("word1 word2", "word3 word1", "word4 word2")) // test data
    gihyo_6_2_1_Sample.run(ds)
    ssc.start()
    clock.advance(1000)
    Thread.sleep(1000)
  }
} 
Example 143
Source File: gihyo_6_3_TransformSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_TransformSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    val blackList = sc.parallelize(Array(("user002", "rockLogin"), ("user003", "rockPayment")))
    gihyo_6_3_Transform.run(ds, blackList)
    ssc.start()
    lines += sc.makeRDD(Seq("user001", "user002", "user003")) // test data
    clock.advance(1000)
    Thread.sleep(1000)
  }
} 
Example 144
Source File: gihyo_6_3_CountByValueSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_CountByValueSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_countByValue.run(ds)
    ssc.start()
    lines += sc.makeRDD(Seq("key1", "key2", "key3", "key1")) // test data
    clock.advance(1000)
    Thread.sleep(1000)
  }
} 
Example 145
Source File: gihyo_6_3_ReduceSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_ReduceSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_Reduce.run(ds)
    ssc.start()
    lines += sc.makeRDD(Seq("gi", "jutsu", "hyoron", "sha")) // test data
    clock.advance(1000)
    Thread.sleep(1000)
  }
} 
Example 146
Source File: gihyo_6_3_ReduceByWindowSuite.scala    From gihyo-spark-book-example   with Apache License 2.0 5 votes vote down vote up
package jp.gihyo.spark.ch06

import scala.collection.mutable

import jp.gihyo.spark.{SparkFunSuite, TestStreamingContext}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.StreamingContextWrapper

class gihyo_6_3_ReduceByWindowSuite extends SparkFunSuite with TestStreamingContext {

  test("run") {
    val lines = mutable.Queue[RDD[String]]()
    val ds = ssc.queueStream(lines)
    val clock = new StreamingContextWrapper(ssc).manualClock
    gihyo_6_3_reduceByWindow.run(ds, 2, 1)
    ssc.start()
    (1 to 2).foreach {
      case i => {
        lines += sc.makeRDD(Seq("gi", "jutsu", "hyoron", "sha")) // test data
        clock.advance(1000)
        Thread.sleep(1000)
      }
    }
  }
} 
Example 147
Source File: FileReader.scala    From bdd-spark   with MIT License 5 votes vote down vote up
import org.apache.spark.rdd.RDD

trait FileReader {
  def readLinesToRdd(filename : String) : RDD[String]
  def readText(filename : String) : String
}

object FileReader {
  class RealFileReader extends FileReader{
    override def readLinesToRdd(filename: String): RDD[String] = {
      Spark.spark.sparkContext.textFile(filename)
    }

    override def readText(filename: String): String = {
      //Whatever!
      ""
    }
  }

  def apply() : FileReader = new RealFileReader
} 
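The FileReader trait above exists so that file access can be swapped out in tests. Below is a minimal sketch of such a test double; the object name and the stub data are illustrative and not part of the original project, but it uses only the trait and the Spark object already referenced in the example.

// Hypothetical stub (not in the original source): implements FileReader with
// in-memory data so BDD steps can run without touching the file system.
object StubFileReader extends FileReader {
  override def readLinesToRdd(filename: String): RDD[String] =
    Spark.spark.sparkContext.parallelize(Seq("line one", "line two"))

  override def readText(filename: String): String = "stub contents"
}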
Example 148
Source File: RecommendationModelReuse.scala    From Scala-Machine-Learning-Projects   with MIT License 5 votes vote down vote up
package com.packt.ScalaML.MovieRecommendation

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import scala.Tuple2
import org.apache.spark.rdd.RDD

object RecommendationModelReuse {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("JavaLDAExample")
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/").
      getOrCreate()

    val ratigsFile = "data/ratings.csv"
    val ratingDF = spark.read.format("com.databricks.spark.csv").option("header", true).load(ratigsFile)
    val selectedRatingsDF = ratingDF.select(ratingDF.col("userId"), ratingDF.col("movieId"), ratingDF.col("rating"), ratingDF.col("timestamp"))

    // Randomly split ratings RDD into training data RDD (75%) and test data RDD (25%)
    val splits = selectedRatingsDF.randomSplit(Array(0.75, 0.25), seed = 12345L)
    val testData = splits(1)

    val testRDD = testData.rdd.map(row => {
      val userId = row.getString(0)
      val movieId = row.getString(1)
      val ratings = row.getString(2)
      Rating(userId.toInt, movieId.toInt, ratings.toDouble)
    })

    //Load the workflow back
    val same_model = MatrixFactorizationModel.load(spark.sparkContext, "model/MovieRecomModel/")

    // Making predictions: get the top 10 movie recommendations for user 458
    println("Rating:(UserID, MovieID, Rating)")
    println("----------------------------------")
    val topRecsForUser = same_model.recommendProducts(458, 10)
    for (rating <- topRecsForUser) {
      println(rating.toString())
    }
    println("----------------------------------")

    val rmseTest = MovieRecommendation.computeRmse(same_model, testRDD, true)
    println("Test RMSE: = " + rmseTest) //Less is better

    // Movie recommendations for a specific user: get the top 10 recommendations for user 458
    println("Recommendations: (MovieId => Rating)")
    println("----------------------------------")
    val recommendationsUser = same_model.recommendProducts(458, 10)
    recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println)
    println("----------------------------------")

    spark.stop()
  }
} 
Example 149
Source File: MovieRecommendation.scala    From Scala-Machine-Learning-Projects   with MIT License 5 votes vote down vote up
package com.packt.ScalaML.MovieRecommendation

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.SQLImplicits
import org.apache.spark.sql._
import org.apache.spark.sql.Dataset
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
import org.apache.spark.mllib.recommendation.Rating
import scala.Tuple2
import org.apache.spark.rdd.RDD

object MovieRecommendation {  
  // Compute the RMSE to evaluate the model. The lower the RMSE, the better the model and its prediction capability.
  def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], implicitPrefs: Boolean): Double = {
    val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product)))
    val predictionsAndRatings = predictions.map { x => ((x.user, x.product), x.rating)
    }.join(data.map(x => ((x.user, x.product), x.rating))).values
    if (implicitPrefs) {
      println("(Prediction, Rating)")
      println(predictionsAndRatings.take(5).mkString("\n"))
    }
    math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean())
  }

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("JavaLDAExample")
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/").
      getOrCreate()

    val ratigsFile = "data/ratings.csv"
    val df1 = spark.read.format("com.databricks.spark.csv").option("header", true).load(ratigsFile)

    val ratingsDF = df1.select(df1.col("userId"), df1.col("movieId"), df1.col("rating"), df1.col("timestamp"))
    ratingsDF.show(false)

    val moviesFile = "data/movies.csv"
    val df2 = spark.read.format("com.databricks.spark.csv").option("header", "true").load(moviesFile)

    val moviesDF = df2.select(df2.col("movieId"), df2.col("title"), df2.col("genres"))
    moviesDF.show(false)

    ratingsDF.createOrReplaceTempView("ratings")
    moviesDF.createOrReplaceTempView("movies")

    

    // (The original listing collapses the code here that splits ratingsDF
    // into training/test sets, converts them to RDD[Rating], and trains an
    // ALS model; the collapsed code produces the `model` and `testRDD` used
    // below. A hedged reconstruction is sketched after this example.)
    val rmseTest = computeRmse(model, testRDD, true)
    println("Test RMSE: = " + rmseTest) //Less is better

    //Movie recommendation for a specific user. Get the top 6 movie predictions for user 668
    println("Recommendations: (MovieId => Rating)")
    println("----------------------------------")
    val recommendationsUser = model.recommendProducts(668, 6)
    recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println)
    println("----------------------------------")

    spark.stop()
  }
} 
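Since the listing above collapses the training step, here is a hedged sketch of how the pieces might be wired together inside main, where ratingsDF is in scope. The split ratio, the ALS hyper-parameters (rank = 10, iterations = 10, lambda = 0.01), and the helper names trainDF/testDF/trainRDD/testRDD are illustrative assumptions, not taken from the original file; only the imported MLlib types and the computeRmse method defined above are from the example.

    // Hypothetical reconstruction of the collapsed section (names and
    // hyper-parameters are illustrative, not from the original source).
    val Array(trainDF, testDF) = ratingsDF.randomSplit(Array(0.75, 0.25), seed = 12345L)

    def toRating(row: Row): Rating =
      Rating(row.getString(0).toInt, row.getString(1).toInt, row.getString(2).toDouble)

    val trainRDD: RDD[Rating] = trainDF.rdd.map(toRating)
    val testRDD: RDD[Rating] = testDF.rdd.map(toRating)

    val model: MatrixFactorizationModel = ALS.train(trainRDD, 10, 10, 0.01)
    val rmse = computeRmse(model, testRDD, implicitPrefs = false)
    println("Test RMSE: = " + rmse)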
Example 150
Source File: HbRddWriter.scala    From hbrdd   with Apache License 2.0 5 votes vote down vote up
package top.spoofer.hbrdd.hbsupport

import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.spark.rdd.RDD
import top.spoofer.hbrdd.config.HbRddConfig
import top.spoofer.hbrdd.unit.HbRddFormatsWriter
import top.spoofer.hbrdd._
import HbRddWritPuter._

trait HbRddWriter {
  type TsValue[A] = (Long, A) // (ts, A)
  val LATEST_TIMESTAMP = Long.MaxValue
  // NOTE: the rest of the trait body is collapsed in the original listing;
  // the closing brace below is added so that the trait and the class that
  // follows both parse.
}

final class SingleFamilyRDDWriter[A](
    val rdd: RDD[(String, Map[String, A])],
    val put: HbRddPuter[A]
) extends HbRddWritCommon[A] with Serializable {
  def put2Hbase(tableName: String, family: String)(implicit config: HbRddConfig) = {
    val job = createJob(tableName, config.getHbaseConfig)
    rdd.flatMap({ case (rowId, data) => convert2Writable(rowId, Map(family -> data), put) })
      .saveAsNewAPIHadoopDataset(job.getConfiguration)
  }
} 
Example 151
Source File: XmlReader.scala    From spark-xml   with Apache License 2.0 5 votes vote down vote up
package com.databricks.spark.xml

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SQLContext, SparkSession}
import org.apache.spark.sql.types.StructType
import com.databricks.spark.xml.util.XmlFile
import com.databricks.spark.xml.util.FailFastMode


  @deprecated("Use xmlFile(SparkSession, ...)", "0.5.0")
  def xmlFile(sqlContext: SQLContext, path: String): DataFrame = {
    // We need the `charset` and `rowTag` before creating the relation.
    val (charset, rowTag) = {
      val options = XmlOptions(parameters.toMap)
      (options.charset, options.rowTag)
    }
    val relation = XmlRelation(
      () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag),
      Some(path),
      parameters.toMap,
      schema)(sqlContext)
    sqlContext.baseRelationToDataFrame(relation)
  }

  @deprecated("Use xmlRdd(SparkSession, ...)", "0.5.0")
  def xmlRdd(sqlContext: SQLContext, xmlRDD: RDD[String]): DataFrame = {
    val relation = XmlRelation(
      () => xmlRDD,
      None,
      parameters.toMap,
      schema)(sqlContext)
    sqlContext.baseRelationToDataFrame(relation)
  }

} 
Example 152
Source File: XmlFile.scala    From spark-xml   with Apache License 2.0 5 votes vote down vote up
package com.databricks.spark.xml.util

import java.io.CharArrayWriter
import java.nio.charset.Charset
import javax.xml.stream.XMLOutputFactory

import scala.collection.Map

import com.databricks.spark.xml.parsers.StaxXmlGenerator
import com.sun.xml.txw2.output.IndentingXMLStreamWriter
import org.apache.hadoop.io.{Text, LongWritable}

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.sql.DataFrame
import com.databricks.spark.xml.{XmlOptions, XmlInputFormat}

private[xml] object XmlFile {
  val DEFAULT_INDENT = "    "

  def withCharset(
      context: SparkContext,
      location: String,
      charset: String,
      rowTag: String): RDD[String] = {
    // This just checks the charset's validity early, to keep behavior
    Charset.forName(charset)
    context.hadoopConfiguration.set(XmlInputFormat.START_TAG_KEY, s"<$rowTag>")
    context.hadoopConfiguration.set(XmlInputFormat.END_TAG_KEY, s"</$rowTag>")
    context.hadoopConfiguration.set(XmlInputFormat.ENCODING_KEY, charset)
    context.newAPIHadoopFile(location,
      classOf[XmlInputFormat],
      classOf[LongWritable],
      classOf[Text]).map { case (_, text) => new String(text.getBytes, 0, text.getLength, charset) }
  }

  
  def saveAsXmlFile(
      dataFrame: DataFrame,
      path: String,
      parameters: Map[String, String] = Map()): Unit = {
    val options = XmlOptions(parameters.toMap)
    val codecClass = CompressionCodecs.getCodecClass(options.codec)
    val rowSchema = dataFrame.schema
    val indent = XmlFile.DEFAULT_INDENT

    val xmlRDD = dataFrame.rdd.mapPartitions { iter =>
      val factory = XMLOutputFactory.newInstance()
      val writer = new CharArrayWriter()
      val xmlWriter = factory.createXMLStreamWriter(writer)
      val indentingXmlWriter = new IndentingXMLStreamWriter(xmlWriter)
      indentingXmlWriter.setIndentStep(indent)

      new Iterator[String] {
        var firstRow: Boolean = true
        var lastRow: Boolean = true

        override def hasNext: Boolean = iter.hasNext || firstRow || lastRow

        override def next: String = {
          if (iter.nonEmpty) {
            if (firstRow) {
              indentingXmlWriter.writeStartElement(options.rootTag)
              firstRow = false
            }
            val xml = {
              StaxXmlGenerator(
                rowSchema,
                indentingXmlWriter,
                options)(iter.next())
              indentingXmlWriter.flush()
              writer.toString
            }
            writer.reset()
            xml
          } else {
            if (!firstRow) {
              lastRow = false
              indentingXmlWriter.writeEndElement()
              indentingXmlWriter.close()
              writer.toString
            } else {
              // This means the iterator was initially empty.
              firstRow = false
              lastRow = false
              ""
            }
          }
        }
      }
    }

    codecClass match {
      case null => xmlRDD.saveAsTextFile(path)
      case codec => xmlRDD.saveAsTextFile(path, codec)
    }
  }
} 
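A small usage sketch for the withCharset helper above; the path and row tag are illustrative, and since XmlFile is private[xml] the caller would have to live inside the com.databricks.spark.xml package.

  // Hypothetical caller (illustrative path and row tag): yields one string
  // per <book> element of the input file.
  def readBooks(sc: SparkContext): RDD[String] =
    XmlFile.withCharset(sc, "data/books.xml", charset = "UTF-8", rowTag = "book")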
Example 153
Source File: XmlRelation.scala    From spark-xml   with Apache License 2.0 5 votes vote down vote up
package com.databricks.spark.xml

import java.io.IOException

import org.apache.hadoop.fs.Path

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.sources.{PrunedScan, InsertableRelation, BaseRelation, TableScan}
import org.apache.spark.sql.types._
import com.databricks.spark.xml.util.{InferSchema, XmlFile}
import com.databricks.spark.xml.parsers.StaxXmlParser

case class XmlRelation protected[spark] (
    baseRDD: () => RDD[String],
    location: Option[String],
    parameters: Map[String, String],
    userSchema: StructType = null)(@transient val sqlContext: SQLContext)
  extends BaseRelation
  with InsertableRelation
  with PrunedScan {

  private val options = XmlOptions(parameters)

  override val schema: StructType = {
    Option(userSchema).getOrElse {
      InferSchema.infer(
        baseRDD(),
        options)
    }
  }

  override def buildScan(requiredColumns: Array[String]): RDD[Row] = {
    val requiredFields = requiredColumns.map(schema(_))
    val requestedSchema = StructType(requiredFields)
    StaxXmlParser.parse(
      baseRDD(),
      requestedSchema,
      options)
  }

  // The function below was borrowed from JSONRelation
  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    val filesystemPath = location match {
      case Some(p) => new Path(p)
      case None =>
        throw new IOException(s"Cannot INSERT into table with no path defined")
    }

    val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)

    if (overwrite) {
      try {
        fs.delete(filesystemPath, true)
      } catch {
        case e: IOException =>
          throw new IOException(
            s"Unable to clear output directory ${filesystemPath.toString} prior"
              + s" to INSERT OVERWRITE a XML table:\n${e.toString}")
      }
      // Write the data. We assume that schema isn't changed, and we won't update it.
      XmlFile.saveAsXmlFile(data, filesystemPath.toString, parameters)
    } else {
      throw new IllegalArgumentException("XML tables only support INSERT OVERWRITE for now.")
    }
  }
} 
Example 154
Source File: SparkSuite.scala    From spark-sorted   with Apache License 2.0 5 votes vote down vote up
package com.tresata.spark.sorted

import org.scalactic.Equality
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{ Dataset, SparkSession }

object SparkSuite {
  lazy val spark: SparkSession = {
    val session = SparkSession.builder
      .master("local[*]")
      .appName("test")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.ui.enabled", false)
      .config("spark.sql.shuffle.partitions", 4)
      .getOrCreate()
    session
  }
  lazy val sc: SparkContext = spark.sparkContext

  lazy val jsc = new JavaSparkContext(sc)
  def javaSparkContext() = jsc
}

trait SparkSuite {
  implicit lazy val spark: SparkSession = SparkSuite.spark
  implicit lazy val sc: SparkContext = SparkSuite.spark.sparkContext

  implicit def rddEq[X]: Equality[RDD[X]] = new Equality[RDD[X]] {
    private def toCounts[Y](s: Seq[Y]): Map[Y, Int] = s.groupBy(identity).mapValues(_.size)

    def areEqual(a: RDD[X], b: Any): Boolean = b match {
      case s: Seq[_] => toCounts(a.collect) == toCounts(s)
      case rdd: RDD[_] => toCounts(a.collect) == toCounts(rdd.collect)
    }
  }

  implicit def gsEq[K, V](implicit rddEq: Equality[RDD[(K, V)]]): Equality[GroupSorted[K, V]] = new Equality[GroupSorted[K, V]] {
    def areEqual(a: GroupSorted[K, V], b: Any): Boolean = rddEq.areEqual(a, b)
  }
  
  implicit def dsEq[X](implicit rddEq: Equality[RDD[X]]): Equality[Dataset[X]] = new Equality[Dataset[X]] {
    def areEqual(a: Dataset[X], b: Any): Boolean = b match {
      case ds: Dataset[_] => rddEq.areEqual(a.rdd, ds.rdd)
      case x => rddEq.areEqual(a.rdd, x)
    }
  }
} 
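A minimal sketch of mixing SparkSuite into a test, assuming ScalaTest's FunSuite is on the classpath (the suite name and data are placeholders):

import org.scalatest.FunSuite
import com.tresata.spark.sorted.SparkSuite

class WordCountSpec extends FunSuite with SparkSuite {
  test("counts words with the shared SparkContext") {
    // sc comes from the SparkSuite trait above
    val counts = sc.parallelize(Seq("a", "b", "a"))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .collectAsMap()
    assert(counts("a") === 2)
  }
}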
Example 155
Source File: BinaryClassifierEvaluator.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.evaluation

import org.apache.spark.rdd.RDD

// Enclosing declaration restored for readability; the extractor dropped the original object
// header and scaladoc, so any extra modifiers or extends clauses are not shown here.
object BinaryClassifierEvaluator {
  def evaluate(predictions: RDD[Boolean], actuals: RDD[Boolean]): BinaryClassificationMetrics = {
    predictions.zip(actuals).map { case (pred, actual) =>
      val tp = if (pred && actual) 1d else 0d
      val fp = if (pred && !actual) 1d else 0d
      val tn = if (!pred && !actual) 1d else 0d
      val fn = if (!pred && actual) 1d else 0d
      BinaryClassificationMetrics(tp, fp, tn, fn)
    }.reduce(_ merge _)
  }
} 
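A small usage sketch, assuming the object name restored above and that BinaryClassificationMetrics (defined elsewhere in keystone) holds the tp/fp/tn/fn counts:

import org.apache.spark.{SparkConf, SparkContext}
import keystoneml.evaluation.BinaryClassifierEvaluator

object BinaryEvalUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("binary-eval"))

    val predictions = sc.parallelize(Seq(true, false, true, true))
    val actuals     = sc.parallelize(Seq(true, false, false, true))

    // One confusion-matrix entry per record, merged with a single reduce
    val metrics = BinaryClassifierEvaluator.evaluate(predictions, actuals)
    println(metrics)

    sc.stop()
  }
}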
Example 156
Source File: AugmentedExamplesEvaluator.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.evaluation

import breeze.linalg._
import keystoneml.nodes.util.MaxClassifier
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

object AggregationPolicyType extends Enumeration {
  type AggregationPolicyType = Value
  val average, borda = Value
}

class AugmentedExamplesEvaluator[T : ClassTag](
    names: RDD[T],
    numClasses: Int,
    policy: AggregationPolicyType.Value = AggregationPolicyType.average)
  extends Evaluator[DenseVector[Double], Int, MulticlassMetrics] with Serializable {

  def averagePolicy(preds: Array[DenseVector[Double]]): DenseVector[Double] = {
    preds.reduce(_ + _) :/ preds.size.toDouble
  }

  
  def bordaPolicy(preds: Array[DenseVector[Double]]): DenseVector[Double] = {
    val ranks = preds.map { vec =>
      val sortedPreds = vec.toArray.zipWithIndex.sortBy(_._1).map(_._2)
      val rank = DenseVector(sortedPreds.zipWithIndex.sortBy(_._1).map(x => x._2.toDouble))
      rank
    }
    ranks.reduceLeft(_ + _)
  }

  def evaluate(
      predicted: RDD[DenseVector[Double]],
      actualLabels: RDD[Int]): MulticlassMetrics = {

    val aggFunc = policy match {
      case AggregationPolicyType.borda => bordaPolicy _
      case _ => averagePolicy _
    }
       
    // associate a name with each predicted, actual
    val namedPreds = names.zip(predicted.zip(actualLabels))

    // group by name to get all the predicted values for a name
    val groupedPreds = namedPreds.groupByKey(names.partitions.length).map { case (group, iter) =>
      val predActuals = iter.toArray // this is an array of (prediction, actual) tuples
      val predsForName = predActuals.map(_._1)
      assert(predActuals.map(_._2).distinct.size == 1)
      val actualForName: Int = predActuals.map(_._2).head

      (predsForName, actualForName)
    }.cache()

    // Averaging policy
    val finalPred = groupedPreds.map(x => (aggFunc(x._1), x._2) )
    val finalPredictedLabels = MaxClassifier(finalPred.map(_._1))
    val finalActualLabels = finalPred.map(_._2)

    val ret = new MulticlassClassifierEvaluator(numClasses).evaluate(finalPredictedLabels, finalActualLabels)
    groupedPreds.unpersist()
    ret
  }
} 
Example 157
Source File: MeanAveragePrecisionEvaluator.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.evaluation

import breeze.linalg.DenseVector
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._

// Enclosing declaration restored for readability; the extractor dropped the original class header,
// its scaladoc and the evaluate method, leaving only the average-precision helper below.
object MeanAveragePrecisionEvaluator {
  private def getAP(precisions: Array[Double], recalls: Array[Double]) = {
    var ap = 0.0
    val levels = (0 to 10).map(x => x / 10.0)
    levels.foreach { t =>
      // Find where recalls are greater than t and precision values at those indices
      val px = recalls.toSeq.zipWithIndex.filter(x => x._1 >= t).map(x => precisions(x._2))
      val p = if (px.isEmpty) {
        0.0
      } else {
        px.max
      }
      ap = ap + p / 11.0
    }
    ap
  }

} 
Example 158
Source File: Stats.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.utils

import java.util.{Random => JRandom}

import breeze.linalg._
import breeze.numerics._
import breeze.stats._
import breeze.stats.distributions._
import keystoneml.nodes.util.TopKClassifier
import org.apache.spark.rdd.RDD

object Stats extends Serializable {
  
  def normalizeRows(mat: DenseMatrix[Double], alpha: Double = 1.0): DenseMatrix[Double] = {
    // FIXME: This currently must convert the matrices to double due to breeze implicits
    // TODO: Could optimize, use way fewer copies
    val rowMeans: DenseVector[Double] = mean(mat(*, ::)).map(x => if (x.isNaN) 0 else x)
    val variances: DenseVector[Double] = sum((mat(::, *) - rowMeans) :^= 2.0, Axis._1) :/= (mat.cols.toDouble - 1.0)
    val sds: DenseVector[Double] = sqrt(variances + alpha.toDouble).map(x => if (x.isNaN) math.sqrt(alpha) else x)

    val out = mat(::, *) - rowMeans
    out(::, *) /= sds

    out
  }
} 
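A quick sketch of calling normalizeRows on a small breeze matrix (the values are arbitrary):

import breeze.linalg.DenseMatrix
import keystoneml.utils.Stats

object NormalizeRowsUsage {
  def main(args: Array[String]): Unit = {
    val mat = DenseMatrix((1.0, 2.0, 3.0), (4.0, 6.0, 8.0))
    // Each row is centered by its mean and divided by a regularized standard deviation
    val normalized = Stats.normalizeRows(mat, alpha = 0.1)
    println(normalized)
  }
}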
Example 159
Source File: GatherTransformerOperator.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.workflow

import org.apache.spark.rdd.RDD


private[workflow] case class GatherTransformerOperator[T]() extends TransformerOperator {
  override private[workflow] def singleTransform(inputs: Seq[DatumExpression]): Any = {
    inputs.map(_.get.asInstanceOf[T])
  }

  override private[workflow] def batchTransform(inputs: Seq[DatasetExpression]): RDD[_] = {
    inputs.map(_.get.asInstanceOf[RDD[T]].map(t => Seq(t))).reduceLeft((x, y) => {
      x.zip(y).map(z => z._1 ++ z._2)
    })
  }
} 
Example 160
Source File: PipelineDataset.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.workflow

import org.apache.spark.rdd.RDD


class PipelineDataset[T] private[workflow](executor: GraphExecutor, sink: SinkId)
  extends PipelineResult[RDD[T]](
    executor,
    sink)

object PipelineDataset {
  private[workflow] def apply[T](rdd: RDD[T]): PipelineDataset[T] = {
    val emptyGraph = Graph(Set(), Map(), Map(), Map())
    val (graphWithDataset, nodeId) = emptyGraph.addNode(new DatasetOperator(rdd), Seq())
    val (graph, sinkId) = graphWithDataset.addSink(nodeId)

    new PipelineDataset[T](new GraphExecutor(graph), sinkId)
  }
} 
Example 161
Source File: KernelMatrix.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import breeze.linalg._

import org.apache.spark.rdd.RDD

import keystoneml.utils.{MatrixUtils, Stats}
import keystoneml.workflow.{Transformer, LabelEstimator}


class BlockKernelMatrix[T: ClassTag](
    val kernelGen: KernelTransformer[T],
    val data: RDD[T],
    val cacheKernel: Boolean)
  extends KernelMatrix {

  val colBlockCache = HashMap.empty[Seq[Int], RDD[DenseMatrix[Double]]]
  val diagBlockCache = HashMap.empty[Seq[Int], DenseMatrix[Double]]

  def apply(colIdxs: Seq[Int]): RDD[DenseMatrix[Double]] = {
    if (colBlockCache.contains(colIdxs)) {
      colBlockCache(colIdxs)
    } else {
      val (kBlock, diagBlock) = kernelGen.computeKernel(data, colIdxs)
      if (cacheKernel) {
        colBlockCache += (colIdxs -> kBlock)
        diagBlockCache += (colIdxs -> diagBlock)
      }
      kBlock
    }
  }

  def unpersist(colIdxs: Seq[Int]): Unit = {
    if (colBlockCache.contains(colIdxs) && !cacheKernel) {
      colBlockCache(colIdxs).unpersist(true)
    }
  }

  def diagBlock(idxs: Seq[Int]): DenseMatrix[Double] = {
    if (!diagBlockCache.contains(idxs)) {
      val (kBlock, diagBlock) = kernelGen.computeKernel(data, idxs)
      if (cacheKernel) {
        colBlockCache += (idxs -> kBlock)
        diagBlockCache += (idxs -> diagBlock)
      }
      diagBlock
    } else {
      diagBlockCache(idxs)
    }
  }
} 
Example 162
Source File: LinearMapper.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import breeze.linalg._
import edu.berkeley.cs.amplab.mlmatrix.{NormalEquations, RowPartitionedMatrix}
import keystoneml.nodes.stats.{StandardScaler, StandardScalerModel}
import keystoneml.nodes.util.Densify
import org.apache.spark.rdd.RDD
import keystoneml.utils.MatrixUtils
import keystoneml.workflow.{LabelEstimator, Transformer}


object LinearMapEstimator extends Serializable {
  def apply(lambda: Option[Double] = None) = new LinearMapEstimator(lambda)

  def computeCost(
      trainingFeatures: RDD[DenseVector[Double]],
      trainingLabels: RDD[DenseVector[Double]],
      lambda: Double,
      x: DenseMatrix[Double],
      bOpt: Option[DenseVector[Double]]): Double = {

    val nTrain = trainingLabels.count
    val modelBroadcast = trainingLabels.context.broadcast(x)
    val bBroadcast = trainingLabels.context.broadcast(bOpt)

    val axb = trainingFeatures.mapPartitions(rows => {
      MatrixUtils.rowsToMatrixIter(rows).flatMap { rMat =>
        val mat = rMat * modelBroadcast.value
        val out = bBroadcast.value.map { b =>
          mat(*, ::) :+= b
          mat
        }.getOrElse(mat)

        MatrixUtils.matrixToRowArray(out).iterator
      }
    })

    val cost = axb.zip(trainingLabels).map { part =>
      val axb = part._1
      val labels = part._2
      val out = axb - labels
      math.pow(norm(out), 2)
    }.reduce(_ + _)

    if (lambda == 0) {
      cost/(2.0*nTrain.toDouble)
    } else {
      val wNorm = math.pow(norm(x.toDenseVector), 2)
      cost/(2.0*nTrain.toDouble) + lambda/2.0 * wNorm
    }
  }
} 
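A toy sketch of computeCost on a two-sample, two-feature, one-output problem (all numbers are placeholders):

import breeze.linalg.{DenseMatrix, DenseVector}
import org.apache.spark.{SparkConf, SparkContext}
import keystoneml.nodes.learning.LinearMapEstimator

object ComputeCostUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("compute-cost"))

    val features = sc.parallelize(Seq(DenseVector(1.0, 0.0), DenseVector(0.0, 1.0)))
    val labels   = sc.parallelize(Seq(DenseVector(1.0), DenseVector(2.0)))

    val x = new DenseMatrix(2, 1, Array(1.0, 2.0))  // 2 features -> 1 output
    val cost = LinearMapEstimator.computeCost(features, labels, 0.1, x, None)  // lambda = 0.1, no intercept
    println(s"cost = $cost")

    sc.stop()
  }
}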
Example 163
Source File: LocalLeastSquaresEstimator.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import breeze.linalg._
import breeze.stats._
import keystoneml.nodes.stats.StandardScalerModel
import org.apache.spark.rdd.RDD
import keystoneml.utils.MatrixUtils
import keystoneml.workflow.LabelEstimator

// Enclosing declaration restored for readability; the extractor dropped the original header and scaladoc.
object LocalLeastSquaresEstimator {
  def trainWithL2(
   trainingFeatures: RDD[DenseVector[Double]],
   trainingLabels: RDD[DenseVector[Double]],
   lambda: Double): LinearMapper[DenseVector[Double]] = {

    val A_parts = trainingFeatures.mapPartitions { x =>
      MatrixUtils.rowsToMatrixIter(x)
    }.collect()
    val b_parts = trainingLabels.mapPartitions { x =>
      MatrixUtils.rowsToMatrixIter(x)
    }.collect()

    val A_local = DenseMatrix.vertcat(A_parts:_*)
    val b_local = DenseMatrix.vertcat(b_parts:_*)

    val featuresMean = mean(A_local(::, *)).t
    val labelsMean = mean(b_local(::, *)).t

    val A_zm = A_local(*, ::) - featuresMean
    val b_zm = b_local(*, ::) - labelsMean

    val AAt = A_zm * A_zm.t
    val model = A_zm.t * ( (AAt + (DenseMatrix.eye[Double](AAt.rows) :* lambda)) \ b_zm )
    LinearMapper(model, Some(labelsMean), Some(new StandardScalerModel(featuresMean, None)))
  }

} 
Example 164
Source File: LinearDiscriminantAnalysis.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import breeze.linalg._
import breeze.stats._
import org.apache.spark.rdd.RDD
import keystoneml.utils.MatrixUtils
import keystoneml.workflow.LabelEstimator

// Class header restored for readability; the original declaration was dropped by the extractor,
// so the constructor parameter and the LabelEstimator type parameters below are assumptions
// inferred from how numDimensions and fit are used.
class LinearDiscriminantAnalysis(numDimensions: Int)
  extends LabelEstimator[DenseVector[Double], DenseVector[Double], Int] {
  override def fit(data: RDD[DenseVector[Double]], labels: RDD[Int]): LinearMapper[DenseVector[Double]] = {
    val sample = labels.zip(data).collect()
    computeLDA(sample)
  }

  def computeLDA(dataAndLabels: Array[(Int, DenseVector[Double])]): LinearMapper[DenseVector[Double]] = {
    val featuresByClass = dataAndLabels.groupBy(_._1).values.map(x => MatrixUtils.rowsToMatrix(x.map(_._2)))
    val meanByClass = featuresByClass.map(f => mean(f(::, *))) // each mean is a row vector, not col

    val sW = featuresByClass.zip(meanByClass).map(f => {
      val featuresMinusMean = f._1(*, ::) - f._2.t // row vector, not column
      featuresMinusMean.t * featuresMinusMean
    }).reduce(_+_)

    val numByClass = featuresByClass.map(_.rows : Double)
    val features = MatrixUtils.rowsToMatrix(dataAndLabels.map(_._2))
    val totalMean = mean(features(::, *)) // A row-vector, not a column-vector

    val sB = meanByClass.zip(numByClass).map {
      case (classMean, classNum) => {
        val m = classMean - totalMean
        (m.t * m) :* classNum
      }
    }.reduce(_+_)

    val eigen = eig((inv(sW): DenseMatrix[Double]) * sB)
    val eigenvectors = (0 until eigen.eigenvectors.cols).map(eigen.eigenvectors(::, _).toDenseMatrix.t)

    val topEigenvectors = eigenvectors.zip(eigen.eigenvalues.toArray).sortBy(x => -math.abs(x._2)).map(_._1).take(numDimensions)
    val W = DenseMatrix.horzcat(topEigenvectors:_*)

    new LinearMapper(W)
  }
} 
Example 165
Source File: LeastSquaresEstimator.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import breeze.linalg._
import keystoneml.nodes.util.{Densify, Sparsify}
import org.apache.spark.rdd.RDD
import keystoneml.pipelines.Logging
import keystoneml.workflow._

import scala.reflect._


class LeastSquaresEstimator[T <: Vector[Double]: ClassTag](
    lambda: Double = 0,
    numMachines: Option[Int] = None,
    cpuWeight: Double = 3.8e-4,
    memWeight: Double = 2.9e-1,
    networkWeight: Double = 1.32)
  extends OptimizableLabelEstimator[T, DenseVector[Double], DenseVector[Double]]
    with WeightedNode
    with Logging {

  val options: Seq[(CostModel, LabelEstimator[T, DenseVector[Double], DenseVector[Double]])] = Seq(
    {
      val solver = new DenseLBFGSwithL2[T](new LeastSquaresDenseGradient, regParam = lambda, numIterations = 20)
      (solver, solver)
    },
    {
      val solver = new SparseLBFGSwithL2(new LeastSquaresSparseGradient, regParam = lambda, numIterations = 20)
      (solver, TransformerLabelEstimatorChain(Sparsify(), solver))
    },
    {
      val solver = new BlockLeastSquaresEstimator(1000, 3, lambda = lambda)
      (solver, TransformerLabelEstimatorChain(Densify(), solver))
    },
    {
      val solver = new LinearMapEstimator(Some(lambda))
      (solver, TransformerLabelEstimatorChain(Densify(), solver))
    }
  )

  override val default: LabelEstimator[T, DenseVector[Double], DenseVector[Double]] with WeightedNode = {
    new DenseLBFGSwithL2[T](new LeastSquaresDenseGradient, regParam = lambda, numIterations = 20)
  }

  override def optimize(
      sample: RDD[T],
      sampleLabels: RDD[DenseVector[Double]],
      numPerPartition: Map[Int, Int])
  : LabelEstimator[T, DenseVector[Double], DenseVector[Double]] = {
    val n = numPerPartition.values.map(_.toLong).sum
    val d = sample.first().length
    val k = sampleLabels.first().length
    val sparsity = sample.map(x => x.activeSize.toDouble / x.length).sum() / sample.count()

    val realNumMachines = numMachines.getOrElse {
      if (sample.sparkContext.getExecutorStorageStatus.length == 1) {
        1
      } else {
        sample.sparkContext.getExecutorStorageStatus.length - 1
      }
    }

    logDebug(s"Optimizable Param n is $n")
    logDebug(s"Optimizable Param d is $d")
    logDebug(s"Optimizable Param k is $k")
    logDebug(s"Optimizable Param sparsity is $sparsity")
    logDebug(s"Optimizable Param numMachines is $realNumMachines")

    options.minBy(_._1.cost(n, d, k, sparsity, realNumMachines, cpuWeight, memWeight, networkWeight))._2
  }

  override val weight: Int = default.weight
} 
Example 166
Source File: SparseLinearMapper.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import breeze.linalg._
import org.apache.spark.rdd.RDD
import keystoneml.workflow.Transformer

// Declaration restored for readability; the original header and scaladoc were dropped by the
// extractor, so the constructor and extends clause below are assumptions inferred from usage,
// and other members of the original class (e.g. the single-vector apply) are not shown.
case class SparseLinearMapper(
    x: DenseMatrix[Double],
    bOpt: Option[DenseVector[Double]] = None)
  extends Transformer[SparseVector[Double], DenseVector[Double]] {
  override def apply(in: RDD[SparseVector[Double]]): RDD[DenseVector[Double]] = {
    val modelBroadcast = in.context.broadcast(x)
    val bBroadcast = in.context.broadcast(bOpt)
    in.map(row => {
      val out = modelBroadcast.value.t * row
      bBroadcast.value.foreach { b =>
        out :+= b
      }

      out
    })
  }
} 
Example 167
Source File: ApproximatePCA.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import breeze.linalg._
import breeze.numerics._
import breeze.stats._
import breeze.stats.distributions.{Gaussian, ThreadLocalRandomGenerator, RandBasis}
import com.github.fommil.netlib.LAPACK._
import edu.berkeley.cs.amplab.mlmatrix.util.QRUtils
import org.apache.commons.math3.random.MersenneTwister
import org.apache.spark.rdd.RDD
import org.netlib.util.intW
import keystoneml.pipelines.Logging
import keystoneml.workflow.Estimator

// Enclosing declaration restored for readability; the extractor dropped the original header,
// scaladoc and the rest of the estimator, leaving only the randomized range-finder helper below.
object ApproximatePCA {
  def approximateQ(A: DenseMatrix[Double], l: Int, q: Int, seed: Int = 0): DenseMatrix[Double] = {
    val d = A.cols

    val randBasis: RandBasis = new RandBasis(new ThreadLocalRandomGenerator(new MersenneTwister(seed)))
    val omega = DenseMatrix.rand(d, l, Gaussian(0,1)(randBasis)) //cpu: d*l, mem: d*l
    val y0 = A*omega //cpu: n*d*l, mem: n*l

    var Q = QRUtils.qrQR(y0)._1 //cpu: n*l**2

    for (i <- 1 to q) {
      val YHat = Q.t * A //cpu: l*n*d, mem: l*d
      val Qh = QRUtils.qrQR(YHat.t)._1 //cpu: d*l^2, mem: d*l

      val Yj = A * Qh //cpu: n*d*l, mem: n*l
      Q = QRUtils.qrQR(Yj)._1 //cpu:  n*l^2, mem: n*l
    }

    Q
  }
} 
Example 168
Source File: DistributedPCA.scala    From keystone   with Apache License 2.0 5 votes vote down vote up
package keystoneml.nodes.learning

import breeze.linalg._
import breeze.numerics._
import breeze.stats._
import com.github.fommil.netlib.LAPACK.{getInstance => lapack}
import org.apache.spark.rdd.RDD
import org.netlib.util.intW
import keystoneml.pipelines._
import keystoneml.utils.MatrixUtils
import keystoneml.workflow.{Transformer, Estimator}

import edu.berkeley.cs.amplab.mlmatrix.{RowPartition, NormalEquations, RowPartitionedMatrix, TSQR}


  def fit(samples: RDD[DenseVector[Float]]): PCATransformer = {
    new PCATransformer(computePCA(samples, dims))
  }

  def computePCA(dataMat: RDD[DenseVector[Float]], dims: Int): DenseMatrix[Float] = {

    val mat = new RowPartitionedMatrix(dataMat.mapPartitions { part =>
      val dblIter = part.map(x => convert(x, Double))
      MatrixUtils.rowsToMatrixIter(dblIter).map(RowPartition(_))
    })
    val means = DenseVector(mat.colSums():_*) :/ mat.numRows().toDouble

    val meansBC = dataMat.context.broadcast(means)
    val zeroMeanMat = new RowPartitionedMatrix(mat.rdd.map { part =>
      RowPartition(part.mat(*, ::) - meansBC.value)
    })

    val rPart = new TSQR().qrR(zeroMeanMat)

    val svd.SVD(u, s, pcaT) = svd(rPart)

    val pca = convert(pcaT.t, Float)

    val matlabConventionPCA = PCAEstimator.enforceMatlabPCASignConvention(pca)

    // Return a subset of the columns.
    matlabConventionPCA(::, 0 until dims)
  }

  override def cost(
    n: Long,
    d: Int,
    k: Int,
    sparsity: Double,
    numMachines: Int,
    cpuWeight: Double,
    memWeight: Double,
    networkWeight: Double): Double = {
    val log2NumMachines = math.log(numMachines.toDouble) / math.log(2.0)
    val flops = n.toDouble * d * d / numMachines + d.toDouble * d * d * log2NumMachines
    val bytesScanned = n.toDouble * d
    val network = d.toDouble * d * log2NumMachines
    math.max(cpuWeight * flops, memWeight * bytesScanned) + networkWeight * network
  }
} 
Example 169
Source File: WrapperTrait.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package SparkER.Wrappers

import SparkER.DataStructures.{KeyValue, MatchingEntities, Profile}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import scala.collection.mutable.MutableList

// Trait declaration restored for readability; the extractor dropped the original header, scaladoc
// and any other members, leaving only the shared row-conversion helper below.
trait WrapperTrait {
  def rowToAttributes(columnNames: Array[String], row: Row, explodeInnerFields: Boolean = false, innerSeparator: String = ","): MutableList[KeyValue] = {
    val attributes: MutableList[KeyValue] = new MutableList()
    for (i <- 0 to row.size - 1) {
      try {
        val value = row(i)
        val attributeKey = columnNames(i)

        if (value != null) {
          value match {
            case listOfAttributes: Iterable[Any] =>
              listOfAttributes map {
                attributeValue =>
                  attributes += KeyValue(attributeKey, attributeValue.toString)
              }
            case stringAttribute: String =>
              if (explodeInnerFields) {
                stringAttribute.split(innerSeparator) map {
                  attributeValue =>
                    attributes += KeyValue(attributeKey, attributeValue)
                }
              }
              else {
                attributes += KeyValue(attributeKey, stringAttribute)
              }
            case singleAttribute =>
              attributes += KeyValue(attributeKey, singleAttribute.toString)
          }
        }
      }
      catch {
        case e: Throwable => println(e)
      }
    }
    attributes
  }
} 
Example 170
Source File: SerializedProfilesLoader.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package SparkER.Wrappers

import java.io.{IOException, _}

import SparkER.DataStructures.Profile
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// Enclosing declaration restored for readability; the extractor dropped the original object header,
// scaladoc and any other members, leaving only the generic deserialization helper below.
object SerializedProfilesLoader {
  def loadSerializedObject(fileName: String): Any = {
    var `object`: Any = null
    try {
      val file: InputStream = new FileInputStream(fileName)
      val buffer: InputStream = new BufferedInputStream(file)
      val input: ObjectInput = new ObjectInputStream(buffer)
      try {
        `object` = input.readObject
      } finally {
        input.close
      }
    }
    catch {
      case cnfEx: ClassNotFoundException => {
        System.err.println(fileName)
        cnfEx.printStackTrace
      }
      case ioex: IOException => {
        System.err.println(fileName)
        ioex.printStackTrace
      }
    }
    return `object`
  }
} 
Example 171
Source File: Converters.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package SparkER.Utilities

import SparkER.BlockBuildingMethods.TokenBlocking
import org.apache.spark.rdd.RDD
import SparkER.DataStructures._
import org.apache.spark.partial.PartialResult

// Enclosing declaration restored for readability; the extractor dropped the original object header
// and scaladoc.
object Converters {
  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks], separatorIDs: Array[Long] = Array.emptyLongArray): RDD[BlockAbstract] = {

    val blockIDProfileID = profilesBlocks flatMap {
      profileWithBlocks =>
        val profileID = profileWithBlocks.profileID
        profileWithBlocks.blocks map {
          BlockWithSize =>
            (BlockWithSize.blockID, profileID)
        }
    }

    val blocks = blockIDProfileID.groupByKey().map {
      block =>
        val blockID = block._1
        val profilesID = block._2.toSet

        if (separatorIDs.isEmpty) {
          BlockDirty(blockID, Array(profilesID))
        }
        else {
          BlockClean(blockID, TokenBlocking.separateProfiles(profilesID, separatorIDs))
        }
    }

    blocks.filter(_.getComparisonSize() > 0).map(x => x)

  }
} 
Example 172
Source File: BlockFiltering.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package SparkER.BlockRefinementMethods

import SparkER.DataStructures.{BlockWithComparisonSize, ProfileBlocks}
import SparkER.Utilities.BoundedPriorityQueue
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD

// Object header restored for readability; the extractor dropped the original declaration, scaladoc
// and the basic blockFiltering method, leaving stray closing braces that are removed here.
object BlockFiltering {
  def blockFilteringAdvanced(profilesWithBlocks: RDD[ProfileBlocks], r: Double, minCardinality: Int = 1): RDD[ProfileBlocks] = {
    profilesWithBlocks map {
      profileWithBlocks =>
        val blocksSortedByComparisons = profileWithBlocks.blocks.toList.sortWith(_.comparisons < _.comparisons)
        val blocksToKeep = Math.round(blocksSortedByComparisons.size * r).toInt
        val threshold = blocksSortedByComparisons(blocksToKeep-1).comparisons
        ProfileBlocks(profileWithBlocks.profileID, blocksSortedByComparisons.filter(_.comparisons <= threshold).toSet)
    }
  }
} 
Example 173
Source File: SerializedObjectLoader.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package Wrappers

import DataStructures.{KeyValue, MatchingEntities, Profile}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD


object SerializedObjectLoader extends WrapperTrait {

  def loadProfiles(filePath: String, startIDFrom: Long = 0, realFieldID: String = "", sourceId: Int = 0): RDD[Profile] = {
    @transient lazy val log = org.apache.log4j.LogManager.getRootLogger

    log.info("SPARKER - Start to loading entities")
    val entities = DataLoaders.SerializedLoader.loadSerializedDataset(filePath)
    log.info("SPARKER - Loading ended")

    log.info("SPARKER - Start to generate profiles")
    val profiles: Array[Profile] = new Array(entities.size())

    for (i <- 0 until entities.size()) {
      val profile = Profile(id = i + startIDFrom, originalID = i + "", sourceId = sourceId)

      val entity = entities.get(i)
      val it = entity.getAttributes.iterator()
      while (it.hasNext) {
        val attribute = it.next()
        profile.addAttribute(KeyValue(attribute.getName, attribute.getValue))
      }

      profiles.update(i, profile)
    }
    log.info("SPARKER - Ended to loading profiles")

    log.info("SPARKER - Start to parallelize profiles")
    val sc = SparkContext.getOrCreate()

    sc.union(profiles.grouped(10000).map(sc.parallelize(_)).toArray)
  }

  def loadGroundtruth(filePath: String): RDD[MatchingEntities] = {

    val groundtruth = DataLoaders.SerializedLoader.loadSerializedGroundtruth(filePath)

    val matchingEntitites: Array[MatchingEntities] = new Array(groundtruth.size())

    var i = 0

    val it = groundtruth.iterator
    while (it.hasNext) {
      val matching = it.next()
      matchingEntitites.update(i, MatchingEntities(matching.getEntityId1.toString, matching.getEntityId2.toString))
      i += 1
    }

    val sc = SparkContext.getOrCreate()
    sc.union(matchingEntitites.grouped(10000).map(sc.parallelize(_)).toArray)
  }
} 
Example 174
Source File: WrapperTrait.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package Wrappers

import DataStructures.{KeyValue, MatchingEntities, Profile}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import scala.collection.mutable.MutableList

// Trait declaration restored for readability; the extractor dropped the original header, scaladoc
// and any other members, leaving only the shared row-conversion helper below.
trait WrapperTrait {
  def rowToAttributes(columnNames: Array[String], row: Row, explodeInnerFields: Boolean = false, innerSeparator: String = ","): MutableList[KeyValue] = {
    val attributes: MutableList[KeyValue] = new MutableList()
    for (i <- 0 to row.size - 1) {
      try {
        val value = row(i)
        val attributeKey = columnNames(i)

        if (value != null) {
          value match {
            case listOfAttributes: Iterable[Any] =>
              listOfAttributes map {
                attributeValue =>
                  attributes += KeyValue(attributeKey, attributeValue.toString)
              }
            case stringAttribute: String =>
              if (explodeInnerFields) {
                stringAttribute.split(innerSeparator) map {
                  attributeValue =>
                    attributes += KeyValue(attributeKey, attributeValue)
                }
              }
              else {
                attributes += KeyValue(attributeKey, stringAttribute)
              }
            case singleAttribute =>
              attributes += KeyValue(attributeKey, singleAttribute.toString)
          }
        }
      }
      catch {
        case e: Throwable => println(e)
      }
    }
    attributes
  }
} 
Example 175
Source File: SerializedProfilesLoader.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package Wrappers

import java.io.{IOException, _}

import DataStructures.Profile
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// Enclosing declaration restored for readability; the extractor dropped the original object header,
// scaladoc and any other members, leaving only the generic deserialization helper below.
object SerializedProfilesLoader {
  def loadSerializedObject(fileName: String): Any = {
    var `object`: Any = null
    try {
      val file: InputStream = new FileInputStream(fileName)
      val buffer: InputStream = new BufferedInputStream(file)
      val input: ObjectInput = new ObjectInputStream(buffer)
      try {
        `object` = input.readObject
      } finally {
        input.close
      }
    }
    catch {
      case cnfEx: ClassNotFoundException => {
        System.err.println(fileName)
        cnfEx.printStackTrace
      }
      case ioex: IOException => {
        System.err.println(fileName)
        ioex.printStackTrace
      }
    }
    return `object`
  }
} 
Example 176
Source File: Converters.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package Utilities

import BlockBuildingMethods.TokenBlocking
import org.apache.spark.rdd.RDD
import DataStructures._
import org.apache.spark.partial.PartialResult

// Enclosing declaration restored for readability; the extractor dropped the original object header
// and scaladoc.
object Converters {
  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks], separatorIDs: Array[Long] = Array.emptyLongArray): RDD[BlockAbstract] = {

    val blockIDProfileID = profilesBlocks flatMap {
      profileWithBlocks =>
        val profileID = profileWithBlocks.profileID
        profileWithBlocks.blocks map {
          BlockWithSize =>
            (BlockWithSize.blockID, profileID)
        }
    }

    val blocks = blockIDProfileID.groupByKey().map {
      block =>
        val blockID = block._1
        val profilesID = block._2.toSet

        if (separatorIDs.isEmpty) {
          BlockDirty(blockID, Array(profilesID))
        }
        else {
          BlockClean(blockID, TokenBlocking.separateProfiles(profilesID, separatorIDs))
        }
    }

    blocks.filter(_.getComparisonSize() >= 1).map(x => x)

  }
} 
Example 177
Source File: BlockFiltering.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package BlockRefinementMethods

import DataStructures.{BlockWithComparisonSize, ProfileBlocks}
import Utilities.BoundedPriorityQueue
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD

// Object header restored for readability; the extractor dropped the original declaration, scaladoc
// and the basic blockFiltering method, leaving stray closing braces that are removed here.
object BlockFiltering {
  def blockFilteringAdvanced(profilesWithBlocks: RDD[ProfileBlocks], r: Double, minCardinality: Int = 1): RDD[ProfileBlocks] = {
    profilesWithBlocks map {
      profileWithBlocks =>
        val blocksSortedByComparisons = profileWithBlocks.blocks.toList.sortWith(_.comparisons < _.comparisons)
        val blocksToKeep = Math.round(blocksSortedByComparisons.size * r).toInt
        val threshold = blocksSortedByComparisons(blocksToKeep-1).comparisons
        ProfileBlocks(profileWithBlocks.profileID, blocksSortedByComparisons.filter(_.comparisons <= threshold).toSet)
    }
  }
} 
Example 178
Source File: SerializedObjectLoader.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package Wrappers

import DataStructures.{KeyValue, MatchingEntities, Profile}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD


object SerializedObjectLoader extends  WrapperTrait{

  def loadProfiles(filePath : String, startIDFrom : Long = 0, realFieldID : String = "") : RDD[Profile] = {
    @transient lazy val log = org.apache.log4j.LogManager.getRootLogger

    log.info("SPARKER - Start to loading entities")
    val entities = DataLoaders.SerializedLoader.loadSerializedDataset(filePath)
    log.info("SPARKER - Loading ended")

    log.info("SPARKER - Start to generate profiles")
    val profiles : Array[Profile] = new Array(entities.size())

    for(i <- 0 to entities.size()-1){
      val profile = Profile(id = i+startIDFrom, originalID = i+"")

      val entity = entities.get(i)
      val it = entity.getAttributes.iterator()
      while(it.hasNext){
        val attribute = it.next()
        profile.addAttribute(KeyValue(attribute.getName, attribute.getValue))
      }

      profiles.update(i, profile)
    }
    log.info("SPARKER - Ended to loading profiles")

    log.info("SPARKER - Start to parallelize profiles")
    val sc = SparkContext.getOrCreate()

    sc.union(profiles.grouped(10000).map(sc.parallelize(_)).toArray)
  }

  def loadGroundtruth(filePath : String) : RDD[MatchingEntities] = {

    val groundtruth = DataLoaders.SerializedLoader.loadSerializedGroundtruth(filePath)

    val matchingEntitites : Array[MatchingEntities] = new Array(groundtruth.size())

    var i = 0

    val it = groundtruth.iterator
    while(it.hasNext){
      val matching = it.next()
      matchingEntitites.update(i, MatchingEntities(matching.getEntityId1.toString, matching.getEntityId2.toString))
      i+=1
    }

    val sc = SparkContext.getOrCreate()
    sc.union(matchingEntitites.grouped(10000).map(sc.parallelize(_)).toArray)
  }
} 
Example 179
Source File: WrapperTrait.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package Wrappers

import DataStructures.{KeyValue, MatchingEntities, Profile}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import scala.collection.mutable.MutableList

// Trait declaration restored for readability; the extractor dropped the original header, scaladoc
// and any other members, leaving only the shared row-conversion helper below.
trait WrapperTrait {
  def rowToAttributes(columnNames : Array[String], row : Row, explodeInnerFields:Boolean = false, innerSeparator : String = ",") : MutableList[KeyValue] = {
    val attributes: MutableList[KeyValue] = new MutableList()
    for(i <- 0 to row.size-1){
      try{
        val value = row(i)
        val attributeKey = columnNames(i)

        if(value != null){
          value match {
            case listOfAttributes : Iterable[Any] =>
              listOfAttributes map {
                attributeValue =>
                  attributes += KeyValue(attributeKey, attributeValue.toString)
              }
            case stringAttribute : String =>
              if(explodeInnerFields){
                stringAttribute.split(innerSeparator) map {
                  attributeValue =>
                    attributes += KeyValue(attributeKey, attributeValue)
                }
              }
              else {
                attributes += KeyValue(attributeKey, stringAttribute)
              }
            case singleAttribute =>
              attributes += KeyValue(attributeKey, singleAttribute.toString)
          }
        }
      }
      catch{
        case e : Throwable => println(e)
      }
    }
    attributes
  }
} 
Example 180
Source File: SerializedProfilesLoader.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package Wrappers

import java.io.{IOException, _}

import DataStructures.Profile
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

// Enclosing declaration restored for readability; the extractor dropped the original object header,
// scaladoc and any other members, leaving only the generic deserialization helper below.
object SerializedProfilesLoader {
  def loadSerializedObject(fileName: String): Any = {
    var `object`: Any = null
    try {
      val file: InputStream = new FileInputStream(fileName)
      val buffer: InputStream = new BufferedInputStream(file)
      val input: ObjectInput = new ObjectInputStream(buffer)
      try {
        `object` = input.readObject
      } finally {
        input.close
      }
    }
    catch {
      case cnfEx: ClassNotFoundException => {
        System.err.println(fileName)
        cnfEx.printStackTrace
      }
      case ioex: IOException => {
        System.err.println(fileName)
        ioex.printStackTrace
      }
    }
    return `object`
  }
} 
Example 181
Source File: Converters.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package Utilities

import org.apache.spark.rdd.RDD
import DataStructures._
import org.apache.spark.partial.PartialResult

// Enclosing declaration restored for readability; the extractor dropped the original object header
// and scaladoc.
object Converters {
  def profilesBlockToBlocks(profilesBlocks : RDD[ProfileBlocks], separatorID : Long = -1) : RDD[BlockAbstract] = {

    val blockIDProfileID = profilesBlocks flatMap {
      profileWithBlocks =>
        val profileID = profileWithBlocks.profileID
        profileWithBlocks.blocks map {
          BlockWithSize =>
            (BlockWithSize.blockID, profileID)
        }
    }

    val blocks = blockIDProfileID.groupByKey().map {
      block =>
        val blockID = block._1
        val profilesID = block._2.toSet

        if (separatorID < 0){
          BlockDirty(blockID, (profilesID, Set.empty))
        }
        else{
          BlockClean(blockID, (profilesID.partition(_ <= separatorID)))
        }
    }

    blocks.filter(_.getComparisonSize() >=1).map(x => x)

  }
 } 
Example 182
Source File: BlockFiltering.scala    From sparker   with GNU General Public License v3.0 5 votes vote down vote up
package BlockRefinementMethods

import DataStructures.{BlockWithComparisonSize, ProfileBlocks}
import Utilities.BoundedPriorityQueue
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD

// Object header restored for readability; the extractor dropped the original declaration, scaladoc
// and the basic blockFiltering method, leaving stray closing braces that are removed here.
object BlockFiltering {
  def blockFilteringAdvanced(profilesWithBlocks: RDD[ProfileBlocks], r: Double, minCardinality: Int = 1): RDD[ProfileBlocks] = {
    profilesWithBlocks map {
      profileWithBlocks =>
        val blocksSortedByComparisons = profileWithBlocks.blocks.toList.sortWith(_.comparisons < _.comparisons)
        val blocksToKeep = Math.round(blocksSortedByComparisons.size * r).toInt
        val threshold = blocksSortedByComparisons(blocksToKeep-1).comparisons
        ProfileBlocks(profileWithBlocks.profileID, blocksSortedByComparisons.filter(_.comparisons <= threshold).toSet)
    }
  }
} 
Example 183
Source File: CNNModel.scala    From SparkMLlibDeepLearn   with Apache License 2.0 5 votes vote down vote up
package CNN

import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV
}
import org.apache.spark.rdd.RDD

// Wrapper restored so the snippet is self-contained; in the original file this loss computation
// belongs to the CNNModel class, whose declaration and other members were dropped by the extractor.
object CNNModel {
  def Loss(predict: RDD[PredictCNNLabel]): Double = {
    val predict1 = predict.map(f => f.error)
    // error and loss
    val loss1 = predict1
    val (loss2, counte) = loss1.treeAggregate((0.0, 0L))(
      seqOp = (c, v) => {
        // c: (e, count), v: (m)
        val e1 = c._1
        val e2 = (v :* v).sum
        val esum = e1 + e2
        (esum, c._2 + 1)
      },
      combOp = (c1, c2) => {
        // c: (e, count)
        val e1 = c1._1
        val e2 = c2._1
        val esum = e1 + e2
        (esum, c1._2 + c2._2)
      })
    val Loss = (loss2 / counte.toDouble) * 0.5
    Loss
  }

} 
Example 184
Source File: NeuralNetModel.scala    From SparkMLlibDeepLearn   with Apache License 2.0 5 votes vote down vote up
package NN

import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV
}
import org.apache.spark.rdd.RDD

// Wrapper restored so the snippet is self-contained; in the original file this loss computation
// belongs to the NeuralNetModel class, whose declaration and other members were dropped by the extractor.
object NeuralNetModel {
  def Loss(predict: RDD[PredictNNLabel]): Double = {
    val predict1 = predict.map(f => f.error)
    // error and loss
    val loss1 = predict1
    val (loss2, counte) = loss1.treeAggregate((0.0, 0L))(
      seqOp = (c, v) => {
        // c: (e, count), v: (m)
        val e1 = c._1
        val e2 = (v :* v).sum
        val esum = e1 + e2
        (esum, c._2 + 1)
      },
      combOp = (c1, c2) => {
        // c: (e, count)
        val e1 = c1._1
        val e2 = c2._1
        val esum = e1 + e2
        (esum, c1._2 + c2._2)
      })
    val Loss = loss2 / counte.toDouble
    Loss * 0.5
  }

} 
Example 185
Source File: DBNModel.scala    From SparkMLlibDeepLearn   with Apache License 2.0 5 votes vote down vote up
package DBN

import breeze.linalg.{
  Matrix => BM,
  CSCMatrix => BSM,
  DenseMatrix => BDM,
  Vector => BV,
  DenseVector => BDV,
  SparseVector => BSV
}
import org.apache.spark.rdd.RDD
import scala.collection.mutable.ArrayBuffer

class DBNModel(
  val config: DBNConfig,
  val dbn_W: Array[BDM[Double]],
  val dbn_b: Array[BDM[Double]],
  val dbn_c: Array[BDM[Double]]) extends Serializable {

  
  def dbnunfoldtonn(outputsize: Int): (Array[Int], Int, Array[BDM[Double]]) = {
    //1 convert size and layer: append the output layer when outputsize > 0
    val size = if (outputsize > 0) {
      val size1 = config.size
      val size2 = ArrayBuffer[Int]()
      size2 ++= size1
      size2 += outputsize
      size2.toArray
    } else config.size
    val layer = if (outputsize > 0) config.layer + 1 else config.layer
    
    //2 convert dbn_W: prepend the bias dbn_c as the first column of each weight matrix
    var initW = ArrayBuffer[BDM[Double]]()
    for (i <- 0 to dbn_W.length - 1) {
      initW += BDM.horzcat(dbn_c(i), dbn_W(i))
    }
    (size, layer, initW.toArray)
  }

} 
Example 186
Source File: StringKeyRDD.scala    From cuesheet   with Apache License 2.0 5 votes vote down vote up
package com.kakao.cuesheet.convert

import java.nio.charset.StandardCharsets.UTF_8

import com.kakao.mango.concurrent._
import com.kakao.mango.couchbase.Couchbase
import com.kakao.mango.hbase.HBase
import com.kakao.mango.json._
import com.kakao.mango.util.Retry
import org.apache.spark.rdd.RDD

import scala.concurrent.duration._


class StringKeyRDD[T](rdd: RDD[(String, T)]) extends SaveToES(rdd) {

  def saveToCouchbase(nodes: Seq[String], bucket: String, expiry: Int = 0, maxRate: Double = 1e7, password: String = null): Unit = {
    // rate per executor
    val rate = maxRate / rdd.sparkContext.getExecutorMemoryStatus.size

    rdd.foreachPartition { partition =>
      // BackPressureException may happen, so retry 10 times
      // if that fails, Spark task scheduler may retry again.
      val cluster = Couchbase(nodes: _*)
      val client = cluster.bucket(bucket, password)

      val converted = partition.map {
        case (key, value: Array[Byte]) => (key, new String(value, UTF_8))
        case (key, value: String) => (key, value)
        case (key, value) => (key, toJson(value))
      }

      for (group <- converted.grouped(1000)) {
        Retry(10, 100.millis) {
          client.putAll(group, rate, expiry).sync()
        }
      }

      cluster.disconnect()
    }
  }

  def saveToHBase(quorum: String, table: String, family: String, qualifier: String, maxRate: Double = 1e7): Unit = {
    // rate per executor
    val rate = maxRate / rdd.sparkContext.getExecutorMemoryStatus.size

    rdd.foreachPartition { partition =>
      val hbase = HBase(quorum)
      val column = hbase.column(table, family, qualifier)

      val converted = partition.map {
        case (key, value: Array[Byte]) => (key.getBytes(UTF_8), value)
        case (key, value: String) => (key.getBytes(UTF_8), value.getBytes(UTF_8))
        case (key, value) => (key.getBytes(UTF_8), serialize(value))
      }

      for (group <- converted.grouped(1000)) {
        Retry(10, 100.millis) {
          column.putAllBytes(group, rate).sync()
        }
      }
    }
  }
} 
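A minimal sketch of wrapping a keyed RDD and writing it to HBase; the ZooKeeper quorum, table and column names are placeholders, and the HBase/Couchbase helpers come from the rest of the cuesheet project:

import org.apache.spark.{SparkConf, SparkContext}
import com.kakao.cuesheet.convert.StringKeyRDD

object StringKeyRDDUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("save-to-hbase"))

    val rdd = sc.parallelize(Seq("row1" -> """{"value": 1}""", "row2" -> """{"value": 2}"""))

    // Each (key, value) pair becomes one cell; puts are grouped in batches of 1000 with retries
    new StringKeyRDD(rdd).saveToHBase(
      quorum = "zk1,zk2,zk3",
      table = "demo_table",
      family = "cf",
      qualifier = "json")

    sc.stop()
  }
}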
Example 187
Source File: HBaseReaders.scala    From cuesheet   with Apache License 2.0 5 votes vote down vote up
package com.kakao.cuesheet.convert

import com.kakao.mango.util.Conversions._
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.collection.JavaConversions._

trait HBaseReaders {
  val sc: SparkContext

  
  def hbaseTable(quorum: String, table: String): RDD[(String, ((String, String), (Long, String)))] = {
    hbaseTableBinary(quorum, table).map {
      case (rowkey, ((family, qualifier), (timestamp, value))) =>
        (rowkey.string, ((family.string, qualifier.string), (timestamp, value.string)))
    }
  }

  def hbaseColumnBinary(quorum: String, table: String, family: Array[Byte], qualifier: Array[Byte]): RDD[(Array[Byte], (Long, Array[Byte]))] = {
    hbaseTableBinary(quorum, table).collect {
      case (rowkey, ((f, q), cell)) if family.sameElements(f) && qualifier.sameElements(q) => (rowkey, cell)
    }
  }

  def hbaseColumn(quorum: String, table: String, family: String, qualifier: String): RDD[(String, (Long, String))] = {
    hbaseTable(quorum, table).collect {
      case (rowkey, ((f, q), cell)) if family == f && qualifier == q => (rowkey, cell)
    }
  }
} 
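A sketch of mixing the trait into an application object and reading a single column; it assumes hbaseTableBinary (elided from the snippet above by the extractor) is available from the rest of the file, and all names are placeholders:

import org.apache.spark.{SparkConf, SparkContext}
import com.kakao.cuesheet.convert.HBaseReaders

object HBaseReadJob extends HBaseReaders {
  val sc: SparkContext = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("hbase-read"))

  def main(args: Array[String]): Unit = {
    // (rowkey, (timestamp, value)) pairs for one column
    val cells = hbaseColumn("zk1,zk2,zk3", "demo_table", "cf", "json")
    cells.take(10).foreach(println)
    sc.stop()
  }
}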
Example 188
Source File: JoinableRDD.scala    From cuesheet   with Apache License 2.0 5 votes vote down vote up
package com.kakao.cuesheet.convert

import org.apache.spark.HashPartitioner
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

class JoinableRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) {

  def selfJoin(numPartitions: Int = rdd.partitions.length): RDD[(K, (V, V))] = fastJoin(rdd, numPartitions)

  def fastJoin[W](other: RDD[(K, W)], numPartitions: Int = rdd.partitions.length): RDD[(K, (V, W))] = {
    val partitioner = new HashPartitioner(numPartitions)
    val grouped = rdd cogroup other

    val left = grouped.flatMap{
      case (k, (vs, ws)) => vs.zipWithIndex.map {
        case (v, idx) => ((k, idx), v)
      }
    }.partitionBy(partitioner)

    val right = grouped.flatMap {
      case (k, (vs, ws)) => ws.map { w => ((k, w.hashCode()), (w, vs.size)) }
    }.partitionBy(partitioner).flatMap {
      case ((k, r), (w, size)) => (0 until size).map(i => ((k, w), i))
    }.map {
      case ((k, w), idx) => ((k, idx), w)
    }

    (left join right).map {
      case ((k, idx), (v, w)) => (k, (v, w))
    }
  }

} 
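A small sketch exercising fastJoin and selfJoin (the data is arbitrary):

import org.apache.spark.{SparkConf, SparkContext}
import com.kakao.cuesheet.convert.JoinableRDD

object JoinableRDDUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("fast-join"))

    val left  = sc.parallelize(Seq(1 -> "a", 1 -> "b", 2 -> "c"))
    val right = sc.parallelize(Seq(1 -> "x", 2 -> "y"))

    // fastJoin spreads skewed keys by indexing the left values and replicating the right values per index
    val joined = new JoinableRDD(left).fastJoin(right)
    joined.collect().foreach(println)  // (1,(a,x)), (1,(b,x)), (2,(c,y)) in some order

    val self = new JoinableRDD(right).selfJoin()
    self.collect().foreach(println)

    sc.stop()
  }
}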
Example 189
Source File: SavingStream.scala    From cuesheet   with Apache License 2.0 5 votes vote down vote up
package com.kakao.cuesheet.convert

import com.kakao.mango.concurrent.{NamedExecutors, RichExecutorService}
import com.kakao.mango.text.ThreadSafeDateFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

import java.util.concurrent.{Future => JFuture}
import scala.reflect.runtime.universe.TypeTag

object SavingStream {
  val yyyyMMdd = ThreadSafeDateFormat("yyyy-MM-dd")
  val hh = ThreadSafeDateFormat("HH")
  val mm = ThreadSafeDateFormat("mm")
  val m0 = (ms: Long) => mm(ms).charAt(0) + "0"
}

// Class header restored for readability (assumed shape); the extractor dropped the original
// abstract class declaration and its scaladoc.
abstract class SavingStream[T](stream: DStream[T])(implicit ctx: HiveContext, es: ExecutorSupplier) {
  import SavingStream._

  def toDF(rdd: RDD[T]): DataFrame

  @transient var executor: RichExecutorService = _

  def ex: RichExecutorService = {
    if (executor == null) {
      this.synchronized {
        if (executor == null) {
          executor = new RichExecutorService(es.get())
        }
      }
    }
    executor
  }

  def saveAsPartitionedTable(table: String, path: String, format: String = "orc")(toPartition: Time => Seq[(String, String)]): Unit = {
    stream.foreachRDD { (rdd, time) =>
      ex.submit {
        toDF(rdd).appendToExternalTablePartition(table, path, format, toPartition(time): _*)
      }
    }
  }

  def saveAsDailyPartitionedTable(table: String, path: String, dateColumn: String = "date", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms))
    }
  }

  def saveAsHourlyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms))
    }
  }

  def saveAsTenMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> m0(ms))
    }
  }

  def saveAsMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> mm(ms))
    }
  }

}

class ProductStream[T <: Product : TypeTag](stream: DStream[T])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[T](stream) {
  override def toDF(rdd: RDD[T]) = ctx.createDataFrame(rdd)
}

class JsonStream(stream: DStream[String])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[String](stream) {
  override def toDF(rdd: RDD[String]) = ctx.read.json(rdd)
}

class MapStream[T](stream: DStream[Map[String, T]])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[Map[String, T]](stream) {
  import com.kakao.mango.json._

  override def toDF(rdd: RDD[Map[String, T]]) = ctx.read.json(rdd.map(toJson))
}

class RowStream(stream: DStream[Row])(implicit ctx: HiveContext, es: ExecutorSupplier, schema: StructType) extends SavingStream[Row](stream) {
  override def toDF(rdd: RDD[Row]): DataFrame = ctx.createDataFrame(rdd, schema)
} 
Example 190
Source File: MemsqlRDD.scala    From memsql-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark

import java.sql.{Connection, PreparedStatement, ResultSet}

import com.memsql.spark.SQLGen.VariableList
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils}
import org.apache.spark.sql.types._
import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}

case class MemsqlRDD(query: String,
                     variables: VariableList,
                     options: MemsqlOptions,
                     schema: StructType,
                     expectedOutput: Seq[Attribute],
                     @transient val sc: SparkContext)
    extends RDD[Row](sc, Nil) {

  override protected def getPartitions: Array[Partition] =
    MemsqlQueryHelpers.GetPartitions(options, query, variables)

  override def compute(rawPartition: Partition, context: TaskContext): Iterator[Row] = {
    var closed                     = false
    var rs: ResultSet              = null
    var stmt: PreparedStatement    = null
    var conn: Connection           = null
    var partition: MemsqlPartition = rawPartition.asInstanceOf[MemsqlPartition]

    def tryClose(name: String, what: AutoCloseable): Unit = {
      try {
        if (what != null) { what.close() }
      } catch {
        case e: Exception => logWarning(s"Exception closing $name", e)
      }
    }

    def close(): Unit = {
      if (closed) { return }
      tryClose("resultset", rs)
      tryClose("statement", stmt)
      tryClose("connection", conn)
      closed = true
    }

    context.addTaskCompletionListener { context =>
      close()
    }

    conn = JdbcUtils.createConnectionFactory(partition.connectionInfo)()
    stmt = conn.prepareStatement(partition.query)
    JdbcHelpers.fillStatement(stmt, partition.variables)
    rs = stmt.executeQuery()

    var rowsIter = JdbcUtils.resultSetToRows(rs, schema)

    if (expectedOutput.nonEmpty) {
      val schemaDatatypes   = schema.map(_.dataType)
      val expectedDatatypes = expectedOutput.map(_.dataType)

      if (schemaDatatypes != expectedDatatypes) {
        val columnEncoders = schemaDatatypes.zip(expectedDatatypes).zipWithIndex.map {
          case ((_: StringType, _: NullType), _)     => ((_: Row) => null)
          case ((_: ShortType, _: BooleanType), i)   => ((r: Row) => r.getShort(i) != 0)
          case ((_: IntegerType, _: BooleanType), i) => ((r: Row) => r.getInt(i) != 0)
          case ((_: LongType, _: BooleanType), i)    => ((r: Row) => r.getLong(i) != 0)

          case ((l, r), i) => {
            options.assert(l == r, s"MemsqlRDD: unable to encode ${l} into ${r}")
            ((r: Row) => r.get(i))
          }
        }

        rowsIter = rowsIter
          .map(row => Row.fromSeq(columnEncoders.map(_(row))))
      }
    }

    CompletionIterator[Row, Iterator[Row]](new InterruptibleIterator[Row](context, rowsIter), close)
  }

} 
Example 191
Source File: KMeanTest.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
import org.apache.log4j.{Level, Logger}
import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector}

import scala.util.Random


//spark/bin/spark-submit --master spark://10.100.34.48:7077 --class  ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9

//guale spark/bin/spark-submit --master spark://10.100.34.48:7077 --class  ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15

object ScalableKMeanTest {

  def main(args: Array[String]) {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.WARN)

    val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}")
    val sc = new SparkContext(conf)

    val k = args(0).toInt
    val dimension = args(1).toInt
    val recordNum = args(2).toInt
    val sparsity = args(3).toDouble
    val iterations = args(4).toInt
    val means = args(5)
    val parNumber = args(6).toInt

    val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => {
      val ran = new Random()
      val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray
      val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray
      val vec: Vector = new SparseVector(dimension, indexArr, valueArr)
      vec
    }).cache()
    println(args.mkString(", "))
    println(data.count() + " records generated")

    val st = System.nanoTime()

    val model = if(means == "my") {
      println("running scalable kmeans")
      val model = new ScalableKMeans()
        .setK(k)
        .setInitializationMode("random")
        .setMaxIterations(iterations)
        .run(data)
      model
    } else {
      println("running mllib kmeans")
      val model = new KMeans()
        .setK(k)
        .setInitializationMode("random")
        .setMaxIterations(iterations)
        .run(data)
      model
    }

    println((System.nanoTime() - st) / 1e9 + " seconds cost")
    println("final clusters: " + model.clusterCenters.length)
    println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n"))

    sc.stop()
  }

} 
Example 192
Source File: LRUtils.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.sparselr.Utils

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object LRUtils {
  
  def bytes2Int (buffer: Array[Byte], pos: Int): (Int, Int) = {
    var result: Int = 0
    var position: Int = pos
    var byte = buffer(pos)
    var shiftNum = 0

    while ((byte & 0x80) != 0) {
      result = result | ((byte & 0x7F)<<shiftNum)
      position += 1
      byte = buffer(position)
      shiftNum += 7
    }
    result = result | ((byte & 0x7F)<<shiftNum)
    (result, position)
  }

  //featureId cached in X is localId
  def loadFileAsMatrix(
                sc: SparkContext,
                path: String,
                minPartitions: Int): RDD[(Array[Double], Matrix)] = {
    val lines = sc.textFile(path, minPartitions)
      .map(_.trim)
      .filter(line => !(line.isEmpty || line.startsWith("#")))

    val data = lines.mapPartitions { samples =>
      val labels = new PrimitiveVector[Double]()
      val builder = new MatrixBuilder()

      samples.foreach { line =>
        val items = line.split(' ')

        labels += items.head.toDouble

        val featureIdAndValues = items.tail.filter(_.nonEmpty)

        val indices = new PrimitiveVector[Int]()
        val values = new PrimitiveVector[Float]()
        featureIdAndValues.foreach { item =>
          val featureAndValue = item.split(":")
          indices += featureAndValue(0).toInt
          val value = featureAndValue(1).toFloat
          values += value
        }
        builder.add(new SparseVector(indices.trim.array, values.trim.array))
      }
      Iterator((labels.trim.array, builder.toMatrix))
    }
    data
  }
} 
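A worked sketch of the 7-bit varint decoding done by bytes2Int: the bytes 0x96 0x01 decode to 150 (0x16 from the low 7 bits of the first byte, plus 0x01 shifted left by 7), and the returned position is the index of the last byte consumed:

import org.apache.spark.mllib.sparselr.Utils.LRUtils

object Bytes2IntUsage {
  def main(args: Array[String]): Unit = {
    val buffer = Array[Byte](0x96.toByte, 0x01)  // varint encoding of 150
    val (value, pos) = LRUtils.bytes2Int(buffer, 0)
    println(s"decoded $value, last byte at index $pos")  // decoded 150, last byte at index 1
  }
}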
Example 193
Source File: LogisticRegression.scala    From SparseML   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.sparselr

import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap
import org.apache.spark.mllib.sparselr.Utils._
import org.apache.spark.SparkEnv
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

object LogisticRegression {
    def train(input: RDD[(Array[Double], Matrix)],
              optimizer: Optimizer
              ): (Array[Int], Array[Double]) = {

      val hdfsIndex2global = new Int2IntOpenHashMap()
      var index = 0

      input.map { point =>
        point._2 match {
          case x: CompressedSparseMatrix =>
            println("x.length" + x.mappings.length)
          case _ =>
            throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.")
        }
      }.count

      val global2hdfsIndex = input.map { point =>
        point._2 match {
          case x: CompressedSparseMatrix =>
            x.mappings
          case _ =>
            throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.")
        }
      }.collect().flatMap(t => t).distinct

      global2hdfsIndex.foreach{value =>
        hdfsIndex2global.put(value, index)
        index += 1
      }

      val bcHdfsIndex2global = input.context.broadcast(hdfsIndex2global)

      val examples = input.map(global2globalMapping(bcHdfsIndex2global)).cache()

      val numTraining = examples.count()
      println(s"Training: $numTraining.")

      SparkEnv.get.blockManager.removeBroadcast(bcHdfsIndex2global.id, true)

      val examplesTest = examples.mapPartitions(_.flatMap {
        case (y, part) => part.asInstanceOf[CompressedSparseMatrix].tupletIterator(y)})

      val weights = Vectors.dense(new Array[Double](global2hdfsIndex.size))

      val newWeights = optimizer.optimize(examplesTest, weights)

      (global2hdfsIndex, newWeights.toArray)
    }

  // rewrites the Matrix mappings from HDFS feature ids to global ids using the broadcast map
    def global2globalMapping(bchdfsIndex2global: Broadcast[Int2IntOpenHashMap])
                     (partition: (Array[Double], Matrix)): (Array[Double], Matrix) = {
      val hdfsIndex2global = bchdfsIndex2global.value

      partition._2 match {
        case x: CompressedSparseMatrix =>
          val local2hdfsIndex = x.mappings
          for (i <- 0 until local2hdfsIndex.length) {
            local2hdfsIndex(i) = hdfsIndex2global.get(local2hdfsIndex(i))
          }
        case _ =>
          throw new IllegalArgumentException(s"dot doesn't support ${partition.getClass}.")
      }
      partition
    }
} 
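A minimal wiring sketch for the trainer above, assuming a LIBSVM-style text file as accepted by LRUtils.loadFileAsMatrix and assuming the Optimizer trait lives in the Utils package imported above; the concrete optimizer is left as a parameter because its construction is project-specific.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.sparselr.LogisticRegression
import org.apache.spark.mllib.sparselr.Utils.{LRUtils, Optimizer}

object SparseLRExample {
  def train(path: String, optimizer: Optimizer): (Array[Int], Array[Double]) = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("sparse-lr"))
    val input = LRUtils.loadFileAsMatrix(sc, path, minPartitions = 4)
    // Returns the HDFS feature ids alongside the learned weights, in matching order.
    LogisticRegression.train(input, optimizer)
  }
}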
Example 194
Source File: OneWayANOVA.scala    From StatisticsOnSpark   with Apache License 2.0 5 votes
package main.ANOVA

import org.apache.commons.math3.distribution.FDistribution
import org.apache.spark.rdd.RDD



object OneWayANOVA {

  // Returns the p-value of a one-way ANOVA across the given groups of observations.
  def anovaPValue(categoryData: Iterable[RDD[Double]]): Double = {
    val anovaStats = getAnovaStats(categoryData)

    // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
    val fdist: FDistribution = new FDistribution(null, anovaStats.dfbg, anovaStats.dfwg)
    1.0 - fdist.cumulativeProbability(anovaStats.F)
  }

  private case class ANOVAStats(dfbg: Double, dfwg: Double, F: Double)

  private def getAnovaStats(categoryData: Iterable[RDD[Double]]): ANOVAStats = {
    var dfwg: Long = 0
    var sswg: Double = 0
    var totsum: Double = 0
    var totsumsq: Double = 0
    var totnum: Long = 0

    for (data <- categoryData) {
      val sum: Double = data.sum()
      val sumsq: Double = data.map(i => i * i).sum()
      val num = data.count()
      totnum += num
      totsum += sum
      totsumsq += sumsq
      dfwg += num - 1
      val ss: Double = sumsq - ((sum * sum) / num)
      sswg += ss
    }

    val sst: Double = totsumsq - ((totsum * totsum) / totnum)
    val ssbg: Double = sst - sswg
    val dfbg: Int = categoryData.size - 1
    val msbg: Double = ssbg / dfbg
    val mswg: Double = sswg / dfwg
    val F: Double = msbg / mswg
    ANOVAStats(dfbg, dfwg, F)
  }


} 
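A minimal usage sketch, assuming the enclosing object is named OneWayANOVA as the file name suggests; the groups below are made-up observations.

import org.apache.spark.{SparkConf, SparkContext}
import main.ANOVA.OneWayANOVA

object AnovaExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("anova-example"))
    val groupA = sc.parallelize(Seq(24.0, 25.5, 26.1, 24.8))
    val groupB = sc.parallelize(Seq(27.9, 28.3, 27.1, 29.0))
    val groupC = sc.parallelize(Seq(25.0, 25.2, 24.9, 25.6))
    // A small p-value indicates that at least one group mean differs.
    println("p-value: " + OneWayANOVA.anovaPValue(Seq(groupA, groupB, groupC)))
    sc.stop()
  }
}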
Example 195
Source File: TwoSampleIndependentTTest.scala    From StatisticsOnSpark   with Apache License 2.0 5 votes
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.TDistribution
import org.apache.commons.math3.util.FastMath
import org.apache.spark.rdd.RDD


object TwoSampleIndependentTTest {

  // Returns the two-sided p-value of Welch's t-test for two independent samples.
  def tTest(sample1: RDD[Double], sample2: RDD[Double]): Double = {
    val n1 = sample1.count()
    val n2 = sample2.count()
    val m1 = sample1.sum() / n1
    val m2 = sample2.sum() / n2
    val v1 = sample1.map(d => (d - m1) * (d - m1)).sum() / (n1 - 1)
    val v2 = sample2.map(d => (d - m2) * (d - m2)).sum() / (n2 - 1)
    val t: Double = math.abs((m1 - m2) / FastMath.sqrt((v1 / n1) + (v2 / n2)))
    val degreesOfFreedom: Double = (((v1 / n1) + (v2 / n2)) * ((v1 / n1) + (v2 / n2))) /
      ((v1 * v1) / (n1 * n1 * (n1 - 1d)) + (v2 * v2) / (n2 * n2 * (n2 - 1d)))

    // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
    val distribution: TDistribution = new TDistribution(null, degreesOfFreedom)
    2.0 * distribution.cumulativeProbability(-t)
  }

} 
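A minimal usage sketch, assuming the enclosing object is named TwoSampleIndependentTTest as the file name suggests; the samples below are made-up.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.stat.TwoSampleIndependentTTest

object TTestExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("ttest-example"))
    val sample1 = sc.parallelize(Seq(2.1, 2.3, 1.9, 2.5, 2.2))
    val sample2 = sc.parallelize(Seq(2.8, 3.0, 2.7, 3.1, 2.9))
    // Two-sided p-value; p < 0.05 suggests the two means differ at the 5% level.
    println("p-value: " + TwoSampleIndependentTTest.tTest(sample1, sample2))
    sc.stop()
  }
}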
Example 196
Source File: EtlProcessor.scala    From etl-light   with MIT License 5 votes
package yamrcraft.etlite.processors

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.DefaultDecoder
import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka._
import org.slf4j.LoggerFactory
import yamrcraft.etlite.Settings
import yamrcraft.etlite.state.{KafkaOffsetsState, KafkaStateManager}
import yamrcraft.etlite.transformers.InboundMessage

object EtlProcessor {

  val logger = LoggerFactory.getLogger(this.getClass)

  def run(settings: Settings) = {
    val context = createContext(settings)

    val stateManager = new KafkaStateManager(settings.etl.state)

    val lastState = stateManager.readState
    logger.info(s"last persisted state: $lastState")

    val currState = stateManager.fetchNextState(lastState, settings)
    logger.info(s"batch working state: $currState")

    val rdd = createRDD(context, currState, settings)
    processRDD(rdd, currState.jobId, settings)

    logger.info("committing state")
    stateManager.commitState(currState)
  }

  private def createContext(settings: Settings) = {
    val sparkConf = new SparkConf()
      .setAppName(settings.spark.appName)
      .setAll(settings.spark.conf)

    new SparkContext(sparkConf)
  }

  private def createRDD(context: SparkContext, state: KafkaOffsetsState, settings: Settings): RDD[InboundMessage] = {
    KafkaUtils.createRDD[Array[Byte], Array[Byte], DefaultDecoder, DefaultDecoder, InboundMessage](
      context,
      settings.kafka.properties,
      state.ranges.toArray,
      Map[TopicAndPartition, Broker](),
      (msgAndMeta: MessageAndMetadata[Array[Byte], Array[Byte]]) => { InboundMessage(msgAndMeta.topic, msgAndMeta.key(), msgAndMeta.message()) }
    )
  }

  private def processRDD(kafkaRDD: RDD[InboundMessage], jobId: Long, settings: Settings) = {
    // passed to remote workers
    val etlSettings = settings.etl

    logger.info(s"RDD processing started [rdd=${kafkaRDD.id}, jobId=$jobId]")

    val rdd = settings.etl.maxNumOfOutputFiles.map(kafkaRDD.coalesce(_)).getOrElse(kafkaRDD)

    rdd.foreachPartition { partition =>
        // executed at the worker
        new PartitionProcessor(jobId, TaskContext.get.partitionId(), etlSettings)
          .processPartition(partition)
      }

    logger.info(s"RDD processing ended [rdd=${kafkaRDD.id}, jobId=$jobId]")
  }


} 
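The processRDD step above optionally coalesces the RDD to cap the number of output files. The standalone sketch below isolates that Option-driven pattern; the setting name and values here are assumptions for illustration.

import org.apache.spark.{SparkConf, SparkContext}

object OptionalCoalesceExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("optional-coalesce"))
    val raw = sc.parallelize(1 to 1000, numSlices = 16)

    // Same shape as: settings.etl.maxNumOfOutputFiles.map(kafkaRDD.coalesce(_)).getOrElse(kafkaRDD)
    val maxNumOfOutputFiles: Option[Int] = Some(4)
    val bounded = maxNumOfOutputFiles.map(raw.coalesce(_)).getOrElse(raw)

    println(s"partitions: ${bounded.partitions.length}") // 4 with Some(4), 16 with None
    sc.stop()
  }
}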
Example 197
Source File: YahooParser.scala    From spark-timeseries   with Apache License 2.0 5 votes
package com.cloudera.sparkts.parsers

import com.cloudera.sparkts.TimeSeries
import com.cloudera.sparkts.TimeSeries._
import java.time._
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object YahooParser {
  def yahooStringToTimeSeries(
    text: String,
    keyPrefix: String = "",
    zone: ZoneId = ZoneId.systemDefault())
    : TimeSeries[String] = {
    val lines = text.split('\n')
    val labels = lines(0).split(',').tail.map(keyPrefix + _)
    val samples = lines.tail.map { line =>
      val tokens = line.split(',')
      val dt = LocalDate.parse(tokens.head).atStartOfDay(zone)
      (dt, tokens.tail.map(_.toDouble))
    }.reverse
    timeSeriesFromIrregularSamples(samples, labels, zone)
  }

  def yahooFiles(
    dir: String,
    sc: SparkContext,
    zone: ZoneId = ZoneId.systemDefault())
    : RDD[TimeSeries[String]] = {
    sc.wholeTextFiles(dir).map { case (path, text) =>
      YahooParser.yahooStringToTimeSeries(text, path.split('/').last, zone)
    }
  }
} 
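A minimal usage sketch for the parser above; the directory path is a placeholder and the whole block is illustrative only.

import org.apache.spark.{SparkConf, SparkContext}
import com.cloudera.sparkts.parsers.YahooParser

object YahooParserExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("yahoo-parser"))
    // One TimeSeries per file, with the file name prefixed to each column label.
    val series = YahooParser.yahooFiles("hdfs:///data/yahoo-quotes/", sc)
    println(s"loaded ${series.count()} time series")
    sc.stop()
  }
}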
Example 198
Source File: DatasourceRDD.scala    From datasource-receiver   with Apache License 2.0 5 votes
package org.apache.spark.streaming.datasource.receiver

import org.apache.spark.partial.{BoundedDouble, CountEvaluator, PartialResult}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.streaming.datasource.config.ParametersUtils
import org.apache.spark.streaming.datasource.models.{InputSentences, OffsetOperator}
import org.apache.spark.{Logging, Partition, TaskContext}

private[datasource]
class DatasourceRDD(
                     @transient sqlContext: SQLContext,
                     inputSentences: InputSentences,
                     datasourceParams: Map[String, String]
                   ) extends RDD[Row](sqlContext.sparkContext, Nil) with Logging with ParametersUtils {

  private var totalCalculated: Option[Long] = None

  private val InitTableName = "initTable"
  private val LimitedTableName = "limitedTable"
  private val TempInitQuery = s"select * from $InitTableName"

  val dataFrame = inputSentences.offsetConditions.fold(sqlContext.sql(inputSentences.query)) { case offset =>
    val parsedQuery = parseInitialQuery
    val conditionsSentence = offset.fromOffset.extractConditionSentence(parsedQuery)
    val orderSentence = offset.fromOffset.extractOrderSentence(parsedQuery, inverse = offset.limitRecords.isEmpty)
    val limitSentence = inputSentences.extractLimitSentence

    sqlContext.sql(parsedQuery + conditionsSentence + orderSentence + limitSentence)
  }

  private def parseInitialQuery: String = {
    if (inputSentences.query.toUpperCase.contains("WHERE") ||
      inputSentences.query.toUpperCase.contains("ORDER") ||
      inputSentences.query.toUpperCase.contains("LIMIT")
    ) {
      sqlContext.sql(inputSentences.query).registerTempTable(InitTableName)
      TempInitQuery
    } else inputSentences.query
  }

  def progressInputSentences: InputSentences = {
    if (!dataFrame.rdd.isEmpty()) {
      inputSentences.offsetConditions.fold(inputSentences) { case offset =>

        val offsetValue = if (offset.limitRecords.isEmpty)
          dataFrame.rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name))
        else {
          dataFrame.registerTempTable(LimitedTableName)
          val limitedQuery = s"select * from $LimitedTableName order by ${offset.fromOffset.name} " +
            s"${OffsetOperator.toInverseOrderOperator(offset.fromOffset.operator)} limit 1"

          sqlContext.sql(limitedQuery).rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name))
        }

        inputSentences.copy(offsetConditions = Option(offset.copy(fromOffset = offset.fromOffset.copy(
          value = Option(offsetValue),
          operator = OffsetOperator.toProgressOperator(offset.fromOffset.operator)))))
      }
    } else inputSentences
  }

  
  // Uses the pre-computed total when available; otherwise checks for at least one partition and one row.
  override def isEmpty(): Boolean = {
    totalCalculated.fold {
      withScope {
        partitions.length == 0 || take(1).length == 0
      }
    } { total => total == 0L }
  }

  override def getPartitions: Array[Partition] = dataFrame.rdd.partitions

  override def compute(thePart: Partition, context: TaskContext): Iterator[Row] = dataFrame.rdd.compute(thePart, context)

  override def getPreferredLocations(thePart: Partition): Seq[String] = dataFrame.rdd.preferredLocations(thePart)
} 
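The offset progression above picks the last offset of a limited batch by inverting the order and taking a single row. The toy sketch below reproduces that query shape against an in-memory table; the column name id, the table contents, and the desc operator are assumptions for illustration.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object OffsetProgressionExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("offset-progression"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    sc.parallelize(Seq((1, "a"), (2, "b"), (3, "c"))).toDF("id", "value")
      .registerTempTable("limitedTable")

    // Mirrors: s"select * from $LimitedTableName order by ${offset.fromOffset.name} <inverse operator> limit 1"
    val nextOffset = sqlContext.sql("select * from limitedTable order by id desc limit 1")
      .rdd.first().getInt(0)
    println(s"next offset: $nextOffset") // 3
    sc.stop()
  }
}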
Example 199
Source File: JsonInputStreamQuery.scala    From spark-cep   with Apache License 2.0 5 votes
package org.apache.spark.sql.streaming.examples

import scala.collection.mutable.SynchronizedQueue
import scala.io.Source

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.streaming.StreamSQLContext
import org.apache.spark.streaming.{Duration, StreamingContext}


object JsonInputStreamQuery {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext("local[10]", "test", Duration(3000))
    val sc = ssc.sparkContext
    val streamSqlContext = new StreamSQLContext(ssc, new SQLContext(sc))
    import streamSqlContext._
    // Here we read data line by line from a given file and then put it into a queue DStream.
    // You can replace any kind of String type DStream here including kafka DStream.
    val queue = new SynchronizedQueue[RDD[String]]()
    Source.fromFile("src/main/resources/student.json").getLines().foreach(msg =>
      queue.enqueue(sc.parallelize(List(msg))))
    val queueDStream = ssc.queueStream[String](queue)
    // We can infer the schema of json automatically by using inferJsonSchema
    val schema = streamSqlContext.inferJsonSchema("src/main/resources/student.json")
    streamSqlContext.registerDStreamAsTable(
      streamSqlContext.jsonDStream(queueDStream, schema), "jsonTable")
    sql("SELECT * FROM jsonTable").print()
    ssc.start()
    ssc.awaitTerminationOrTimeout(30 * 1000)
    ssc.stop()
  }
} 
Example 200
Source File: ExistingDStream.scala    From spark-cep   with Apache License 2.0 5 votes
package org.apache.spark.sql.streaming

import org.apache.spark.rdd.{EmptyRDD, RDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream


private[streaming]
case class PhysicalDStream(output: Seq[Attribute], @transient stream: DStream[InternalRow])
    extends SparkPlan with StreamPlan {

  def children = Nil

  override def doExecute() = {
    assert(validTime != null)
    Utils.invoke(classOf[DStream[InternalRow]], stream, "getOrCompute", (classOf[Time], validTime))
      .asInstanceOf[Option[RDD[InternalRow]]]
      .getOrElse(new EmptyRDD[InternalRow](sparkContext))
  }
}