org.apache.spark.HashPartitioner Scala Examples

The following examples show how to use org.apache.spark.HashPartitioner. They are drawn from several open source projects; the source file and originating project are noted above each example.
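
Before the project examples, here is a minimal, self-contained sketch of what HashPartitioner does (the object name HashPartitionerSketch and the local[2] master are illustrative only, not taken from any project below): it routes each key-value pair to the partition given by the key's hashCode modulo the number of partitions (made non-negative), so all records with the same key end up in the same partition.

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object HashPartitionerSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("HashPartitionerSketch"))

    val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3), ("c", 4)))

    // partitionBy shuffles the data once; afterwards both ("a", 1) and ("a", 3)
    // live in the same partition because they share a key hash.
    val partitioned = pairs.partitionBy(new HashPartitioner(2))

    // Print (partitionIndex, (key, value)) pairs to inspect the resulting layout.
    partitioned
      .mapPartitionsWithIndex((idx, it) => it.map(kv => (idx, kv)))
      .collect()
      .foreach(println)

    sc.stop()
  }
}

Because two RDDs that share the same HashPartitioner are co-partitioned, joins and cogroups between them avoid an extra shuffle; that is why several examples below pass an explicit HashPartitioner to partitionBy, combineByKey, or updateStateByKey rather than relying on the default partitioner.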
Example 1
Source File: EdgeRDDImpl.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.{HashPartitioner, OneToOneDependency}
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] (
    @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])],
    val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
  extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {

  override def setName(_name: String): this.type = {
    if (partitionsRDD.name != null) {
      partitionsRDD.setName(partitionsRDD.name + ", " + _name)
    } else {
      partitionsRDD.setName(_name)
    }
    this
  }
  setName("EdgeRDD")

  
  override def count(): Long = {
    partitionsRDD.map(_._2.size.toLong).reduce(_ + _)
  }

  override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] =
    mapEdgePartitions((pid, part) => part.map(f))

  override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse)

  def filter(
      epred: EdgeTriplet[VD, ED] => Boolean,
      vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = {
    mapEdgePartitions((pid, part) => part.filter(epred, vpred))
  }

  override def innerJoin[ED2: ClassTag, ED3: ClassTag]
      (other: EdgeRDD[ED2])
      (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = {
    val ed2Tag = classTag[ED2]
    val ed3Tag = classTag[ED3]
    this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) {
      (thisIter, otherIter) =>
        val (pid, thisEPart) = thisIter.next()
        val (_, otherEPart) = otherIter.next()
        Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag)))
    })
  }

  def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag](
      f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = {
    this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter =>
      if (iter.hasNext) {
        val (pid, ep) = iter.next()
        Iterator(Tuple2(pid, f(pid, ep)))
      } else {
        Iterator.empty
      }
    }, preservesPartitioning = true))
  }

  private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag](
      partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = {
    new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel)
  }

  override private[graphx] def withTargetStorageLevel(
      targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = {
    new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel)
  }

} 
Example 2
Source File: PregelNWeight.scala    From Swallow   with Apache License 2.0
package com.intel.hibench.sparkbench.graph.nweight

import scala.collection.JavaConversions._
import org.apache.spark.SparkContext
import org.apache.spark.HashPartitioner
import org.apache.spark.storage.StorageLevel
import org.apache.spark.graphx._
import org.apache.spark.graphx.impl.GraphImpl
import it.unimi.dsi.fastutil.longs.Long2DoubleOpenHashMap



object PregelNWeight extends Serializable {

  def sendMsg(edge: EdgeTriplet[SizedPriorityQueue, Double]) = {
    val m = new Long2DoubleOpenHashMap()
    val w1 = edge.attr
    val id = edge.srcId
    edge.dstAttr.foreach{ case (target, wn) =>
      if (target != id)
        m.put(target, wn*w1)
    }
    Iterator((id, m))
  }

  def mergMsg(c1: Long2DoubleOpenHashMap, c2: Long2DoubleOpenHashMap) = {
    c2.long2DoubleEntrySet()
      .fastIterator()
      .foreach(pair =>
        c1.put(pair.getLongKey(), c1.get(pair.getLongKey()) + pair.getDoubleValue()))
    c1
  }

  def vProg(id: VertexId, vdata: SizedPriorityQueue, msg: Long2DoubleOpenHashMap) = {
    vdata.clear()
    if (msg.size > 0) {
      msg.long2DoubleEntrySet().fastIterator().foreach { pair =>
        val src = pair.getLongKey()
        val wn = pair.getDoubleValue()
        vdata.enqueue((src, wn))
      }
      vdata
    } else {
      vdata.enqueue((id, 1))
      vdata 
    }
  }

  def nweight(sc: SparkContext, input: String, output: String, step: Int,
    maxDegree: Int, numPartitions: Int, storageLevel: StorageLevel) {

    //val start1 = System.currentTimeMillis
    val part = new HashPartitioner(numPartitions)
    val edges = sc.textFile(input, numPartitions).flatMap { line =>
      val fields = line.split("\\s+", 2)
      val src = fields(0).trim.toLong

      fields(1).split("[,\\s]+").filter(_.isEmpty() == false).map { pairStr =>
        val pair = pairStr.split(":")
        val (dest, weight) = (pair(0).trim.toLong, pair(1).toDouble)
        (src, Edge(src, dest, weight))
      }
    }.partitionBy(part).map(_._2)

    var g = GraphImpl(edges, new SizedPriorityQueue(maxDegree), storageLevel, storageLevel).cache()

    g = Pregel(g, new Long2DoubleOpenHashMap, step, EdgeDirection.In)(
      vProg, sendMsg, mergMsg)

    g.vertices.map { case (vid, vdata) => 
      var s = new StringBuilder
      s.append(vid)

      vdata.foreach { r =>
        s.append(' ')
        s.append(r._1)
        s.append(':')
        s.append(r._2)
      }
      s.toString
    }.saveAsTextFile(output)
  }
} 
Example 3
Source File: GraphxNWeight.scala    From Swallow   with Apache License 2.0
package com.intel.hibench.sparkbench.graph.nweight

import scala.collection.JavaConversions._
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.HashPartitioner
import org.apache.spark.storage.StorageLevel
import org.apache.spark.graphx._
import org.apache.spark.graphx.impl.GraphImpl
import it.unimi.dsi.fastutil.longs.Long2DoubleOpenHashMap



object GraphxNWeight extends Serializable {

  def mapF(edge: EdgeContext[SizedPriorityQueue, Double, Long2DoubleOpenHashMap]) = {
    val theMap = new Long2DoubleOpenHashMap()
    val edgeAttribute = edge.attr
    val id = edge.srcId
    edge.dstAttr.foreach{ case (target, wn) =>
      if (target != id)
        theMap.put(target, wn * edgeAttribute)
    }
    edge.sendToSrc(theMap)
  }

  def reduceF(c1: Long2DoubleOpenHashMap, c2: Long2DoubleOpenHashMap) = {
    c2.long2DoubleEntrySet()
      .fastIterator()
      .foreach(pair => c1.put(pair.getLongKey(), c1.get(pair.getLongKey()) + pair.getDoubleValue()))
    c1
  }

  def updateF(id: VertexId, vdata: SizedPriorityQueue, msg: Option[Long2DoubleOpenHashMap]) = {
    vdata.clear()
    val weightMap = msg.orNull
    if (weightMap != null) {
      weightMap.long2DoubleEntrySet().fastIterator().foreach { pair =>
        val src = pair.getLongKey()
        val wn = pair.getDoubleValue()
        vdata.enqueue((src, wn))
      }
    }
    vdata
  }

  def nweight(sc: SparkContext, input: String, output: String, step: Int,
    maxDegree: Int, numPartitions: Int, storageLevel: StorageLevel) {

    //val start1 = System.currentTimeMillis
    val part = new HashPartitioner(numPartitions)
    val edges = sc.textFile(input, numPartitions).flatMap { line =>
      val fields = line.split("\\s+", 2)
      val src = fields(0).trim.toLong

      fields(1).split("[,\\s]+").filter(_.isEmpty() == false).map { pairStr =>
        val pair = pairStr.split(":")
        val (dest, weight) = (pair(0).trim.toLong, pair(1).toDouble)
        (src, Edge(src, dest, weight))
      }
    }.partitionBy(part).map(_._2)

    val vertices = edges.map { e =>
      (e.srcId, (e.dstId, e.attr))
    }.groupByKey(part).map { case (id, seq) =>
      val vdata = new SizedPriorityQueue(maxDegree)
      seq.foreach(vdata.enqueue)
      (id, vdata)
    }

    var g = GraphImpl(vertices, edges, new SizedPriorityQueue(maxDegree), storageLevel, storageLevel).cache()

    var msg: RDD[(VertexId, Long2DoubleOpenHashMap)] = null
    for (i <- 2 to step) {
      msg = g.aggregateMessages(mapF, reduceF)
      g = g.outerJoinVertices(msg)(updateF).persist(storageLevel)
    }

    g.vertices.map { case (vid, vdata) => 
      var s = new StringBuilder
      s.append(vid)

      vdata.foreach { r =>
        s.append(' ')
        s.append(r._1)
        s.append(':')
        s.append(r._2)
      }
      s.toString
    }.saveAsTextFile(output)
  }
} 
Example 4
Source File: DBHPartitioner.scala    From zen   with Apache License 2.0
package com.github.cloudml.zen.ml.partitioner

import scala.reflect.ClassTag

import com.github.cloudml.zen.ml.clustering.LDADefines._

import org.apache.spark.HashPartitioner
import org.apache.spark.graphx2._
import org.apache.spark.graphx2.impl.GraphImpl
import org.apache.spark.storage.StorageLevel


// Degree-Based Hashing (DBH): an edge is keyed by its lower-degree endpoint once
// the higher degree reaches `threshold`. The class declaration here is reconstructed
// (inferred from its usage in `partitionByDBH` below).
class DBHPartitioner(val partitions: Int, val threshold: Int) extends HashPartitioner(partitions) {

  def getKey(et: EdgeTriplet[Int, _]): Long = {
    val srcId = et.srcId
    val dstId = et.dstId
    val srcDeg = et.srcAttr
    val dstDeg = et.dstAttr
    val maxDeg = math.max(srcDeg, dstDeg)
    val minDegId = if (maxDeg == srcDeg) dstId else srcId
    val maxDegId = if (maxDeg == srcDeg) srcId else dstId
    if (maxDeg < threshold) {
      maxDegId
    } else {
      minDegId
    }
  }

  override def equals(other: Any): Boolean = other match {
    case dbh: DBHPartitioner =>
      dbh.numPartitions == numPartitions
    case _ =>
      false
  }
}

object DBHPartitioner {
  def partitionByDBH[VD: ClassTag, ED: ClassTag](input: Graph[VD, ED],
    storageLevel: StorageLevel): Graph[VD, ED] = {
    val edges = input.edges
    val conf = edges.context.getConf
    val numPartitions = conf.getInt(cs_numPartitions, edges.partitions.length)
    val dbh = new DBHPartitioner(numPartitions, 0)
    val degGraph = GraphImpl(input.degrees, edges)
    val newEdges = degGraph.triplets.mapPartitions(_.map(et =>
      (dbh.getKey(et), Edge(et.srcId, et.dstId, et.attr))
    )).partitionBy(dbh).map(_._2)
    GraphImpl(input.vertices, newEdges, null.asInstanceOf[VD], storageLevel, storageLevel)
  }
} 
Example 5
Source File: EdgeDstPartitioner.scala    From zen   with Apache License 2.0
package com.github.cloudml.zen.ml.partitioner

import scala.reflect.ClassTag

import com.github.cloudml.zen.ml.clustering.LDADefines._

import org.apache.spark.HashPartitioner
import org.apache.spark.graphx2._
import org.apache.spark.graphx2.impl.GraphImpl
import org.apache.spark.storage.StorageLevel


class EdgeDstPartitioner(val partitions: Int) extends HashPartitioner(partitions) {

  @inline def getKey(et: EdgeTriplet[_, _]): Long = et.dstId

  override def equals(other: Any): Boolean = other match {
    case edp: EdgeDstPartitioner =>
      edp.numPartitions == numPartitions
    case _ =>
      false
  }
}

object EdgeDstPartitioner {
  def partitionByEDP[VD: ClassTag, ED: ClassTag](input: Graph[VD, ED],
    storageLevel: StorageLevel): Graph[VD, ED] = {
    val edges = input.edges
    val conf = edges.context.getConf
    val numPartitions = conf.getInt(cs_numPartitions, edges.partitions.length)
    val edp = new EdgeDstPartitioner(numPartitions)
    val newEdges = input.triplets.mapPartitions(_.map(et =>
      (edp.getKey(et), Edge(et.srcId, et.dstId, et.attr))
    )).partitionBy(edp).map(_._2)
    GraphImpl(input.vertices, newEdges, null.asInstanceOf[VD], storageLevel, storageLevel)
  }
} 
Example 6
Source File: EdgeRDDImpl.scala    From zen   with Apache License 2.0
package org.apache.spark.graphx2.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.{OneToOneDependency, HashPartitioner}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

import org.apache.spark.graphx2._

class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] (
    @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])],
    val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
  extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {

  override def setName(_name: String): this.type = {
    if (partitionsRDD.name != null) {
      partitionsRDD.setName(partitionsRDD.name + ", " + _name)
    } else {
      partitionsRDD.setName(_name)
    }
    this
  }
  setName("EdgeRDD")

  
  override def count(): Long = {
    partitionsRDD.map(_._2.size.toLong).reduce(_ + _)
  }

  override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] =
    mapEdgePartitions((pid, part) => part.map(f))

  override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse)

  def filter(
      epred: EdgeTriplet[VD, ED] => Boolean,
      vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = {
    mapEdgePartitions((pid, part) => part.filter(epred, vpred))
  }

  override def innerJoin[ED2: ClassTag, ED3: ClassTag]
      (other: EdgeRDD[ED2])
      (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = {
    val ed2Tag = classTag[ED2]
    val ed3Tag = classTag[ED3]
    this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) {
      (thisIter, otherIter) =>
        val (pid, thisEPart) = thisIter.next()
        val (_, otherEPart) = otherIter.next()
        Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag)))
    })
  }

  def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag](
      f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = {
    this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter =>
      if (iter.hasNext) {
        val (pid, ep) = iter.next()
        Iterator(Tuple2(pid, f(pid, ep)))
      } else {
        Iterator.empty
      }
    }, preservesPartitioning = true))
  }

  def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag](
      partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = {
    new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel)
  }

  override def withTargetStorageLevel(
      targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = {
    new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel)
  }

} 
Example 7
Source File: UtilSpark.scala    From Clustering4Ever   with Apache License 2.0
package org.clustering4ever.sparktools

import scala.language.higherKinds
import org.apache.spark.rdd.RDD
import org.apache.spark.HashPartitioner
import scala.reflect.runtime.universe.TypeTag
import scala.util.Random
import scala.reflect.ClassTag
import scala.collection.{GenSeq, mutable}
import org.clustering4ever.preprocessing.Preprocessable
import org.clustering4ever.hashing.HashingScalar
import org.clustering4ever.vectors.{GVector, ScalarVector}

object UtilSpark
{

	type IndexPartition = Int
	type HasConverged = Boolean
	type IsOriginalDot = Boolean


	final def generateDataLocalityOnHashsedDS[
		O,
		Pz[B, C <: GVector[C]] <: Preprocessable[B, C, Pz]
	](
		rddToPartitioned: RDD[Pz[O, ScalarVector]],
		nbblocs1: Int,
		nbBucketRange: Int
	): RDD[(IndexPartition, (Pz[O, ScalarVector], IsOriginalDot, HasConverged))] = {
		val isOriginalPoint = true
		val hasConverged = true
		val bucketRange = 1 to nbBucketRange

		val lshRDD = rddToPartitioned.map((_, isOriginalPoint, !hasConverged))

		val localityPerPartitionRDD = lshRDD.mapPartitionsWithIndex{ (idx, it) =>
			val ar = it.toList
			def rightNeighbourhood = ar.flatMap{ case (cz, _, _) => bucketRange.collect{ case i if(idx + i < nbblocs1) => (idx + i, (cz, !isOriginalPoint, !hasConverged)) } }
			def leftNeighbourhood = ar.flatMap{ case (cz, _, _) => bucketRange.collect{ case i if(idx - i >= 0) => (idx - i, (cz, !isOriginalPoint, !hasConverged)) } }
			val composing = if(idx == 0) ar.map((idx, _)) ::: rightNeighbourhood
				else if(idx == nbblocs1 - 1) ar.map((idx, _)) ::: leftNeighbourhood
				else ar.map((idx, _)) ::: leftNeighbourhood ::: rightNeighbourhood

			composing.toIterator
		}.partitionBy(new HashPartitioner(nbblocs1))

		localityPerPartitionRDD
	}

	final def generateDataLocalityLD[
		O,
		Pz[B, C <: GVector[C]] <: Preprocessable[B, C, Pz],
		Hasher <: HashingScalar
	](
		rddToPartitioned: RDD[Pz[O, ScalarVector]],
		hashing: Hasher,
		nbblocs1: Int,
		nbBucketRange: Int
	): RDD[(IndexPartition, (Pz[O, ScalarVector], IsOriginalDot, HasConverged))] = {
		val hashedRDD = rddToPartitioned.sortBy( cz => hashing.hf(cz.v) , ascending = true, nbblocs1 )
		generateDataLocalityOnHashsedDS(hashedRDD, nbblocs1, nbBucketRange)
	}

} 
Example 8
Source File: PartitionBy.scala    From learning-spark   with Apache License 2.0
package com.javachen.spark.examples.rdd

import org.apache.spark.{RangePartitioner,HashPartitioner, SparkContext}

object PartitionBy {
  def main(args: Array[String]) {

    val sc = new SparkContext("local", "ReduceByKeyToDriver Test")
    val data1 = Array[(String, Int)](("K", 1), ("T", 2),
      ("T", 3), ("W", 4),
      ("W", 5), ("W", 6)
    )
    val pairs = sc.parallelize(data1, 3)
    //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
    //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
    var result = pairs.partitionBy(new RangePartitioner(2, pairs, true))
    result = pairs.partitionBy(new HashPartitioner(2))
    result.foreach(println)
  }
} 
Example 9
Source File: EdgeRDDImpl.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.{OneToOneDependency, HashPartitioner}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

import org.apache.spark.graphx._

class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] (
    @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])],
    val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
  extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {

  override def setName(_name: String): this.type = {
    if (partitionsRDD.name != null) {
      partitionsRDD.setName(partitionsRDD.name + ", " + _name)
    } else {
      partitionsRDD.setName(_name)
    }
    this
  }
  setName("EdgeRDD")

  
  override def count(): Long = {
    partitionsRDD.map(_._2.size.toLong).reduce(_ + _)
  }

  override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] =
    mapEdgePartitions((pid, part) => part.map(f))

  override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse)

  def filter(
      epred: EdgeTriplet[VD, ED] => Boolean,
      vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = {
    mapEdgePartitions((pid, part) => part.filter(epred, vpred))
  }

  override def innerJoin[ED2: ClassTag, ED3: ClassTag]
      (other: EdgeRDD[ED2])
      (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = {
    val ed2Tag = classTag[ED2]
    val ed3Tag = classTag[ED3]
    this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) {
      (thisIter, otherIter) =>
        val (pid, thisEPart) = thisIter.next()
        val (_, otherEPart) = otherIter.next()
        Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag)))
    })
  }

  def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag](
      f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = {
    this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter =>
      if (iter.hasNext) {
        val (pid, ep) = iter.next()
        Iterator(Tuple2(pid, f(pid, ep)))
      } else {
        Iterator.empty
      }
    }, preservesPartitioning = true))
  }

  private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag](
      partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = {
    new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel)
  }

  override private[graphx] def withTargetStorageLevel(
      targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = {
    new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel)
  }

} 
Example 10
Source File: StatefulNetworkWordCount.scala    From BigDatalog   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.HashPartitioner
import org.apache.spark.streaming._


object StatefulNetworkWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: StatefulNetworkWordCount <hostname> <port>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    val sparkConf = new SparkConf().setAppName("StatefulNetworkWordCount")
    // Create the context with a 1 second batch size
    val ssc = new StreamingContext(sparkConf, Seconds(1))
    ssc.checkpoint(".")

    // Initial state RDD for mapWithState operation
    val initialRDD = ssc.sparkContext.parallelize(List(("hello", 1), ("world", 1)))

    // Create a ReceiverInputDStream on target ip:port and count the
    // words in input stream of \n delimited test (eg. generated by 'nc')
    val lines = ssc.socketTextStream(args(0), args(1).toInt)
    val words = lines.flatMap(_.split(" "))
    val wordDstream = words.map(x => (x, 1))

    // Update the cumulative count using mapWithState
    // This will give a DStream made of state (which is the cumulative count of the words)
    val mappingFunc = (word: String, one: Option[Int], state: State[Int]) => {
      val sum = one.getOrElse(0) + state.getOption.getOrElse(0)
      val output = (word, sum)
      state.update(sum)
      output
    }

    val stateDstream = wordDstream.mapWithState(
      StateSpec.function(mappingFunc).initialState(initialRDD))
    stateDstream.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println 
Example 11
Source File: EdgeRDDImpl.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.{HashPartitioner, OneToOneDependency}
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] (
    @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])],
    val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
  extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {

  override def setName(_name: String): this.type = {
    if (partitionsRDD.name != null) {
      partitionsRDD.setName(partitionsRDD.name + ", " + _name)
    } else {
      partitionsRDD.setName(_name)
    }
    this
  }
  setName("EdgeRDD")

  
  override def count(): Long = {
    partitionsRDD.map(_._2.size.toLong).reduce(_ + _)
  }

  override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] =
    mapEdgePartitions((pid, part) => part.map(f))

  override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse)

  def filter(
      epred: EdgeTriplet[VD, ED] => Boolean,
      vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = {
    mapEdgePartitions((pid, part) => part.filter(epred, vpred))
  }

  override def innerJoin[ED2: ClassTag, ED3: ClassTag]
      (other: EdgeRDD[ED2])
      (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = {
    val ed2Tag = classTag[ED2]
    val ed3Tag = classTag[ED3]
    this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) {
      (thisIter, otherIter) =>
        val (pid, thisEPart) = thisIter.next()
        val (_, otherEPart) = otherIter.next()
        Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag)))
    })
  }

  def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag](
      f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = {
    this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter =>
      if (iter.hasNext) {
        val (pid, ep) = iter.next()
        Iterator(Tuple2(pid, f(pid, ep)))
      } else {
        Iterator.empty
      }
    }, preservesPartitioning = true))
  }

  private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag](
      partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = {
    new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel)
  }

  override private[graphx] def withTargetStorageLevel(
      targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = {
    new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel)
  }

} 
Example 12
Source File: EdgeRDDImpl.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.{OneToOneDependency, HashPartitioner}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

import org.apache.spark.graphx._

class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] (
    @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])],
    val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
  extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {

  override def setName(_name: String): this.type = {
    if (partitionsRDD.name != null) {
      partitionsRDD.setName(partitionsRDD.name + ", " + _name)
    } else {
      partitionsRDD.setName(_name)
    }
    this
  }
  setName("EdgeRDD")

  
  override def count(): Long = {
    partitionsRDD.map(_._2.size.toLong).reduce(_ + _)
  }

  override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] =
    mapEdgePartitions((pid, part) => part.map(f))

  override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse)

  def filter(
      epred: EdgeTriplet[VD, ED] => Boolean,
      vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = {
    mapEdgePartitions((pid, part) => part.filter(epred, vpred))
  }

  override def innerJoin[ED2: ClassTag, ED3: ClassTag]
      (other: EdgeRDD[ED2])
      (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = {
    val ed2Tag = classTag[ED2]
    val ed3Tag = classTag[ED3]
    this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) {
      (thisIter, otherIter) =>
        val (pid, thisEPart) = thisIter.next()
        val (_, otherEPart) = otherIter.next()
        Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag)))
    })
  }

  def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag](
      f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = {
    this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter =>
      if (iter.hasNext) {
        val (pid, ep) = iter.next()
        Iterator(Tuple2(pid, f(pid, ep)))
      } else {
        Iterator.empty
      }
    }, preservesPartitioning = true))
  }

  private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag](
      partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = {
    new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel)
  }

  override private[graphx] def withTargetStorageLevel(
      targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = {
    new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel)
  }

} 
Example 13
Source File: EdgeRDDImpl.scala    From iolap   with Apache License 2.0
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.{OneToOneDependency, HashPartitioner}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

import org.apache.spark.graphx._

class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] (
    @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])],
    val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
  extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {

  override def setName(_name: String): this.type = {
    if (partitionsRDD.name != null) {
      partitionsRDD.setName(partitionsRDD.name + ", " + _name)
    } else {
      partitionsRDD.setName(_name)
    }
    this
  }
  setName("EdgeRDD")

  
  override def count(): Long = {
    partitionsRDD.map(_._2.size.toLong).reduce(_ + _)
  }

  override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] =
    mapEdgePartitions((pid, part) => part.map(f))

  override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse)

  def filter(
      epred: EdgeTriplet[VD, ED] => Boolean,
      vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = {
    mapEdgePartitions((pid, part) => part.filter(epred, vpred))
  }

  override def innerJoin[ED2: ClassTag, ED3: ClassTag]
      (other: EdgeRDD[ED2])
      (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = {
    val ed2Tag = classTag[ED2]
    val ed3Tag = classTag[ED3]
    this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) {
      (thisIter, otherIter) =>
        val (pid, thisEPart) = thisIter.next()
        val (_, otherEPart) = otherIter.next()
        Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag)))
    })
  }

  def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag](
      f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = {
    this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter =>
      if (iter.hasNext) {
        val (pid, ep) = iter.next()
        Iterator(Tuple2(pid, f(pid, ep)))
      } else {
        Iterator.empty
      }
    }, preservesPartitioning = true))
  }

  private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag](
      partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = {
    new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel)
  }

  override private[graphx] def withTargetStorageLevel(
      targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = {
    new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel)
  }

} 
Example 14
Source File: StatefulNetworkWordCount.scala    From iolap   with Apache License 2.0
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.HashPartitioner
import org.apache.spark.streaming._


object StatefulNetworkWordCount {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: StatefulNetworkWordCount <hostname> <port>")
      System.exit(1)
    }

    StreamingExamples.setStreamingLogLevels()

    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      val currentCount = values.sum

      val previousCount = state.getOrElse(0)

      Some(currentCount + previousCount)
    }

    val newUpdateFunc = (iterator: Iterator[(String, Seq[Int], Option[Int])]) => {
      iterator.flatMap(t => updateFunc(t._2, t._3).map(s => (t._1, s)))
    }

    val sparkConf = new SparkConf().setAppName("StatefulNetworkWordCount")
    // Create the context with a 1 second batch size
    val ssc = new StreamingContext(sparkConf, Seconds(1))
    ssc.checkpoint(".")

    // Initial RDD input to updateStateByKey
    val initialRDD = ssc.sparkContext.parallelize(List(("hello", 1), ("world", 1)))

    // Create a ReceiverInputDStream on target ip:port and count the
    // words in input stream of \n delimited test (eg. generated by 'nc')
    val lines = ssc.socketTextStream(args(0), args(1).toInt)
    val words = lines.flatMap(_.split(" "))
    val wordDstream = words.map(x => (x, 1))

    // Update the cumulative count using updateStateByKey
    // This will give a Dstream made of state (which is the cumulative count of the words)
    val stateDstream = wordDstream.updateStateByKey[Int](newUpdateFunc,
      new HashPartitioner(ssc.sparkContext.defaultParallelism), true, initialRDD)
    stateDstream.print()
    ssc.start()
    ssc.awaitTermination()
  }
} 
Example 15
Source File: EdgeRDDImpl.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.{HashPartitioner, OneToOneDependency}
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] (
    @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])],
    val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
  extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {

  override def setName(_name: String): this.type = {
    if (partitionsRDD.name != null) {
      partitionsRDD.setName(partitionsRDD.name + ", " + _name)
    } else {
      partitionsRDD.setName(_name)
    }
    this
  }
  setName("EdgeRDD")

  
  override def count(): Long = {
    partitionsRDD.map(_._2.size.toLong).reduce(_ + _)
  }

  override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] =
    mapEdgePartitions((pid, part) => part.map(f))

  override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse)

  def filter(
      epred: EdgeTriplet[VD, ED] => Boolean,
      vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = {
    mapEdgePartitions((pid, part) => part.filter(epred, vpred))
  }

  override def innerJoin[ED2: ClassTag, ED3: ClassTag]
      (other: EdgeRDD[ED2])
      (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = {
    val ed2Tag = classTag[ED2]
    val ed3Tag = classTag[ED3]
    this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) {
      (thisIter, otherIter) =>
        val (pid, thisEPart) = thisIter.next()
        val (_, otherEPart) = otherIter.next()
        Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag)))
    })
  }

  def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag](
      f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = {
    this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter =>
      if (iter.hasNext) {
        val (pid, ep) = iter.next()
        Iterator(Tuple2(pid, f(pid, ep)))
      } else {
        Iterator.empty
      }
    }, preservesPartitioning = true))
  }

  private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag](
      partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = {
    new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel)
  }

  override private[graphx] def withTargetStorageLevel(
      targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = {
    new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel)
  }

} 
Example 16
Source File: JoinableRDD.scala    From cuesheet   with Apache License 2.0
package com.kakao.cuesheet.convert

import org.apache.spark.HashPartitioner
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

class JoinableRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)]) {

  def selfJoin(numPartitions: Int = rdd.partitions.length): RDD[(K, (V, V))] = fastJoin(rdd, numPartitions)

  def fastJoin[W](other: RDD[(K, W)], numPartitions: Int = rdd.partitions.length): RDD[(K, (V, W))] = {
    val partitioner = new HashPartitioner(numPartitions)
    val grouped = rdd cogroup other

    val left = grouped.flatMap{
      case (k, (vs, ws)) => vs.zipWithIndex.map {
        case (v, idx) => ((k, idx), v)
      }
    }.partitionBy(partitioner)

    val right = grouped.flatMap {
      case (k, (vs, ws)) => ws.map { w => ((k, w.hashCode()), (w, vs.size)) }
    }.partitionBy(partitioner).flatMap {
      case ((k, r), (w, size)) => (0 until size).map(i => ((k, w), i))
    }.map {
      case ((k, w), idx) => ((k, idx), w)
    }

    (left join right).map {
      case ((k, idx), (v, w)) => (k, (v, w))
    }
  }

} 
Example 17
Source File: L3-DStreamWindowAndAction.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.HashPartitioner

object RedditWindowAndActionApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditWindowAndActionApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val checkpointPath = "/tmp"
    ssc.checkpoint(checkpointPath)
    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      val currentCount = values.sum
      val previousCount = state.getOrElse(0)
      Some(currentCount + previousCount)
    }
    val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1))
    val globalCount = keyedBySubredditState.updateStateByKey(updateFunc)
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val distinctSubreddits = comments.map(rec => ((parse(rec)) \ "subreddit").values.toString)
    val windowedRecs = distinctSubreddits.window(Seconds(5), Seconds(5))
    val windowedCounts = windowedRecs.countByValue()

    windowedCounts.print(10)
    windowedCounts.saveAsObjectFiles("subreddit", "obj")
    windowedCounts.saveAsTextFiles("subreddit", "txt")

    globalCount.saveAsHadoopFiles("subreddit", "hadoop",
      classOf[IntWritable], classOf[Text], classOf[TextOutputFormat[IntWritable, Text]])
    globalCount.saveAsNewAPIHadoopFiles("subreddit", "newhadoop",
      classOf[IntWritable], classOf[Text], classOf[NewTextOutputFormat[IntWritable, Text]])
    comments.foreachRDD(rdd => {
      LOG.info("RDD: %s, Count: %d".format(rdd.id, rdd.count()))
    })

    ssc.start()
    ssc.awaitTermination()

  }
} 
Example 18
Source File: L3-DStreamKeyValue.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.HashPartitioner

object RedditKeyValueApp {
  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: RedditKeyValueApp <appname> <input_path> <input_path_popular>")
      System.exit(1)
    }
    val Seq(appName, inputPath, inputPathPopular) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val popular = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPathPopular, (f: Path) => true, newFilesOnly = false).map(pair => pair._2.toString)

    val topAuthors = comments.map(rec => ((parse(rec) \ "author").values.toString, 1))
      .groupByKey()
      .map(r => (r._2.sum, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val topAuthors2 = comments.map(rec => ((parse(rec) \ "author").values.toString, 1))
      .reduceByKey(_ + _)
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val topAuthorsByAvgContent = comments.map(rec => ((parse(rec) \ "author").values.toString, (parse(rec) \ "body").values.toString.split(" ").length))
      .combineByKey(
        (v) => (v, 1),
        (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1),
        (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2),
        new HashPartitioner(ssc.sparkContext.defaultParallelism))
      .map({ case (k, v) => (k, v._1 / v._2.toFloat) })
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val keyedBySubreddit = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec))
    val keyedBySubreddit2 = popular.map(rec => ({
      val t = rec.split(",")
      (t(1).split("/")(4), t(0))
    }))
    val commentsWithIndustry = keyedBySubreddit.join(keyedBySubreddit2)

    val keyedBySubredditCo = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec))
    val keyedBySubredditCo2 = popular.map(rec => ({
      val t = rec.split(",")
      (t(1).split("/")(4), t(0))
    }))
    val commentsWithIndustryCo = keyedBySubredditCo.cogroup(keyedBySubredditCo2)

    val checkpointPath = "/tmp"
    ssc.checkpoint(checkpointPath)
    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      val currentCount = values.sum
      val previousCount = state.getOrElse(0)
      Some(currentCount + previousCount)
    }
    val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1))
    val globalCount = keyedBySubredditState.updateStateByKey(updateFunc)
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    ssc.start()
    ssc.awaitTermination()

  }
} 
Example 19
Source File: L10-2DataProc.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import org.apache.spark.HashPartitioner
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.json4s.DefaultFormats
import org.json4s.JsonAST.JNothing
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

object DataProcApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: DataProcApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    ssc.socketTextStream(hostname, port.toInt)
      .map(r => {
        implicit val formats = DefaultFormats
        parse(r)
      })
      .filter(jvalue => {
        jvalue \ "attributes" \ "Wi-Fi" != JNothing
      })
      .map(jvalue => {
        implicit val formats = DefaultFormats
        ((jvalue \ "attributes" \ "Wi-Fi").extract[String], (jvalue \ "stars").extract[Int])
      })
      .combineByKey(
        (v) => (v, 1),
        (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1),
        (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1, accCombine1._2 + accCombine2._2),
        new HashPartitioner(ssc.sparkContext.defaultParallelism))
      .map({ case (k, v) => (k, v._1 / v._2.toFloat) })
      .print()

    ssc.start()
    ssc.awaitTermination()
  }

} 
Example 20
package com.tomekl007.chapter_5

import com.tomekl007.UserTransaction
import org.apache.spark.sql.SparkSession
import org.apache.spark.{HashPartitioner, Partitioner, RangePartitioner, SparkContext}
import org.scalatest.FunSuite

class CustomRangePartitionerTest extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("should use custom range partitioner") {
    //given
    val keysWithValuesList =
      Array(
        UserTransaction("A", 100),
        UserTransaction("B", 4),
        UserTransaction("A", 100001),
        UserTransaction("B", 10),
        UserTransaction("C", 10)
      )
    val data = spark.parallelize(keysWithValuesList)
    val keyed = data.keyBy(_.amount)

    //when, then
    val partitioned = keyed.partitionBy(new CustomRangePartitioner(List((0,100), (100, 10000), (10000, 1000000))))

    //then
    partitioned.collect().toList
  }
}

class CustomRangePartitioner(ranges: List[(Int, Int)]) extends Partitioner {
  override def numPartitions: Int = ranges.size

  override def getPartition(key: Any): Int = {
    if(!key.isInstanceOf[Int]){
      throw new IllegalArgumentException("partitioner works only for Int type")
    }
    val keyInt = key.asInstanceOf[Int]
    val index = ranges.lastIndexWhere(v => keyInt >= v._1 && keyInt <= v._2)
    println(s"for key: $key return $index")
    index
  }
} 
Example 21
Source File: UsePartitioner.scala    From Hands-On-Big-Data-Analytics-with-PySpark   with MIT License
package com.tomekl007.chapter_5

import com.tomekl007.UserTransaction
import org.apache.spark.{HashPartitioner, RangePartitioner, SparkContext}
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite
import org.scalatest.Matchers._

class UsePartitioner extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("should use different partitioners") {
    //given
    val keysWithValuesList =
      Array(
        UserTransaction("A", 100),
        UserTransaction("B", 4),
        UserTransaction("A", 100001),
        UserTransaction("B", 10),
        UserTransaction("C", 10)
      )
    val data = spark.parallelize(keysWithValuesList)
    val keyed = data.keyBy(_.userId)

    //when, then
    val partitioner = keyed.partitioner
    assert(partitioner.isEmpty)

    val hashPartitioner = keyed.partitionBy(new HashPartitioner(100))
    println(hashPartitioner)
    assert(hashPartitioner.partitioner.isDefined)

    val rangePartitioner = keyed.partitionBy(new RangePartitioner(100, keyed))
    println(rangePartitioner)
    assert(rangePartitioner.partitioner.isDefined)

  }
} 
Example 22
package com.tomekl007.chapter_3

import org.apache.spark.sql.SparkSession
import org.apache.spark.{HashPartitioner, SparkContext}
import org.scalatest.FunSuite
import org.scalatest.Matchers._

class ExecutionPlanForJoins extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext
  test("should use custom partitioner while join") {
    //given
    val transactions = spark.makeRDD(List((1, "bag"), (2, "dog"), (4, "car")))
    val persons = spark.makeRDD(List((1, "Tom"), (2, "Michael"), (3, "Johnny")))

    //when
    val personsDataPartitioner = transactions.partitioner match {
      case Some(p) => p
      case None => new HashPartitioner(persons.partitions.length)
    }


    val res = persons.join(transactions, personsDataPartitioner).collect().toList

    res should contain theSameElementsAs
      List((2, ("Michael", "dog")), (1, ("Tom", "bag")))
  }

  test("can broadcast small data set to every executor and join in-memory") {
    //given
    val smallDataSet = spark.makeRDD(List((1, "bag"), (2, "dog"), (4, "car")))
    val hugeDataSet = spark.makeRDD(List((1, "Tom"), (2, "Michael"), (3, "Johnny")))

    //when broadcast small rdd to all executors
    val smallInMemoryDataSet = spark.broadcast(smallDataSet.collectAsMap())

    //then join will not need to do shuffle
    val res = hugeDataSet.mapPartitions(iter => {
      iter.flatMap {
        case (k, v1) => smallInMemoryDataSet.value.get(k) match {
          case None => Seq.empty
          case Some(v2) => Seq((k, (v1, v2)))
        }
      }
    })

    res.collect().toList should contain theSameElementsAs
      List((2, ("Michael", "dog")), (1, ("Tom", "bag")))
  }

} 
Example 23
Source File: VectorRDDFunctions.scala    From spark-vl-bfgs   with Apache License 2.0
package org.apache.spark.ml.optim

import scala.language.implicitConversions

import org.apache.spark.HashPartitioner
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg._

class VectorRDDFunctions(self: RDD[Vector]) {

  def treeSum(depth: Int = 2): RDD[Vector] = {
    val zeroValue: Vector = null
    val seqOp = (s: Vector, v: Vector) => {
      if (s != null) {
        BLAS.axpy(1.0, v, s)
        s
      } else {
        v.copy.toDense
      }
    }
    val combOp = (s1: Vector, s2: Vector) => {
      // TODO: handle empty partitions
      BLAS.axpy(1.0, s2, s1)
      s1
    }
    require(depth >= 1, s"Depth must be greater than or equal to 1 but got $depth.")
    val aggregatePartition = (it: Iterator[Vector]) => it.aggregate(zeroValue)(seqOp, combOp)
    var partiallyAggregated = self.mapPartitions(it => Iterator(aggregatePartition(it)))
    var numPartitions = partiallyAggregated.partitions.length
    val scale = math.max(math.pow(numPartitions, 1.0 / depth), 2.0)
    while (numPartitions > 1) {
      numPartitions = math.ceil(numPartitions / scale).toInt
      val curNumPartitions = numPartitions
      partiallyAggregated = partiallyAggregated.mapPartitionsWithIndex {
        (i, iter) => iter.map((i % curNumPartitions, _))
      }.reduceByKey(new HashPartitioner(curNumPartitions), combOp)
      .values
    }
    require(partiallyAggregated.partitions.length == 1)
    partiallyAggregated
  }
}

object VectorRDDFunctions {
  implicit def fromVectorRDD(rdd: RDD[Vector]): VectorRDDFunctions = new VectorRDDFunctions(rdd)
} 
Example 24
Source File: ContinuousCoalesceExec.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.execution.streaming.continuous

import java.util.UUID

import org.apache.spark.{HashPartitioner, SparkEnv}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeRow}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.streaming.continuous.shuffle.{ContinuousShuffleReadPartition, ContinuousShuffleReadRDD}


case class ContinuousCoalesceExec(numPartitions: Int, child: SparkPlan) extends SparkPlan {
  override def output: Seq[Attribute] = child.output

  override def children: Seq[SparkPlan] = child :: Nil

  override def outputPartitioning: Partitioning = SinglePartition

  override def doExecute(): RDD[InternalRow] = {
    assert(numPartitions == 1)
    new ContinuousCoalesceRDD(
      sparkContext,
      numPartitions,
      conf.continuousStreamingExecutorQueueSize,
      sparkContext.getLocalProperty(ContinuousExecution.EPOCH_INTERVAL_KEY).toLong,
      child.execute())
  }
} 
Example 25
Source File: VRDDFunctionsSuite.scala    From spark-vlbfgs   with Apache License 2.0
package org.apache.spark.rdd

import scala.collection.mutable

import org.apache.spark.{HashPartitioner, SparkFunSuite}
import org.apache.spark.ml.linalg.distributed.{DistributedVectorPartitioner, VGridPartitioner}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class VRDDFunctionsSuite extends SparkFunSuite with MLlibTestSparkContext {

  import org.apache.spark.rdd.VRDDFunctions._

  override def beforeAll(): Unit = {
    super.beforeAll()
  }

  def testMapJoinPartitions(shuffleRdd2: Boolean): Unit = {
    val sc = spark.sparkContext
    val rdd1 = sc.parallelize(Array.tabulate(81) {
      idx => {
        val rowIdx = idx % 9
        val colIdx = idx / 9
        ((rowIdx, colIdx), (rowIdx, colIdx))
      }
    }).partitionBy(VGridPartitioner(9, 9, 3, 3)).cache()
    rdd1.count()
    val rdd2 = sc.parallelize(Array.tabulate(9)(idx => (idx, idx)))
      .partitionBy(new DistributedVectorPartitioner(9)).cache()
    rdd2.count()

    val rddr = rdd1.mapJoinPartition(rdd2, shuffleRdd2)(
      (x: Int) => {
        val blockColIdx = x / 3
        val pos = blockColIdx * 3
        Array(pos, pos + 1, pos + 2)
      },
      (p1: Int, iter1, list: Array[(Int, Iterator[(Int, Int)])]) => {
        Iterator((p1, list.map(tuple => (tuple._1, tuple._2.next())).mkString(",")))
      }
    )

    assert(rddr.collect() === Array(
      (0, "(0,(0,0)),(1,(1,1)),(2,(2,2))"),
      (1, "(0,(0,0)),(1,(1,1)),(2,(2,2))"),
      (2, "(0,(0,0)),(1,(1,1)),(2,(2,2))"),
      (3, "(3,(3,3)),(4,(4,4)),(5,(5,5))"),
      (4, "(3,(3,3)),(4,(4,4)),(5,(5,5))"),
      (5, "(3,(3,3)),(4,(4,4)),(5,(5,5))"),
      (6, "(6,(6,6)),(7,(7,7)),(8,(8,8))"),
      (7, "(6,(6,6)),(7,(7,7)),(8,(8,8))"),
      (8, "(6,(6,6)),(7,(7,7)),(8,(8,8))")
    ))
  }

  test("mapJoinPartitions V1") {
    testMapJoinPartitions(false)
  }

  test("mapJoinPartitions V2") {
    testMapJoinPartitions(true)
  }

  test("test multiZipRDDs") {
    val rdd1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val rddList = List(rdd1, rdd1.map(_ + 10), rdd1.map(_ + 200))
    val zipped = VRDDFunctions.zipMultiRDDs(rddList) {
      iterList: List[Iterator[Int]] => new Iterator[Int]{
        override def hasNext: Boolean = iterList.map(_.hasNext).reduce(_ && _)
        override def next(): Int = iterList.map(_.next()).sum
      }
    }
    assert(zipped.glom().map(_.toList).collect().toList ===
      List(List(213, 216), List(219, 222)))
  }

  test("aggregateByKeyInMemory") {
    val rdd: RDD[(Int, Int)] = sc.makeRDD(Array(
      (1, 1), (2, 2), (3, 3),
      (1, 10), (2, 20), (3, 30)
    ), 3)
    import org.apache.spark.rdd.VPairRDDFunctions._
    val res = rdd.aggregateByKeyInMemory(new mutable.HashSet[Int], new HashPartitioner(3))(
      (u, v) => u += v,
      (u1, u2) => u1 ++= u2
    ).mapValues(_.toSet).collect()

    assert(res.sortBy(_._1) === Array(
      (1, Set(1, 10)),
      (2, Set(2, 20)),
      (3, Set(3, 30))
    ))
  }
} 
Example 26
Source File: GroupSorted.scala    From spark-sorted   with Apache License 2.0
package com.tresata.spark.sorted.api.java

import java.util.{ Comparator, Iterator => JIterator }
import scala.reflect.ClassTag
import scala.collection.JavaConverters._

import org.apache.spark.{ Partitioner, HashPartitioner }
import org.apache.spark.Partitioner.defaultPartitioner
import org.apache.spark.api.java.JavaPairRDD
import org.apache.spark.api.java.function.{ Function => JFunction, Function2 => JFunction2, FlatMapFunction => JFlatMapFunction }

import com.tresata.spark.sorted.{ GroupSorted => SGroupSorted }

object GroupSorted {
  private case class ComparatorOrdering[T](comparator: Comparator[T]) extends Ordering[T] {
    def compare(x: T, y: T) = comparator.compare(x, y)
  }

  private def comparatorToOrdering[T](comparator: Comparator[T]): Ordering[T] = new ComparatorOrdering(comparator)

  private def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]]

  private implicit def ordering[K]: Ordering[K] = comparatorToOrdering(NaturalComparator.get[K])

  private def groupSort[K, V](javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]): SGroupSorted[K, V] = {
    implicit def kClassTag: ClassTag[K] = javaPairRDD.kClassTag
    implicit def vClassTag: ClassTag[V] = javaPairRDD.vClassTag
    val valueOrdering = Option(valueComparator).map(comparatorToOrdering)
    SGroupSorted(javaPairRDD.rdd, partitioner, valueOrdering)
  }
}

class GroupSorted[K, V] private (sGroupSorted: SGroupSorted[K, V]) extends JavaPairRDD[K, V](sGroupSorted)(GroupSorted.fakeClassTag[K], GroupSorted.fakeClassTag[V]) {
  def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]) =
    this(GroupSorted.groupSort(javaPairRDD, partitioner, valueComparator))

  def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner) =
    this(GroupSorted.groupSort(javaPairRDD, partitioner, null))

  def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int, valueComparator: Comparator[V]) =
    this(javaPairRDD, if (numPartitions > 0) new HashPartitioner(numPartitions) else defaultPartitioner(javaPairRDD.rdd), valueComparator)

  def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int) =
    this(javaPairRDD, numPartitions, null)

  def this(javaPairRDD: JavaPairRDD[K, V], valueComparator: Comparator[V]) =
    this(javaPairRDD, -1, valueComparator)

  def this(javaPairRDD: JavaPairRDD[K, V]) = this(javaPairRDD, -1, null)

  import GroupSorted._

  override def flatMapValues[W](f: JFlatMapFunction[V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.flatMapValues(v => f.call(v).asScala))
  }

  override def mapValues[W](f: JFunction[V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.mapValues(v => f.call(v)))
  }

  def mapKeyValuesToValues[W](f: JFunction[Tuple2[K, V], W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.mapKeyValuesToValues(kv => f.call(kv)))
  }

  def mapStreamByKey[W](f: JFunction[JIterator[V], JIterator[W]]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.mapStreamByKey(it => f.call(it.asJava).asScala))
  }

  def foldLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.foldLeftByKey(w)((w, v) => f.call(w, v)))
  }

  def reduceLeftByKey[W >: V](f: JFunction2[W, V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.reduceLeftByKey(f.call))
  }

  def scanLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.scanLeftByKey(w)((w, v) => f.call(w, v)))
  }
} 
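A minimal usage sketch for this Java API, written against the constructors and methods shown above. It assumes an existing SparkContext sc, hypothetical data, and Scala 2.12's SAM conversion for Spark's Function2 interface:

import java.lang.{Integer => JInt}
import org.apache.spark.HashPartitioner
import org.apache.spark.api.java.JavaPairRDD

// Hypothetical input data wrapped as a JavaPairRDD.
val pairs: JavaPairRDD[String, JInt] = JavaPairRDD.fromRDD(
  sc.parallelize(Seq(("a", 3: JInt), ("a", 1: JInt), ("b", 2: JInt))))

// Hash-partition by key into 2 partitions and sort each key's values ascending.
val grouped = new GroupSorted[String, JInt](
  pairs, new HashPartitioner(2), java.util.Comparator.naturalOrder[JInt]())

// Fold each key's sorted values into a running sum.
val sums = grouped.foldLeftByKey[JInt](0, (acc: JInt, v: JInt) => acc + v)
sums.collect()  // e.g. [(a,4), (b,2)]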
Example 27
Source File: EdgeRDDImpl.scala    From graphx-algorithm   with GNU General Public License v2.0 5 votes vote down vote up
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.{OneToOneDependency, HashPartitioner, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

import org.apache.spark.graphx._

class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] (
    @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])],
    val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
  extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {

  override def setName(_name: String): this.type = {
    if (partitionsRDD.name != null) {
      partitionsRDD.setName(partitionsRDD.name + ", " + _name)
    } else {
      partitionsRDD.setName(_name)
    }
    this
  }
  setName("EdgeRDD")

  
  override def count(): Long = {
    partitionsRDD.map(_._2.size.toLong).reduce(_ + _)
  }

  override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] =
    mapEdgePartitions((pid, part) => part.map(f))

  override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse)

  def filter(
      epred: EdgeTriplet[VD, ED] => Boolean,
      vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = {
    mapEdgePartitions((pid, part) => part.filter(epred, vpred))
  }

  override def innerJoin[ED2: ClassTag, ED3: ClassTag]
      (other: EdgeRDD[ED2])
      (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = {
    val ed2Tag = classTag[ED2]
    val ed3Tag = classTag[ED3]
    this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) {
      (thisIter, otherIter) =>
        val (pid, thisEPart) = thisIter.next()
        val (_, otherEPart) = otherIter.next()
        Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag)))
    })
  }

  def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag](
      f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = {
    this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter =>
      if (iter.hasNext) {
        val (pid, ep) = iter.next()
        Iterator(Tuple2(pid, f(pid, ep)))
      } else {
        Iterator.empty
      }
    }, preservesPartitioning = true))
  }

  private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag](
      partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = {
    new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel)
  }

  override private[graphx] def withTargetStorageLevel(
      targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = {
    new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel)
  }

} 
Example 28
Source File: TestJoins.scala    From spark-dev   with GNU General Public License v3.0 5 votes vote down vote up
package examples

import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner }
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import scala.Iterator



object TestJoins {
	def main(args: Array[String]): Unit = {
		val sc = new SparkContext(new SparkConf().setAppName("TestJoinJob"))

		val x = sc.parallelize(List((1, 2), (1, 3), (2, 3), (2, 4))).partitionBy(new HashPartitioner(2)).cache
		val y = sc.parallelize(List((2, 5), (2, 6))).partitionBy(new HashPartitioner(2)).cache

		inspectRDD(x)
		inspectRDD(y)

		println(">>> joining x with y")
		val joinRDD = x.join(y).cache
		joinRDD.collect().foreach(println)
		inspectRDD(joinRDD)

		println(">>> left outer join of x with y")
		val leftJoin = x.leftOuterJoin(y).cache
		leftJoin.collect().foreach(println)
		inspectRDD(leftJoin)

		println(">>> right outer join of x with y")
		val rightJoin = x.rightOuterJoin(y).cache
		rightJoin.collect().foreach(println)
		inspectRDD(rightJoin)
	}
	
	def inspectRDD[T](rdd: RDD[T]): Unit = {
		
		println(">>> Partition length...")
		rdd.mapPartitions(f => Iterator(f.length), true).foreach(println)
		
		println(">>> Partition data...")
		rdd.foreachPartition(f => f.foreach(println))
	}
} 
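Because x and y are partitioned by the same HashPartitioner(2) and cached, each join above is co-partitioned and needs no additional shuffle. A small sketch of how this could be checked inside main (HashPartitioner compares equal by partition count):

assert(x.partitioner == y.partitioner)        // both Some(HashPartitioner(2))
assert(joinRDD.partitioner == x.partitioner)  // the join reuses the existing partitioner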
Example 29
Source File: TestValueTransformations.scala    From spark-dev   with GNU General Public License v3.0 5 votes vote down vote up
package examples

import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner }
import org.apache.spark.rdd.PairRDDFunctions

case class Customer(ID: Int, name: String)
case class Item(ID: Int, name: String, price: Float)
case class Order(ID: Int, item: Item, quantity: Int, var discount: Float)
case class CustomerOrders(cust: Customer, order: Order, offer: Boolean)

object TestValueTransformations {
	def main(args: Array[String]): Unit = {
		val sc = new SparkContext(new SparkConf().setAppName("TestCombineByKeyJob"))
		val rdd = sc.parallelize(
			List(
				CustomerOrders(Customer(1, "A"), Order(1, Item(1, "item_1", 20), 2, 0), false),
				CustomerOrders(Customer(1, "A"), Order(2, Item(2, "item_2", 10), 1, 0), false),
				CustomerOrders(Customer(2, "B"), Order(1, Item(1, "item_1", 20), 2, 0), true)))

		println(">>> List of customers availing offers")
		
		orderValuePerCustomer.foreach(println)

		println(">>> Total order value for customer ID = 1 is " + orderValuePerCustomer.reduceByKey(_ + _).lookup(1).toString())

	}
} 
Example 30
Source File: RDFS3.scala    From SparkSRE   with Apache License 2.0 5 votes vote down vote up
package com.hj.examples

import com.hj.constant.Const
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object RDFS3 {
  def main(args: Array[String]): Unit = {
    //Arguments: input/RDFS3.in output/RDFS3.out
    if(args.length != 2) {
      System.out.println("Arguments are invalid! \nExample: <input_path> <output_path>")
      System.exit(1)
    }
    val inputPath = args(0)
    val outputPath = args(1)

    val conf = new SparkConf().setAppName("RDFS3.in").setMaster("local[2]")
    val sc = new SparkContext(conf)

    val lines = sc.textFile(inputPath)  //"input/RDFS3.in"

    val triples = lines.map(x => {
      val arr = x.split(" ")
      (arr(0), arr(1), arr(2))
    })

    

    val partitioner = new HashPartitioner(2)

    val range = triples.filter(x => x._2.equals(Const.RDFS_RANGE)).map(x => (x._1, x._3))
    val pso = triples.map(x => (x._2, (x._1, x._3))).partitionBy(partitioner)
    val joined = pso.join(range)
    val res = joined.map(x => (x._2._1._2, x._2._2))
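    // The pipeline above implements RDFS entailment rule rdfs3 (assuming Const.RDFS_RANGE is the
    // rdfs:range IRI): (p rdfs:range c) and (s p o) => (o rdf:type c). `range` keys the range
    // axioms by property, `pso` keys every triple by property, and `res` holds the inferred
    // (object, class) pairs; e.g. (hasAuthor rdfs:range Person) joined with (book1 hasAuthor alice)
    // yields (alice, Person).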

    res.foreach(x => println(x))
    res.saveAsTextFile(outputPath)
  }
} 
Example 31
Source File: RelationWithItemToItem.scala    From AI   with Apache License 2.0 5 votes vote down vote up
package com.bigchange.mllib

import breeze.numerics.{sqrt, pow}
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}



object RelationWithItemToItem {

  def main(args: Array[String]) {

    val sc = new SparkContext(new SparkConf()
      .setAppName("Item to Item")
      .setMaster("local"))
    // number of top related items to keep for each item
    val topK = 2

    val userItem = sc.textFile("/rating.dat")
      .map(_.split("\t")).map(x =>(x(0),x(1),x(2))).distinct().cache()
    // compute item -> (user, rating) and item -> sqrt(sum of squared ratings)
    val itemUser = userItem.map(x => (x._2,(x._1,x._3.toDouble))).partitionBy(new HashPartitioner(20))
    // sqrt: used below to normalize the rating values (each item's L2 norm)
    val itemPowSqrt = userItem.map(x => (x._2,pow(x._3.toDouble,2.0))).reduceByKey(_+_).mapValues(x => sqrt(x))
    // compute item -> ((user, rating), norm) => user -> (item, rating / norm)
    val userItemSqrt = itemUser.join(itemPowSqrt).map(x =>{
      val item = x._1
      val sqrtRatings = x._2._2
      val user = x._2._1._1
      val rating = x._2._1._2
      (user,(item,rating / sqrtRatings))
    })
    // compute item-to-item relations through shared users: each user who rated both items contributes rating1 * rating2 to the (item1, item2) score
    val itemToItem = userItemSqrt.join(userItemSqrt).map(x =>{
      val item1 = x._2._1._1
      val rating1 = x._2._1._2
      val item2 = x._2._2._1
      val rating2 = x._2._2._2
      val score = rating1 * rating2
      if(item1 == item2){
        ((item1,item2),-1.0)
      }else{
        ((item1,item2),score)
      }
    })
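    // Because every rating was divided by its item's L2 norm, summing rating1 * rating2 over the
    // users shared by two items (the reduceByKey below) yields the cosine similarity of the two
    // items' rating vectors; self-pairs are marked -1.0 so the positive-score filter drops them.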

    itemToItem.reduceByKey(_+_).map(x => (x._1._1,(x._1._2,x._2))).groupByKey().foreach(x => {
      val sourceItem = x._1
      val topItem = x._2.toList.filter(_._2 > 0).sortWith(_._2 > _._2).take(topK)
      println(s"item = $sourceItem,topK relative item list:$topItem")
    })
    sc.stop()
  }

} 
Example 32
Source File: EdgeRDDImpl.scala    From drizzle-spark   with Apache License 2.0 4 votes vote down vote up
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.{HashPartitioner, OneToOneDependency}
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] (
    @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])],
    val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
  extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {

  override def setName(_name: String): this.type = {
    if (partitionsRDD.name != null) {
      partitionsRDD.setName(partitionsRDD.name + ", " + _name)
    } else {
      partitionsRDD.setName(_name)
    }
    this
  }
  setName("EdgeRDD")

  
  override def count(): Long = {
    partitionsRDD.map(_._2.size.toLong).reduce(_ + _)
  }

  override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] =
    mapEdgePartitions((pid, part) => part.map(f))

  override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse)

  def filter(
      epred: EdgeTriplet[VD, ED] => Boolean,
      vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = {
    mapEdgePartitions((pid, part) => part.filter(epred, vpred))
  }

  override def innerJoin[ED2: ClassTag, ED3: ClassTag]
      (other: EdgeRDD[ED2])
      (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = {
    val ed2Tag = classTag[ED2]
    val ed3Tag = classTag[ED3]
    this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) {
      (thisIter, otherIter) =>
        val (pid, thisEPart) = thisIter.next()
        val (_, otherEPart) = otherIter.next()
        Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag)))
    })
  }

  def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag](
      f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = {
    this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter =>
      if (iter.hasNext) {
        val (pid, ep) = iter.next()
        Iterator(Tuple2(pid, f(pid, ep)))
      } else {
        Iterator.empty
      }
    }, preservesPartitioning = true))
  }

  private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag](
      partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = {
    new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel)
  }

  override private[graphx] def withTargetStorageLevel(
      targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = {
    new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel)
  }

}