org.apache.spark.graphx.Graph Scala Examples

The following examples show how to use org.apache.spark.graphx.Graph. They are collected from several open-source projects; each example notes its source file, originating project, and license.
Example 1
Source File: OperatorsDSL.scala    From sparkling-graph    with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators

import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection._
import ml.sparkling.graph.api.operators.measures.{EdgeMeasure, VertexMeasureConfiguration}
import ml.sparkling.graph.operators.algorithms.coarsening.labelpropagation.LPCoarsening
import ml.sparkling.graph.operators.algorithms.community.pscan.PSCAN._
import ml.sparkling.graph.operators.algorithms.link.BasicLinkPredictor
import ml.sparkling.graph.operators.measures.edge.{AdamicAdar, CommonNeighbours}
import ml.sparkling.graph.operators.measures.vertex.{Degree, NeighborhoodConnectivity, VertexEmbeddedness}
import ml.sparkling.graph.operators.measures.vertex.clustering.LocalClustering
import ml.sparkling.graph.operators.measures.graph.{FreemanCentrality, Modularity}
import ml.sparkling.graph.operators.partitioning.CommunityBasedPartitioning._
import ml.sparkling.graph.operators.measures.vertex.closenes.Closeness
import ml.sparkling.graph.operators.measures.vertex.eigenvector.EigenvectorCentrality
import ml.sparkling.graph.operators.measures.vertex.hits.Hits
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag


object OperatorsDSL {

  implicit class ModularityDSL[E:ClassTag](graph:Graph[ComponentID,E]){
    def modularity()=Modularity.compute(graph)
  }

  implicit class DSL[VD:ClassTag ,ED:ClassTag](graph:Graph[VD,ED]){
    def PSCAN(epsilon:Double=0.1)=
      computeConnectedComponents(graph,epsilon)

    def LPCoarse(treatAsUndirected:Boolean=false)=LPCoarsening.coarse(graph,treatAsUndirected = treatAsUndirected)

    def closenessCentrality(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])=
      Closeness.compute(graph,vertexMeasureConfiguration)

    def eigenvectorCentrality(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])=
      EigenvectorCentrality.compute(graph,vertexMeasureConfiguration)

    def hits(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])=
      Hits.compute(graph,vertexMeasureConfiguration)

    def degreeCentrality(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])=
      Degree.compute(graph,vertexMeasureConfiguration)

    def neighborhoodConnectivity(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])=
      NeighborhoodConnectivity.compute(graph,vertexMeasureConfiguration)

    def vertexEmbeddedness(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])=
      VertexEmbeddedness.compute(graph,vertexMeasureConfiguration)

    def localClustering(vertexMeasureConfiguration: VertexMeasureConfiguration[VD, ED]=VertexMeasureConfiguration())(implicit num:Numeric[ED])=
      LocalClustering.compute(graph,vertexMeasureConfiguration)

    def freemanCentrality()=FreemanCentrality.compute(graph)

    def partitionBy(communityDetectionMethod:CommunityDetectionMethod[VD,ED])(implicit sc:SparkContext)=
      partitionGraphBy(graph,communityDetectionMethod)

    def partitionBy(communityDetectionMethod:CommunityDetectionAlgorithm,numParts:Int= -1)(implicit sc:SparkContext)=
      partitionGraphUsing(graph,communityDetectionMethod,numParts)

    def adamicAdar(treatAsUndirected:Boolean=false)={
      AdamicAdar.computeWithPreprocessing(graph,treatAsUndirected)
    }

    def commonNeighbours(treatAsUndirected:Boolean=false)={
      CommonNeighbours.computeWithPreprocessing(graph,treatAsUndirected)
    }

    def predictLinks[EV: ClassTag, EO: ClassTag]( edgeMeasure: EdgeMeasure[EO, EV],threshold: EO,treatAsUndirected:Boolean=false)(implicit num: Numeric[EO]) = {
      BasicLinkPredictor.predictLinks(graph, edgeMeasure, threshold, treatAsUndirected)
    }
    }
} 
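Usage note: the implicit classes above become available after importing ml.sparkling.graph.operators.OperatorsDSL._ (exactly as the test examples further down do). A minimal, self-contained sketch, assuming a local SparkContext and a hypothetical whitespace-separated edge-list file:

import ml.sparkling.graph.operators.OperatorsDSL._
import org.apache.spark.graphx.{Graph, GraphLoader}
import org.apache.spark.{SparkConf, SparkContext}

object OperatorsDSLSketch extends App {
  implicit val sc: SparkContext =
    new SparkContext(new SparkConf().setAppName("dsl-sketch").setMaster("local[*]"))

  // edge_list.txt is a hypothetical file with one "src dst" pair per line
  val graph: Graph[Int, Int] = GraphLoader.edgeListFile(sc, "edge_list.txt")

  val communities = graph.PSCAN(epsilon = 0.1) // community id per vertex
  val hitsScores = graph.hits()                // hub/authority scores per vertex
  val freeman = graph.freemanCentrality()      // single graph-level centralization score

  sc.stop()
}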
Example 2
Source File: GraphGeneration.scala    From Mastering-Machine-Learning-with-Spark-2.x    with MIT License
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.lib.TriangleCount
import org.apache.spark.graphx.util.GraphGenerators
import org.apache.spark.graphx.{Graph, GraphLoader, PartitionStrategy, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}


object GraphGeneration extends App {

  val conf = new SparkConf()
    .setAppName("Graph generation")
    .setMaster("local[4]")
  val sc = new SparkContext(conf)

  val edgeListGraph = GraphLoader.edgeListFile(sc, "./edge_list.txt")

  val rawEdges: RDD[(VertexId, VertexId)] = sc.textFile("./edge_list.txt").map {
    line =>
      val field = line.split(" ")
      (field(0).toLong, field(1).toLong)
  }
  val edgeTupleGraph = Graph.fromEdgeTuples(
    rawEdges=rawEdges, defaultValue="")

  val gridGraph = GraphGenerators.gridGraph(sc, 5, 5)
  val starGraph = GraphGenerators.starGraph(sc, 11)
  val logNormalGraph  = GraphGenerators.logNormalGraph(
    sc, numVertices = 20, mu=1, sigma = 3
  )
  logNormalGraph.outDegrees.map(_._2).collect().sorted

  val actorGraph = GraphLoader.edgeListFile(
    sc, "./ca-hollywood-2009.txt", true
  ).partitionBy(PartitionStrategy.RandomVertexCut)
  actorGraph.edges.count()

  val actorComponents = actorGraph.connectedComponents().cache
  actorComponents.vertices.map(_._2).distinct().count

  val clusterSizes =actorComponents.vertices.map(
    v => (v._2, 1)).reduceByKey(_ + _)
  clusterSizes.map(_._2).max
  clusterSizes.map(_._2).min

  val smallActorGraph = GraphLoader.edgeListFile(sc, "./ca-hollywood-2009.txt")
  val strongComponents = smallActorGraph.stronglyConnectedComponents(numIter = 5)
  strongComponents.vertices.map(_._2).distinct().count

  val canonicalGraph = actorGraph.mapEdges(e => 1).removeSelfEdges().convertToCanonicalEdges()
  val partitionedGraph = canonicalGraph.partitionBy(PartitionStrategy.RandomVertexCut)

  actorGraph.triangleCount()
  val triangles = TriangleCount.runPreCanonicalized(partitionedGraph)

  actorGraph.staticPageRank(10)
  val actorPrGraph: Graph[Double, Double] = actorGraph.pageRank(0.0001)
  actorPrGraph.vertices.reduce((v1, v2) => {
    if (v1._2 > v2._2) v1 else v2
  })

  actorPrGraph.inDegrees.filter(v => v._1 == 33024L).collect.foreach(println)

  actorPrGraph.inDegrees.map(_._2).collect().sorted.takeRight(10)

  actorPrGraph.inDegrees.map(_._2).filter(_ >= 62).count

} 
Example 3
Source File: LocalClustering.scala    From sparkling-graph    with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators.measures.vertex.clustering

import it.unimi.dsi.fastutil.longs.LongOpenHashSet
import ml.sparkling.graph.api.operators.measures.{VertexMeasure, VertexMeasureConfiguration}
import ml.sparkling.graph.operators.measures.utils.CollectionsUtils._
import ml.sparkling.graph.operators.measures.utils.{CollectionsUtils, NeighboursUtils}
import ml.sparkling.graph.operators.predicates.AllPathPredicate
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag


// The enclosing object declaration was trimmed from this snippet; restored here
// (LocalClustering is a VertexMeasure[Double] in sparkling-graph) so the braces balance.
object LocalClustering extends VertexMeasure[Double] {
   override def compute[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED],
                                                    vertexMeasureConfiguration: VertexMeasureConfiguration[VD,ED])
                                                   (implicit num: Numeric[ED]) = {
    val firstLevelNeighboursGraph = NeighboursUtils.getWithNeighbours(graph, vertexMeasureConfiguration.treatAsUndirected, AllPathPredicate)
    val localClusteringSums=firstLevelNeighboursGraph.aggregateMessages[Double](
      sendMsg=edgeContext=>{
      def messageCreator=(neighbours1:LongOpenHashSet,neighbours2:LongOpenHashSet)=>{
         intersectSize(neighbours1,neighbours2)
      }
      val message=messageCreator(edgeContext.srcAttr,edgeContext.dstAttr)
      edgeContext.sendToSrc(message)
      if(vertexMeasureConfiguration.treatAsUndirected){
      edgeContext.sendToDst(message)
      }
    },
    mergeMsg=(a,b)=>a+b)
    firstLevelNeighboursGraph.outerJoinVertices(localClusteringSums)((vId,oldValue,newValue)=>(newValue.getOrElse(0d),oldValue)).mapVertices {
      case (vId, (sum, neighbours)) => {
        val possibleConnections = neighbours.size * (neighbours.size - 1)
        if (possibleConnections == 0) 0d else sum / possibleConnections
      }
    }
  }
} 
Example 4
Source File: GraphFromGraphML$Test.scala    From sparkling-graph    with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.loaders.graphml

import ml.sparkling.graph.api.loaders.GraphLoading.LoadGraph
import ml.sparkling.graph.loaders.LoaderTest
import ml.sparkling.graph.loaders.graphml.GraphFromGraphML.{GraphML, GraphProperties}
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph



class GraphFromGraphML$Test(implicit sc:SparkContext)  extends LoaderTest {

  "GraphML with standard format" should "be loaded by default" in{
    Given("XML in GraphML format  path")
    val filePath = getClass.getResource("/simpleGraphML.xml").toString
    When("Loads graph")
    val graph = LoadGraph.from(GraphML(filePath)).load()
    Then("Graph should be loaded correctly")
    graph.vertices.count() should equal(2)
    graph.edges.count() should equal(1)
  }

  "GraphML with standard format and multiple edges" should "be loaded by default" in{
    Given("XML in GraphML format path")
    val filePath = getClass.getResource("/simpleGraphML2.xml").toString
    When("Loads graph")
    val graph = LoadGraph.from(GraphML(filePath)).load()
    Then("Graph should be loaded correctly")
    graph.vertices.count() should equal(3)
    graph.edges.count() should equal(2)
  }


  "GraphML with vertices attributes" should "be loaded by default" in{
    Given("XML in GraphML format  path")
    val filePath = getClass.getResource("/withValuesGraphML.xml").toString
    When("Loads graph")
    val graph: Graph[GraphProperties, GraphProperties] = LoadGraph.from(GraphML(filePath)).load()
    Then("Graph should be loaded correctly")
    graph.vertices.count() should equal(4)
    graph.edges.count() should equal(2)
    graph.vertices.map{
      case (vId,properites)=>(vId,properites("name").asInstanceOf[String])
    }.collect().sorted should equal(List((0l,"name0"),(1l,"name1"),(2l,"name2"),(3l,"name3")))
    graph.vertices.flatMap{
      case (vId,properites)=>properites.get("type").asInstanceOf[Option[String]].map((vId,_))
    }.collect().sorted should equal(List((0l,"type0")))
  }


} 
Example 5
Source File: Neo4jGraphFrame.scala    From neo4j-spark-connector    with Apache License 2.0
package org.neo4j.spark.dataframe

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.sql.SQLContext
import org.neo4j.spark.Neo4jGraph

import org.neo4j.spark.cypher.CypherHelpers._


object Neo4jGraphFrame {

  def apply(sqlContext: SQLContext, src: (String, String), edge: (String, String), dst: (String, String)) = {
    def nodeStmt(s: (String, String)) = s"MATCH (n:${s._1.quote}) RETURN id(n) as id, n.${s._2.quote} as prop"

    val edgeProp = if (edge._2 == null) "" else s", r.${edge._2.quote} as prop"
    val edgeStmt = s"MATCH (n:${src._1.quote})-[r:${edge._1.quote}]->(m:${dst._1.quote}) RETURN id(n) as src, id(m) as dst" + edgeProp

    val vertices1 = Neo4jDataFrame(sqlContext, nodeStmt(src), Seq.empty, ("id", "integer"), ("prop", "string"))
    val vertices2 = Neo4jDataFrame(sqlContext, nodeStmt(dst), Seq.empty, ("id", "integer"), ("prop", "string"))
    val schema = Seq(("src", "integer"), ("dst", "integer")) ++ (if (edge._2 != null) Some(("prop", "string")) else None)
    val edges = Neo4jDataFrame(sqlContext, edgeStmt, Seq.empty, schema: _*)

    org.graphframes.GraphFrame(vertices1.union(vertices2).distinct(), edges)
  }

  def fromGraphX(sc: SparkContext, label1: String, rels: Seq[String], label2: String) = {
    val g: Graph[Any, Int] = Neo4jGraph.loadGraph(sc, label1, rels, label2)
    org.graphframes.GraphFrame.fromGraphX(g)
  }

  def fromEdges(sqlContext: SQLContext, label1: String, rels: Seq[String], label2: String) = {
    val relTypes = rels.map(_.quote).mkString("|")
    val edgeStmt = s"MATCH (n:${label1.quote})-[r:$relTypes]->(m:${label2.quote}) RETURN id(n) as src, id(m) as dst"
    val edges = Neo4jDataFrame(sqlContext, edgeStmt, Seq.empty, ("src", "integer"), ("dst", "integer"))
    org.graphframes.GraphFrame.fromEdges(edges)
  }
} 
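A minimal usage sketch for the apply method above; the node labels, relationship type, and property names are hypothetical, and a Neo4j instance reachable by the connector is assumed:

import org.apache.spark.sql.SparkSession
import org.neo4j.spark.dataframe.Neo4jGraphFrame

object Neo4jGraphFrameSketch extends App {
  val spark = SparkSession.builder().appName("neo4j-gf-sketch").master("local[*]").getOrCreate()

  val graphFrame = Neo4jGraphFrame(
    spark.sqlContext,
    src = ("Person", "name"),  // source node label and property
    edge = ("KNOWS", null),    // relationship type; null means no edge property
    dst = ("Person", "name")   // target node label and property
  )

  graphFrame.vertices.show()
  graphFrame.edges.show()
}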
Example 6
Source File: LoadDsl.scala    From neo4j-spark-connector    with Apache License 2.0
package org.neo4j.spark.dsl

import org.apache.spark.graphx.Graph
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.graphframes.GraphFrame

import scala.reflect.ClassTag

trait LoadDsl {
  def loadRdd[T:ClassTag] : RDD[T]
  def loadRowRdd : RDD[Row]
  def loadNodeRdds : RDD[Row]
  def loadRelRdd : RDD[Row]
  def loadGraph[VD:ClassTag,ED:ClassTag] : Graph[VD,ED]
  def loadGraphFrame[VD:ClassTag,ED:ClassTag] : GraphFrame
  def loadDataFrame : DataFrame
  def loadDataFrame(schema : (String,String)*) : DataFrame
} 
Example 7
Source File: PairwiseBPSuite.scala    From sandpiper    with Apache License 2.0
package sparkle.graph

import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite
import sparkle.util.LocalSparkContext

class PairwiseBPSuite  extends FunSuite with LocalSparkContext {

  test("Pairwise BP test") {
    // Test based on the lectures of MIT EECS course 6.869 (Bill Freeman and Antonio Torralba),
    // chapter 7.3.5, numerical example.

    withSpark { sc =>
      val vertices: RDD[(Long, PVertex)] = sc.parallelize(Seq(
        (1L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 1.0).map(math.log)))),
        (2L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 1.0).map(math.log)))),
        (3L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 1.0).map(math.log)))),
        (4L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 0.0).map(math.log)))))
      )
      val edges = sc.parallelize(Seq(
        Edge(1L, 2L, PEdge(Factor(Array(2, 2), Array(1.0, 0.9, 0.9, 1.0).map(math.log)), Variable(Array(0.0, 0.0)), Variable(Array(0.0, 0.0)))),
        Edge(2L, 3L, PEdge(Factor(Array(2, 2), Array(0.1, 1.0, 1.0, 0.1).map(math.log)), Variable(Array(0.0, 0.0)), Variable(Array(0.0, 0.0)))),
        Edge(2L, 4L, PEdge(Factor(Array(2, 2), Array(1.0, 0.1, 0.1, 1.0).map(math.log)), Variable(Array(0.0, 0.0)), Variable(Array(0.0, 0.0))))
      ))
      val graph = Graph(vertices, edges)
      val bpGraph = PairwiseBP(graph)
      val trueProbabilities = Seq(
        1L -> (1.0 / 2.09 * 1.09, 1.0 / 2.09 * 1.0),
        2L -> (1.0 / 1.1 * 1.0, 1.0 / 1.1 * 0.1),
        3L -> (1.0 / 1.21 * 0.2, 1.0 / 1.21 * 1.01),
        4L -> (1.0, 0.0)).sortBy { case (vid, _) => vid }
      val calculatedProbabilities = bpGraph.vertices.collect().sortBy { case (vid, _) => vid }
      val eps = 10e-5
      calculatedProbabilities.zip(trueProbabilities).foreach {
        case ((_, vertex), (_, (trueP0, trueP1))) =>
          assert(trueP0 - vertex.belief.exp().cloneValues(0) < eps && trueP1 - vertex.belief.exp().cloneValues(1) < eps)
      }
    }

  }

  test("Pariwise BP test with file") {
    withSpark { sc =>
      val graph = PairwiseBP.loadPairwiseGraph(sc, "data/vertex4.txt", "data/edge4.txt")
      val bpGraph = PairwiseBP(graph)
      val trueProbabilities = Seq(
        1L -> (1.0 / 2.09 * 1.09, 1.0 / 2.09 * 1.0),
        2L -> (1.0 / 1.1 * 1.0, 1.0 / 1.1 * 0.1),
        3L -> (1.0 / 1.21 * 0.2, 1.0 / 1.21 * 1.01),
        4L -> (1.0, 0.0)).sortBy { case (vid, _) => vid }
      val calculatedProbabilities = bpGraph.vertices.collect().sortBy { case (vid, _) => vid }
      val eps = 10e-5
      calculatedProbabilities.zip(trueProbabilities).foreach {
        case ((_, vertex), (_, (trueP0, trueP1))) =>
          assert(trueP0 - vertex.belief.exp().cloneValues(0) < eps && trueP1 - vertex.belief.exp().cloneValues(1) < eps)
      }
    }
  }
} 
Example 8
Source File: FastUnfolding.scala    From fastunfolding    with Apache License 2.0
package com.soteradefense.dga.graphx.louvain

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{VertexId, PartitionStrategy, TripletFields, Graph}

import scala.reflect.ClassTag


class FastUnfolding(outputdir: String,
                    minProgress: Int = 1,
                    progressCounter: Int = 1) {

    var qValues = Array[(Int, Double)]()

    def saveLevel(sc: SparkContext,
                  level: Int,
                  q: Double,
                  graph: Graph[MyVertexState, Long]) = {
        graph.vertices.saveAsTextFile(s"${outputdir}/level_${level}_vertices")
        graph.edges.saveAsTextFile(s"${outputdir}/level_${level}_edges")
        //graph.vertices.map( {case (id,v) => ""+id+","+v.internalWeight+","+v.community }).saveAsTextFile(outputdir+"/level_"+level+"_vertices")
        //graph.edges.mapValues({case e=>""+e.srcId+","+e.dstId+","+e.attr}).saveAsTextFile(outputdir+"/level_"+level+"_edges")
        qValues = qValues :+ ((level, q))
        println(s"qValue: $q")

        // overwrite the q values at each level
        sc.parallelize(qValues, 1).saveAsTextFile(s"${outputdir}/qvalues")
    }

    def run[VD: ClassTag](sc: SparkContext, graph: Graph[VD, Long]) = {
        val initialGraph = createGraph(graph)

        val graphWeight = initialGraph.vertices.map(
            vertex => {
                vertex._2.nodeWeight
            }
        ).reduce(_ + _)

        val broadcastGraphWeight = sc.broadcast(graphWeight)

        val initialModularity = initialGraph.vertices.map(
            vertex => {
                vertex._2.in / (2 * graphWeight) - vertex._2.tot * vertex._2.tot / (graphWeight * graphWeight)
            }
        ).reduce(_ + _)

        var level = -1
        var halt = false

        while(!halt) {
            level += 1
            println(s"Starting level ${level}")

            val (currentQ, currentGraph, passes) = runFastUnfolding(sc, initialGraph, minProgress, progressCounter)


        }
    }

    def runFastUnfolding(sc: SparkContext,
                        graph: Graph[MyVertexState, Long],
                        minProgress: Int,
                        progressCounter: Int) = {
        val cachedGraph = graph.cache()
        

    }

    def createGraph[VD: ClassTag](graph: Graph[VD, Long]): Graph[MyVertexState, Long] = {
        val nodeWeights = graph.aggregateMessages[Long](
            cxt => {
                cxt.sendToSrc(cxt.attr)
                cxt.sendToDst(cxt.attr)
            },
            (a, b) => a + b,
            TripletFields.EdgeOnly
        )

        nodeWeights.foreach(result => println(s"nodeweight: ${result._1}, ${result._2}"))


        val louvainGraph = graph.outerJoinVertices(nodeWeights)((vid, data, weightOption) => {
            val weight = weightOption.getOrElse(0L)
            val state = new MyVertexState()
            state.community = vid
            state.changed = false
            state.tot = weight
            state.in = 0
            state.nodeWeight = weight
            state
        }).partitionBy(PartitionStrategy.EdgePartition2D)

        louvainGraph
    }
} 
Example 9
Source File: PeriodicGraphCheckpointer.scala    From Spark-2.3.1    with Apache License 2.0
package org.apache.spark.graphx.util

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.storage.StorageLevel
import org.apache.spark.util.PeriodicCheckpointer



// The class header and the start of persist() were trimmed from this snippet;
// a minimal reconstruction (mirroring Spark's PeriodicGraphCheckpointer) follows
// so the remaining overrides compile.
private[spark] class PeriodicGraphCheckpointer[VD, ED](
    checkpointInterval: Int,
    sc: SparkContext)
  extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) {

  override protected def checkpoint(data: Graph[VD, ED]): Unit = data.checkpoint()

  override protected def isCheckpointed(data: Graph[VD, ED]): Boolean = data.isCheckpointed

  override protected def persist(data: Graph[VD, ED]): Unit = {
    if (data.vertices.getStorageLevel == StorageLevel.NONE) {
      data.vertices.cache()
    }
    if (data.edges.getStorageLevel == StorageLevel.NONE) {
      data.edges.cache()
    }
  }

  override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false)

  override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = {
    data.getCheckpointFiles
  }
} 
Example 10
Source File: SSSPExample.scala    From Spark-2.3.1    with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.graphx

// $example on$
import org.apache.spark.graphx.{Graph, VertexId}
import org.apache.spark.graphx.util.GraphGenerators
// $example off$
import org.apache.spark.sql.SparkSession


object SSSPExample {
  def main(args: Array[String]): Unit = {
    // Creates a SparkSession.
    val spark = SparkSession
      .builder
      .appName(s"${this.getClass.getSimpleName}")
      .getOrCreate()
    val sc = spark.sparkContext

    // $example on$
    // A graph with edge attributes containing distances
    val graph: Graph[Long, Double] =
      GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble)
    val sourceId: VertexId = 42 // The ultimate source
    // Initialize the graph such that all vertices except the root have distance infinity.
    val initialGraph = graph.mapVertices((id, _) =>
        if (id == sourceId) 0.0 else Double.PositiveInfinity)
    val sssp = initialGraph.pregel(Double.PositiveInfinity)(
      (id, dist, newDist) => math.min(dist, newDist), // Vertex Program
      triplet => {  // Send Message
        if (triplet.srcAttr + triplet.attr < triplet.dstAttr) {
          Iterator((triplet.dstId, triplet.srcAttr + triplet.attr))
        } else {
          Iterator.empty
        }
      },
      (a, b) => math.min(a, b) // Merge Message
    )
    println(sssp.vertices.collect.mkString("\n"))
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 11
Source File: RingGenerator.scala    From sparkling-graph    with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.generators.ring

import ml.sparkling.graph.api.generators.{GraphGenerator, GraphGeneratorConfiguration}
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.rdd.RDD


object RingGenerator  extends GraphGenerator[RingGeneratorConfiguration,Int,Int]{
  override def generate(configuration: RingGeneratorConfiguration)(implicit ctx:SparkContext): Graph[Int, Int] = {
    val vertexTuples: RDD[(Long, Long)] =ctx
      .parallelize((0l to configuration.numberOfNodes-1))
      .flatMap(vId=>{
        val nextId=(vId+1) % configuration.numberOfNodes
        val previousId=if(vId-1 < 0) {configuration.numberOfNodes-1} else {vId-1}
        (vId,nextId) :: {if(configuration.undirected) List((vId,previousId)) else Nil}
      }
      )
    Graph.fromEdgeTuples(vertexTuples,1)
  }
}
case class RingGeneratorConfiguration(val numberOfNodes:Long, val undirected:Boolean=false) extends GraphGeneratorConfiguration; 
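A minimal usage sketch for the generator above, assuming a local SparkContext:

import ml.sparkling.graph.generators.ring.{RingGenerator, RingGeneratorConfiguration}
import org.apache.spark.{SparkConf, SparkContext}

object RingGeneratorSketch extends App {
  implicit val ctx: SparkContext =
    new SparkContext(new SparkConf().setAppName("ring-sketch").setMaster("local[*]"))

  // 10-node ring; with undirected=true each vertex also links back to its predecessor
  val ring = RingGenerator.generate(RingGeneratorConfiguration(numberOfNodes = 10, undirected = true))
  println(s"vertices=${ring.vertices.count()}, edges=${ring.edges.count()}")

  ctx.stop()
}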
Example 12
Source File: Neo4jGraphScalaTSE.scala    From neo4j-spark-connector    with Apache License 2.0
package org.neo4j.spark

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.junit.Assert._
import org.junit._

import scala.collection.JavaConverters._

object Neo4jGraphScalaTSE {

}


class Neo4jGraphScalaTSE extends SparkConnectorScalaBaseTSE {
  val FIXTURE: String = "CREATE (s:A {a:0})-[r:REL {foo:'bar'}]->(t:B {b:1}) RETURN id(s) AS source, id(t) AS target"

  private var source: Long = _
  private var target: Long = _

  @Before
  @throws[Exception]
  def setUp {
    val map = SparkConnectorScalaSuiteIT.session().run(FIXTURE).single()
      .asMap()
    source = map.get("source").asInstanceOf[Long]
    target = map.get("target").asInstanceOf[Long]
  }

  private def assertGraph(graph: Graph[_, _], expectedNodes: Long, expectedRels: Long) = {
    assertEquals(expectedNodes, graph.vertices.count)
    assertEquals(expectedRels, graph.edges.count)
  }

  @Test def runCypherQueryWithParams {
    val data = List(Map("id"->1,"name"->"Test").asJava).asJava
    Executor.execute(sc, "UNWIND $data as row CREATE (n:Test {id:row.id}) SET n.name = row.name", Map(("data",data)))
  }
  @Test def runMatrixQuery {
    val graph = Neo4jGraph.loadGraph(sc, "A", Seq.empty, "B")
    assertGraph(graph, 2, 1)
  }

  @Test def saveGraph {
    val edges : RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(source,target,42L)))
    val graph = Graph.fromEdges(edges,-1)
    assertGraph(graph, 2, 1)
    Neo4jGraph.saveGraph(sc,graph,null,("REL","test"))
    assertEquals(42L, SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL]->(:B) RETURN rel.test as prop").single().get("prop").asLong())
  }

  @Test def saveGraphMerge {
    val edges : RDD[Edge[Long]] = sc.makeRDD(Seq(Edge(source,target,42L)))
    val graph = Graph.fromEdges(edges,13L)
    assertGraph(graph, 2, 1)
    Neo4jGraph.saveGraph(sc,graph,"value",("FOOBAR","test"),Option(("Foo","id")),Option(("Bar","id")),merge = true)
    assertEquals(Map("fid"->source,"bid"->target,"rv"->42L,"fv"->13L,"bv"->13L).asJava,SparkConnectorScalaSuiteIT.session().run("MATCH (foo:Foo)-[rel:FOOBAR]->(bar:Bar) RETURN {fid: foo.id, fv:foo.value, rv:rel.test,bid:bar.id,bv:bar.value} as data").single().get("data").asMap())
  }
  @Test def saveGraphByNodeLabel {
    val edges : RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(0,1,42L)))
    val graph = Graph.fromEdges(edges,-1)
    assertGraph(graph, 2, 1)
    Neo4jGraph.saveGraph(sc,graph,null,("REL","test"),Option(("A","a")),Option(("B","b")))
    assertEquals(42L,SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL]->(:B) RETURN rel.test as prop").single().get("prop").asLong())
  }
  @Test def mergeGraphByNodeLabel {
    val edges : RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(source,target,42L)))
    val graph = Graph.fromEdges(edges,-1)
    assertGraph(graph, 2, 1)
    Neo4jGraph.saveGraph(sc,graph,null,("REL2","test"),merge = true)
    assertEquals(42L,SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL2]->(:B) RETURN rel.test as prop").single().get("prop").asLong())
  }

  @Test def saveGraphNodes {
    val nodes : RDD[(VertexId, Long)] = sc.makeRDD(Seq((source,10L),(target,20L)))
    val edges : RDD[Edge[Long]] = sc.makeRDD(Seq())
    val graph = Graph[Long,Long](nodes,edges,-1)
    assertGraph(graph, 2, 0)
    Neo4jGraph.saveGraph(sc,graph,"prop")
    assertEquals(10L,SparkConnectorScalaSuiteIT.session().run(s"MATCH (a:A) WHERE id(a) = $source RETURN a.prop as prop").single().get("prop").asLong())
    assertEquals(20L,SparkConnectorScalaSuiteIT.session().run(s"MATCH (b:B) WHERE id(b) = $target RETURN b.prop as prop").single().get("prop").asLong())
  }
} 
Example 13
Source File: GraphProviders.scala    From sparkling-graph    with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.loaders.csv.providers

import ml.sparkling.graph.loaders.csv.types.Types
import ml.sparkling.graph.loaders.csv.types.Types.ToVertexId
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SparkSession;
import scala.reflect.ClassTag


object GraphProviders {
  val defaultStorageLevel=StorageLevel.MEMORY_ONLY
  def simpleGraphBuilder[VD: ClassTag, ED: ClassTag](defaultVertex: Option[VD]=None,
                                                     vertexProvider: Row => Seq[(VertexId, VD)],
                                                     edgeProvider: Row => Seq[Edge[ED]],
                                                     edgeStorageLevel: StorageLevel = defaultStorageLevel,
                                                     vertexStorageLevel: StorageLevel =defaultStorageLevel)
                                                    (dataFrame: DataFrame): Graph[VD, ED] = {

    def mapRows[MT: ClassTag](mappingFunction: (Row) => Seq[MT]): RDD[MT] = {
      dataFrame.rdd.mapPartitionsWithIndex((id, rowIterator) => {
        rowIterator.flatMap { case row => mappingFunction(row) }
      })
    }

    val vertices: RDD[(VertexId, VD)] = mapRows(vertexProvider)
    val edges: RDD[Edge[ED]] = mapRows(edgeProvider)
    defaultVertex match{
      case None => Graph(vertices,edges,edgeStorageLevel=edgeStorageLevel,vertexStorageLevel=vertexStorageLevel)
      case Some(defaultVertexValue)=> Graph(vertices,edges,defaultVertexValue,edgeStorageLevel,vertexStorageLevel)
    }

  }

  def indexedGraphBuilder[VD:ClassTag, ED: ClassTag](defaultVertex: Option[VD]=None,
                                                      vertexProvider: (Row, ToVertexId[VD]) => Seq[(VertexId, VD)],
                                                      edgeProvider: (Row, ToVertexId[VD]) => Seq[Edge[ED]],
                                                      columnsToIndex: Seq[Int],
                                                      edgeStorageLevel: StorageLevel = defaultStorageLevel,
                                                      vertexStorageLevel: StorageLevel = defaultStorageLevel)
                                                     (dataFrame: DataFrame): Graph[VD, ED] = {
    val index = dataFrame.rdd.flatMap(row => columnsToIndex.map(row(_))).distinct().zipWithUniqueId().collect().toMap
    def extractIdFromIndex(vertex: VD) = index(vertex)
    simpleGraphBuilder(defaultVertex,
      vertexProvider(_: Row, extractIdFromIndex _),
      edgeProvider(_: Row, extractIdFromIndex _),
      edgeStorageLevel,
      vertexStorageLevel)(dataFrame)

  }
} 
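A minimal sketch of driving simpleGraphBuilder with an in-memory two-column edge DataFrame; the column layout and the constant vertex/edge attributes are assumptions made for illustration:

import ml.sparkling.graph.loaders.csv.providers.GraphProviders
import org.apache.spark.graphx.Edge
import org.apache.spark.sql.{Row, SparkSession}

object GraphProvidersSketch extends App {
  val spark = SparkSession.builder().appName("providers-sketch").master("local[*]").getOrCreate()
  import spark.implicits._

  // Hypothetical edge list: each row holds a numeric (src, dst) pair
  val edgesDf = Seq((1L, 2L), (2L, 3L), (3L, 1L)).toDF("src", "dst")

  val graph = GraphProviders.simpleGraphBuilder[Int, Int](
    defaultVertex = Some(1),
    vertexProvider = (row: Row) => Seq((row.getLong(0), 1), (row.getLong(1), 1)),
    edgeProvider = (row: Row) => Seq(Edge(row.getLong(0), row.getLong(1), 1))
  )(edgesDf)

  println(s"vertices=${graph.vertices.count()}, edges=${graph.edges.count()}")
  spark.stop()
}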
Example 14
Source File: GraphMLLoader.scala    From sparkling-graph    with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.loaders.graphml

import com.databricks.spark.xml._
import ml.sparkling.graph.loaders.graphml.GraphMLFormat._
import ml.sparkling.graph.loaders.graphml.GraphMLTypes.TypeHandler
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext, SparkSession}

import scala.collection.mutable
import scala.util.Try


// The enclosing object declaration and the ValuesMap alias were trimmed from this
// snippet; restored here so the example is self-contained.
object GraphMLLoader {
  type ValuesMap = Map[String, Any]

  def loadGraphFromML(path: String)(implicit sc: SparkContext): Graph[ValuesMap, ValuesMap] = {
    val sparkSession=SparkSession.builder().getOrCreate();

    val graphDataFrame = sparkSession.sqlContext.read
      .format("com.databricks.spark.xml")
      .option("attributePrefix","@")
      .option("valueTag","#VALUE")
      .option("rowTag",graphTag).load(path).rdd

    val keys =sparkSession.sqlContext.read
      .format("com.databricks.spark.xml")
      .option("attributePrefix","@")
      .option("valueTag","#VALUE")
      .option("rowTag",graphMLTag).load(path).rdd
      .flatMap(r => Try(r.getAs[mutable.WrappedArray[Row]](keyTag).toArray).getOrElse(Array.empty))

    val nodesKeys = keys
      .filter(r => r.getAs[String](forAttribute) == nodeTag)
    val edgeKeys = keys
      .filter(r => r.getAs[String](forAttribute) == edgeTag)

    val nodeAttrHandlers = createAttrHandlersFor(nodesKeys)
    val edgeAttrHandlers = createAttrHandlersFor(edgeKeys)

    val verticesWithData = graphDataFrame.flatMap(r => r.getAs[Any](nodeTag) match {
      case data: mutable.WrappedArray[Row@unchecked] => data.array
      case data: Row => Array(data)
    })

    val verticesIndex = verticesWithData.map(r => r.getAs[String](idAttribute)).zipWithUniqueId().collect().toMap

    val vertices: RDD[(VertexId, Map[String, Any])] = verticesWithData
      .map(
        r => (verticesIndex(r.getAs[String](idAttribute)), extractAttributesMap(nodeAttrHandlers, r))
      )

    val edgesRows = graphDataFrame.flatMap(r => r.getAs[Any](edgeTag) match {
      case data: mutable.WrappedArray[Row@unchecked] => data.array
      case data: Row => Array(data)
    })
      .map(r => Edge(
        verticesIndex(r.getAs[String](sourceAttribute)),
        verticesIndex(r.getAs[String](targetAttribute)),
        extractAttributesMap(edgeAttrHandlers, r)
      ))
    Graph(vertices, edgesRows)
  }

  def extractAttributesMap(attrHandlers: Map[String, GraphMLAttribute], r: Row): Map[String, Any] = {
    Try(r.getAs[mutable.WrappedArray[Row]](dataTag)).toOption.map(
      _.map(r => {
        val attribute = attrHandlers(r.getAs[String](keyAttribute))
        (attribute.name, attribute.handler(r.getAs[String](tagValue)))
      }).toMap
    ).getOrElse(Map.empty) + ("id" -> r.getAs[String](idAttribute))
  }

  def createAttrHandlersFor(keys: RDD[Row]): Map[String, GraphMLAttribute] = {
    keys
      .map(r => (r.getAs[String](idAttribute), GraphMLAttribute(r.getAs[String](nameAttribute), GraphMLTypes(r.getAs[String](typeAttribute)))))
      .collect().toMap
  }
} 
Example 15
Source File: PSCAN$Test.scala    From sparkling-graph    with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators.algorithms.community.pscan

import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.ComponentID
import ml.sparkling.graph.operators.MeasureTest
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import ml.sparkling.graph.operators.OperatorsDSL._
import org.apache.spark.graphx.util.GraphGenerators

class PSCAN$Test (implicit sc:SparkContext)   extends MeasureTest {

  "Components for full graph" should  " be computed" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/4_nodes_full")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes components")
    val components: Graph[ComponentID, Int] = PSCAN.computeConnectedComponents(graph)
    Then("Should compute components correctly")
    components.vertices.map{case (vId,cId)=>cId}.distinct().collect().size  should equal (1)
    graph.unpersist(true)
  }


  "Components for full graph" should  " be computed using DSL" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/4_nodes_full")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes components")
    val components: Graph[ComponentID, Int] =graph.PSCAN()
    Then("Should compute components correctly")
    components.vertices.map{case (vId,cId)=>cId}.distinct().collect().size  should equal (1)
    graph.unpersist(true)
  }

  "Components for ring graph" should  " be computed" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/5_nodes_directed")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes components")
    val components: Graph[ComponentID, Int] = PSCAN.computeConnectedComponents(graph)
    Then("Should compute components correctly")
    components.vertices.map{case (vId,cId)=>cId}.distinct().collect().size  should equal (5)
    graph.unpersist(true)
  }
  "Components for 3 component graph" should  " be computed" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/coarsening_to_3")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes components")
    val components: Graph[ComponentID, Int] = PSCAN.computeConnectedComponents(graph)
    Then("Should compute components correctly")
    components.vertices.map{case (vId,cId)=>cId}.distinct().collect().size  should equal (3)
    graph.unpersist(true)
  }

  "Dynamic components detection for 3 component graph" should  " be computed" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/coarsening_to_3")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes components")
    val (_,numberOfComponents)= PSCAN.computeConnectedComponentsUsing(graph,3)
    Then("Should compute components correctly")
    numberOfComponents should equal (3)
    graph.unpersist(true)
  }

  "Dynamic components detection  for RMAT graph" should  " be computed" in{
    for(x<- 0 to 10){
      Given("graph")
      val graph:Graph[Int,Int]=GraphGenerators.rmatGraph(sc,33,132)
      When("Computes components")
      val (_,numberOfComponents)= PSCAN.computeConnectedComponentsUsing(graph,24)
      Then("Should compute components correctly")
      numberOfComponents  should equal (24l +- 5l)
      graph.unpersist(true)
    }
  }

  "Dynamic components detection  for random graph" should  " be computed" in{
    Given("graph")
    val graph:Graph[Int,Int]=GraphGenerators.rmatGraph(sc,1000,10000)
    When("Computes components")
    val (_,numberOfComponents)= PSCAN.computeConnectedComponentsUsing(graph,24)
    Then("Should compute components correctly")
    numberOfComponents  should equal (24l +- 5l)
    graph.unpersist(true)
  }

} 
Example 16
Source File: BasicLinkPredictor$Test.scala    From sparkling-graph    with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators.algorithms.link

import ml.sparkling.graph.operators.MeasureTest
import ml.sparkling.graph.operators.measures.edge.CommonNeighbours
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import ml.sparkling.graph.operators.OperatorsDSL._


class BasicLinkPredictor$Test (implicit sc:SparkContext) extends MeasureTest {

  "In open triad" should  " propose to close it" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/3_nodes_directed")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes new links")
    val links = BasicLinkPredictor.predictLinks(graph,CommonNeighbours,0,true)
    Then("Should compute links correctly")
   links.collect() should equal(Array((1,3)))
    graph.unpersist(true)
  }

  "In open 4 nodes graph" should  " propose to close it fully" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/4_nodes_open")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes new links")
    val links = graph.predictLinks(CommonNeighbours,1,true)
    Then("Should compute links correctly")
    links.collect().toSet should equal(Set((1,3),(2,4)))
    graph.unpersist(true)
  }
} 
Example 17
Source File: MeasureTest.scala    From sparkling-graph    with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators

import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Graph, GraphLoader}
import org.scalatest._


abstract class MeasureTest(implicit sc:SparkContext)  extends FlatSpec with BeforeAndAfterAll with GivenWhenThen with Matchers{
  def time[T](str: String)(thunk: => T): (T,Long) = {
    logger.info(s"$str...")
    val t1 = System.currentTimeMillis
    val x = thunk
    val t2 = System.currentTimeMillis
    val diff=t2 - t1
    logger.info(s"$diff ms")
    (x,diff)
  }

  val logger=Logger.getLogger(this.getClass)

  def loadGraph(file:String)={
    val out: Graph[Int, Int] =GraphLoader.edgeListFile(sc,file.toString)
    out.vertices.setName(s"Graph vertices ${file}")
    out.edges.setName(s"Graph edges ${file}")
    out.triplets.setName(s"Graph triplets ${file}")
    out
  }

} 
Example 18
Source File: Modularity$Test.scala    From sparkling-graph    with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators.measures.graph

import ml.sparkling.graph.operators.MeasureTest
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import ml.sparkling.graph.operators.OperatorsDSL._
import org.apache.spark.graphx.util.GraphGenerators


class Modularity$Test (implicit sc:SparkContext)   extends MeasureTest{

  "Modularity  for star graph in one community" should "be 0" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/6_nodes_star")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    val graphComponents=graph.PSCAN(epsilon = 0)
    When("Computes Modularity")
    val result=Modularity.compute(graphComponents)
    Then("Should calculate Modularity")
    result should be (0)
    graph.unpersist(true)
  }


  "Modularity  for ring graph in one  community" should "be 0" in{
    Given("graph")
    val graph=GraphGenerators.gridGraph(sc,5,5).mapEdges((_)=>1).mapVertices((_,_)=>1)
    val graphComponents=graph.PSCAN(epsilon = 0)
    When("Computes Modularity")
    val result=Modularity.compute(graphComponents)
    Then("Should calculate Modularity")
    result should be (0)
    graph.unpersist(true)
  }

  "Modularity  for ring graph in one node communities" should "be -0.041875" in{
    Given("graph")
    val graph=GraphGenerators.gridGraph(sc,5,5)
    val graphComponents=graph.PSCAN(epsilon = 1)
    When("Computes Modularity")
    val result=Modularity.compute(graphComponents)
    Then("Should calculate Modularity")
    result should be (-0.041875 +- 0.000000001)
    graph.unpersist(true)
  }

  "Modularity  for star graph in one community" should "be 0 when calculated using DSL" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/6_nodes_star")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    val graphComponents=graph.PSCAN(epsilon = 0)
    When("Computes Modularity")
    val result=graphComponents.modularity()
    Then("Should calculate Modularity")
    result should be (0)
    graph.unpersist(true)
  }

  "Modularity  for all single components" should "be -1 " in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/6_nodes_star")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    val graphComponents=graph.PSCAN(epsilon=1)
    When("Computes Modularity")
    val result=graphComponents.modularity()
    Then("Should calculate Modularity")
    result should be (-0.3 +- 0.000000001)
    graph.unpersist(true)
  }


} 
Example 19
Source File: FreemanCentrality$Test.scala    From sparkling-graph    with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators.measures.graph

import ml.sparkling.graph.operators.MeasureTest
import ml.sparkling.graph.operators.OperatorsDSL._
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph

class FreemanCentrality$Test (implicit sc:SparkContext)   extends MeasureTest  {

  "Freeman Centrality  for star graph" should "be 1" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/6_nodes_star")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes Freemans Centrality")
    val result=FreemanCentrality.compute(graph)
    Then("Should calculate Freemans Centrality")
    result should be (1)
    graph.unpersist(true)
  }

  "Freeman Centrality  for star graph" should "be 1 when calculated using DSL" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/6_nodes_star")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes Freemans Centrality")
    val result=graph.freemanCentrality()
    Then("Should calculate Freemans Centrality")
    result should be (1)
    graph.unpersist(true)
  }


  "Freeman Centrality  for 5 node line graph" should "be 0.167" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/5_nodes_directed")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes Freemans Centrality")
    val result=FreemanCentrality.compute(graph)
    Then("Should calculate Freemans Centrality")
    result should be (0.16666666 +- 1e-5)
    graph.unpersist(true)
  }


} 
Example 20
Source File: Hits$Test.scala    From sparkling-graph    with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators.measures.vertex.hits

import ml.sparkling.graph.operators.MeasureTest
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import ml.sparkling.graph.operators.OperatorsDSL._

class Hits$Test(implicit sc:SparkContext)  extends MeasureTest  {



  "Hits  for line graph" should "be correctly calculated" in {
    Given("graph")
    val filePath = getClass.getResource("/graphs/5_nodes_directed")
    val graph: Graph[Int, Int] = loadGraph(filePath.toString)
    When("Computes Hits")
    val result = Hits.computeBasic(graph)
    Then("Should calculate hits correctly")
    result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array(
      (0.25,0d), (0.25,0.25),(0.25,0.25),(0.25,0.25),(0d,0.25)
    )).foreach {
      case ((a,b),(c,d)) => {
        a should be (c +- 1e-5)
        b should be (d +- 1e-5)
      }
    }
    graph.unpersist(true)
  }

  "Hits  for line graph" should "be correctly calculated using DSL" in {
    Given("graph")
    val filePath = getClass.getResource("/graphs/5_nodes_directed")
    val graph: Graph[Int, Int] = loadGraph(filePath.toString)
    When("Computes Hits")
    val result = graph.hits()
    Then("Should calculate hits correctly")
    result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array(
      (0.25,0d), (0.25,0.25),(0.25,0.25),(0.25,0.25),(0d,0.25)
    )).foreach {
      case ((a,b),(c,d)) => {
        a should be (c +- 1e-5)
        b should be (d +- 1e-5)
      }
    }
    graph.unpersist(true)
  }

  "Hits for full 4 node directed graph" should "be correctly calculated" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/4_nodes_full")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes Hits")
    val result=Hits.computeBasic(graph)
    Then("Should calculate Hits correctly")
    result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array(
      (0.44504187450168503,0.19806226306818242),
      (0.19806226497496957,0.4450418674109515),
      (1.9336832073590722e-13,0.3568958695205176),
      (0.35689586676523016,3.484376742610991e-13)
    )).foreach {
      case ((a,b),(c,d)) => {
        a should be (c +- 1e-5)
        b should be (d +- 1e-5)
      }
    }
    graph.unpersist(true)
  }



} 
Example 21
Source File: Neo4jPersistence.scala    From csb    with GNU General Public License v3.0
package edu.msstate.dasi.csb.persistence

import java.io.{File, PrintWriter}

import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import edu.msstate.dasi.csb.util.Util
import org.apache.hadoop.fs.FileUtil
import org.apache.spark.graphx.Graph

object Neo4jPersistence extends GraphPersistence {
  private val vertices_suffix = "_nodes"
  private val edges_suffix = "_relationships"

  
  def saveAsText(graph: Graph[VertexData, EdgeData], graphName: String, overwrite :Boolean = false): Unit = {
    val verticesPath = graphName + vertices_suffix
    val verticesTmpPath = "__" + verticesPath
    val edgesPath = graphName + edges_suffix
    val edgesTmpPath = "__" + edgesPath

    if (overwrite) {
      FileUtil.fullyDelete(new File(verticesPath + "-header"))
      FileUtil.fullyDelete(new File(verticesPath))
      FileUtil.fullyDelete(new File(edgesPath + "-header"))
      FileUtil.fullyDelete(new File(edgesPath))
    }

    val nodeHeader = s"name:ID($graphName),:LABEL\n"

    val nodeHeaderWriter = new PrintWriter(new File(verticesPath + "-header"))
    nodeHeaderWriter.write(nodeHeader)
    nodeHeaderWriter.close()

    graph.vertices.map {
      case (id, _) => s"$id,$graphName"
    }.saveAsTextFile(verticesTmpPath)

    Util.merge(verticesTmpPath, verticesPath)
    FileUtil.fullyDelete(new File(verticesTmpPath))

    val relationshipHeader = s":START_ID($graphName),:END_ID($graphName),:TYPE,${EdgeData.neo4jCsvHeader}\n"

    val relHeaderWriter = new PrintWriter(new File(edgesPath + "-header"))
    relHeaderWriter.write(relationshipHeader)
    relHeaderWriter.close()

    graph.edges.map(edge =>
      edge.attr match {
        case edgeData: EdgeData => s"${edge.srcId},${edge.dstId},EDGE,${edgeData.toCsv}"
        case _ => s"${edge.srcId},${edge.dstId},EDGE"
      }
    ).saveAsTextFile(edgesTmpPath)

    Util.merge(edgesTmpPath, edgesPath)
    FileUtil.fullyDelete(new File(edgesTmpPath))
  }
} 
Example 22
Source File: ClosenessCentrality.scala    From csb    with GNU General Public License v3.0
package edu.msstate.dasi.csb.workload.spark

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.{EdgeDirection, Graph, VertexId}

import scala.collection.mutable
import scala.reflect.ClassTag


  def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
    getClosenessOfVert(vertex, graph)
  }

  private class DistanceNodePair(var distance: Long, var totalPairs: Long) extends Comparable[DistanceNodePair] {

    override def compareTo(dp: DistanceNodePair): Int = (this.distance - dp.distance).toInt
  }

  private class NodeVisitCounter extends java.io.Serializable {

    var totalPairs: Long = _

    var levelSize: mutable.HashMap[Long, Long] = _ // key: distance, value: number of vertices at that distance
  }

  private def BFSNode[VD: ClassTag, ED: ClassTag](nID: Long, graph: Graph[VD, ED]): NodeVisitCounter = {
    val q = new mutable.Queue[Long]()
    q.enqueue(nID)
    val visited = new mutable.HashSet[VertexId]()
    val levelSize = new mutable.HashMap[Long, Long]()
    visited.add(nID)
    var totalPairs: Long = 0
    val visitCounter = new NodeVisitCounter()
    var level = 0
    while (q.nonEmpty) {
      val size = q.size
      totalPairs += size
      if (level != 0) {
        levelSize.put(level, size)
      }

      val list: Array[Long] = new Array[Long](size)
      for (x <- 0 until size) {
        list(x) = q.dequeue()
      }
      var children: Array[VertexId] = null
      if (list.length > 0) {
        for (x <- list) {
          val node: VertexId = x
          if (graph.collectNeighborIds(EdgeDirection.Out).lookup(node).nonEmpty) {
            children = graph.collectNeighborIds(EdgeDirection.Out).lookup(node).head
            //        children = hashmap.value.get(x).head
            for (c: Long <- children) {
              // val childNode = graph.vertices.lookup(c) //hashmap.value.get(c).head
              if (!visited.contains(c)) {
                q.enqueue(c)
                visited.add(c)
              }
            }
          }
        }
      }
      level += 1
    }
    totalPairs -= 1

    visitCounter.levelSize = levelSize
    visitCounter.totalPairs = totalPairs

    visitCounter
  }

  private def getClosenessOfVert[VD: ClassTag, ED: ClassTag](vertex: VertexId, graph: Graph[VD, ED]): Double = {
    val visitCenter = BFSNode(vertex, graph)

    var denominator: Long = 0L
    for (x <- visitCenter.levelSize.keySet) {
      denominator += visitCenter.levelSize.get(x).head * x
    }
    if (denominator == 0) return -1
    val count = graph.vertices.count().toDouble
    count / denominator
  }
} 
Example 23
Source File: SSSP.scala    From csb    with GNU General Public License v3.0
package edu.msstate.dasi.csb.workload.spark

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.{Graph, VertexId}

import scala.reflect.ClassTag


  def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
    for (dst <- graph.vertices.keys.toLocalIterator) {
      bfs(graph, src, dst)
    }
  }

  private def bfs[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], src: VertexId, dst: VertexId): Unit = {
    // if (src == dst) return List(src)
    if (src == dst) return

    // The attribute of each vertex is (dist from src, id of vertex with dist-1)
    var g: Graph[(Int, VertexId), ED] = graph.mapVertices((id, _) => (if (id == src) 0 else Int.MaxValue, 0L)).cache()

    // Traverse forward from src
    var dstAttr = (Int.MaxValue, 0L)
    while (dstAttr._1 == Int.MaxValue) {
      val msgs = g.aggregateMessages[(Int, VertexId)](e => if (e.srcAttr._1 != Int.MaxValue && e.srcAttr._1 + 1 < e.dstAttr._1) {
        e.sendToDst((e.srcAttr._1 + 1, e.srcId))
      }, (a, b) => if (a._1 < b._1) a else b).cache()

      // if (msgs.count == 0) return List.empty
      if (msgs.count == 0) return

      g = g.ops.joinVertices(msgs) { (_, oldAttr, newAttr) =>
        if (newAttr._1 < oldAttr._1) newAttr else oldAttr
      }.cache()

      dstAttr = g.vertices.filter(_._1 == dst).first()._2
    }

    // Traverse backward from dst and collect the path
    var path: List[VertexId] = dstAttr._2 :: dst :: Nil
    while (path.head != src) {
      path = g.vertices.filter(_._1 == path.head).first()._2._2 :: path
    }

    // path
  }
} 
Example 24
Source File: BFS.scala    From csb    with GNU General Public License v3.0
package edu.msstate.dasi.csb.workload.spark

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.{Graph, VertexId}

import scala.reflect.ClassTag


  def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
    // if (src == dst) return List(src)
    if (src == dst) return

    // The attribute of each vertex is (dist from src, id of vertex with dist-1)
    var g: Graph[(Int, VertexId), ED] = graph.mapVertices((id, _) => (if (id == src) 0 else Int.MaxValue, 0L)).cache()

    // Traverse forward from src
    var dstAttr = (Int.MaxValue, 0L)
    while (dstAttr._1 == Int.MaxValue) {
      val msgs = g.aggregateMessages[(Int, VertexId)](e => if (e.srcAttr._1 != Int.MaxValue && e.srcAttr._1 + 1 < e.dstAttr._1) {
        e.sendToDst((e.srcAttr._1 + 1, e.srcId))
      }, (a, b) => if (a._1 < b._1) a else b).cache()

      // if (msgs.count == 0) return List.empty
      if (msgs.count == 0) return

      g = g.ops.joinVertices(msgs) { (_, oldAttr, newAttr) =>
        if (newAttr._1 < oldAttr._1) newAttr else oldAttr
      }.cache()

      dstAttr = g.vertices.filter(_._1 == dst).first()._2
    }

    // Traverse backward from dst and collect the path
    var path: List[VertexId] = dstAttr._2 :: dst :: Nil
    while (path.head != src) {
      path = g.vertices.filter(_._1 == path.head).first()._2._2 :: path
    }

    // path
  }
} 
Example 25
Source File: ConnectedComponents.scala    From csb    with GNU General Public License v3.0
package edu.msstate.dasi.csb.workload.neo4j

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag


  def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
    val query = "MATCH (n) WITH COLLECT(n) as nodes " +
      "RETURN REDUCE(graphs = [], n in nodes | " +
      "case when " +
      "ANY (g in graphs WHERE shortestPath( (n)-[*]-(g) ) ) " +
      "then graphs " +
      "else graphs + [n]" +
      "end );"

    engine.run(query)
  }
} 
Example 26
Source File: BetweennessCentrality.scala    From csb    with GNU General Public License v3.0
package edu.msstate.dasi.csb.workload.neo4j

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag


  def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
    val query = s"MATCH (n), pthroughn = shortestPath((a)-[*..$hops]->(b)) " +
      "WHERE n IN nodes(pthroughn) AND n <> a AND n <> b AND a <> b " +
      "WITH n,a,b,count(pthroughn) AS sumn " +
      s"MATCH p = shortestPath((a)-[*..$hops]->(b)) " +
      "WITH n, a, b, tofloat(sumn)/ tofloat(count(p)) AS fraction " +
      "RETURN n, sum(fraction);"

    engine.run(query)
  }
} 
Example 27
Source File: StronglyConnectedComponents.scala    From csb    with GNU General Public License v3.0
package edu.msstate.dasi.csb.workload.neo4j

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag


  def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
    val query = "MATCH (n) " +
      "WITH COLLECT(n) as nodes " +
      "RETURN REDUCE(graphs = [], n in nodes | " +
      "case when " +
      "ANY (g in graphs WHERE (shortestPath( (n)-[*]->(g) ) AND shortestPath( (n)<-[*]-(g) ) ) ) " +
      "then graphs " +
      "else graphs + [n] " +
      "end ) "

    engine.run(query)
  }
} 
Example 28
Source File: PageRank.scala    From csb    with GNU General Public License v3.0
package edu.msstate.dasi.csb.workload.neo4j

import edu.msstate.dasi.csb.workload.Workload
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag


  def run[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED]): Unit = {
    val query = "MATCH (a) " +
      "set a.pagerank = 0.0 " +
      "WITH collect(distinct a) AS nodes,count(a) as num_nodes " +
      "UNWIND nodes AS a " +
      "MATCH (a)-[r]-(b) " +
      "WITH a,collect(r) AS rels, count(r) AS num_rels, 1.0/num_nodes AS rank " +
      "UNWIND rels AS rel " +
      "SET endnode(rel).pagerank = " +
      "CASE " +
      "WHEN num_rels > 0 AND id(startnode(rel)) = id(a) THEN " +
      "endnode(rel).pagerank + rank/(num_rels) " +
      "ELSE endnode(rel).pagerank " +
      "END " +
      ",startnode(rel).pagerank = " +
      "CASE " +
      "WHEN num_rels > 0 AND id(endnode(rel)) = id(a) THEN " +
      "startnode(rel).pagerank + rank/(num_rels) " +
      "ELSE startnode(rel).pagerank " +
      "END " +
      "WITH collect(distinct a) AS a,rank " +
      "RETURN a"

    engine.run(query)
  }
} 
Example 29
Source File: SparkPersistence.scala    From csb   with GNU General Public License v3.0 5 votes vote down vote up
package edu.msstate.dasi.csb.persistence

import java.io.File

import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import edu.msstate.dasi.csb.sc
import edu.msstate.dasi.csb.util.Util
import org.apache.hadoop.fs.FileUtil
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.storage.StorageLevel

object SparkPersistence extends GraphPersistence {
  private val vertices_suffix = "_vertices"
  private val edges_suffix = "_edges"

  
  def saveAsText(graph: Graph[VertexData, EdgeData], graphName: String, overwrite: Boolean = false): Unit = {
    val verticesPath = graphName + vertices_suffix
    val verticesTmpPath = "__" + verticesPath
    val edgesPath = graphName + edges_suffix
    val edgesTmpPath = "__" + edgesPath

    if (overwrite) {
      FileUtil.fullyDelete(new File(verticesPath))
      FileUtil.fullyDelete(new File(edgesPath))
    }

    graph.vertices.saveAsTextFile(verticesTmpPath)
    Util.merge(verticesTmpPath, verticesPath)
    FileUtil.fullyDelete(new File(verticesTmpPath))

    graph.edges.saveAsTextFile(edgesTmpPath)
    Util.merge(edgesTmpPath, edgesPath)
    FileUtil.fullyDelete(new File(edgesTmpPath))
  }
} 
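For readers who want to try the save-as-text pattern without the project's VertexData/EdgeData model and its Util.merge helper, here is a minimal self-contained sketch of the same idea using only standard Spark and GraphX APIs; the toy graph and output directory names are assumptions made for illustration.

import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.{SparkConf, SparkContext}

object SaveGraphAsTextSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SaveGraphAsTextSketch").setMaster("local[*]"))

    // Tiny graph so the example runs on its own
    val edges = sc.parallelize(Seq(Edge(1L, 2L, 1), Edge(2L, 3L, 1)))
    val graph = Graph.fromEdges(edges, defaultValue = 0)

    // Same layout as SparkPersistence.saveAsText (one directory per RDD),
    // without the merge-into-a-single-file step performed by Util.merge
    graph.vertices.saveAsTextFile("myGraph_vertices")
    graph.edges.saveAsTextFile("myGraph_edges")

    sc.stop()
  }
}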
Example 30
Source File: LocalClustering$Test.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.operators.measures.vertex.clustering

import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration
import ml.sparkling.graph.operators.MeasureTest
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import ml.sparkling.graph.operators.OperatorsDSL._

class LocalClustering$Test(implicit sc:SparkContext)    extends MeasureTest  {


  "Local clustering for line graph" should "be correctly calculated" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/5_nodes_directed")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes local clustering")
    val localClustering=LocalClustering.compute(graph)
    Then("Should calculate local clustering correctly")
    val verticesSortedById=localClustering.vertices.collect().sortBy{case (vId,data)=>vId}
    verticesSortedById should equal (Array(
      (1,0.0), (2,0.0), (3,0.0), (4,0.0), (5,0.0)
    ))
    graph.unpersist(true)
  }

  "Local clustering for line graph" should "be correctly calculated using DSL" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/5_nodes_directed")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes local clustering")
    val localClustering=graph.localClustering()
    Then("Should calculate local clustering correctly")
    val verticesSortedById=localClustering.vertices.collect().sortBy{case (vId,data)=>vId}
    verticesSortedById should equal (Array(
      (1,0.0), (2,0.0), (3,0.0), (4,0.0), (5,0.0)
    ))
    graph.unpersist(true)
  }

  "Local clustering for full directed graph " should "be correctly calculated" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/4_nodes_full")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes local clustering")
    val localClustering=LocalClustering.compute(graph)
    Then("Should calculate local clustering correctly")
    val verticesSortedById=localClustering.vertices.collect().sortBy{case (vId,data)=>vId}
    verticesSortedById should equal (Array(
      (1,0.5), (2,0d), (3,0d), (4,0.5)
    ))
    graph.unpersist(true)
  }

  "Local clustering for full undirected graph " should "be correctly calculated" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/4_nodes_full")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes local clustering")
    val localClustering=LocalClustering.compute(graph,VertexMeasureConfiguration[Int,Int](true))
    Then("Should calculate local clustering correctly")
    val verticesSortedById=localClustering.vertices.collect().sortBy{case (vId,data)=>vId}
    verticesSortedById  should equal (Array(
      (1,1), (2,1), (3,1), (4,1)
    ))
    graph.unpersist(true)
  }


  "Local clustering for full directed graph " should "be correctly calculated using iterative approach" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/4_nodes_full")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes local clustering")
    val localClustering=LocalClustering.compute(graph)
    val localClusteringIterative=LocalClustering.compute(graph,VertexMeasureConfiguration[Int,Int]((g:Graph[Int,Int])=>1l))
    Then("Should calculate local clustering correctly")
    val verticesSortedById=localClustering.vertices.collect().sortBy{case (vId,data)=>vId}
    verticesSortedById should equal (localClusteringIterative.vertices.collect().sortBy{case (vId,data)=>vId})
    graph.unpersist(true)
  }

} 
Example 31
Source File: GraphSynth.scala    From csb   with GNU General Public License v3.0 5 votes vote down vote up
package edu.msstate.dasi.csb.data.synth

import edu.msstate.dasi.csb.data.distributions.DataDistributions
import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import edu.msstate.dasi.csb.sc
import edu.msstate.dasi.csb.util.Util
import org.apache.spark.graphx.Graph


  def synthesize(seed: Graph[VertexData, EdgeData], seedDists : DataDistributions, withProperties: Boolean): Graph[VertexData, EdgeData] = {

    var synth = null.asInstanceOf[Graph[VertexData, EdgeData]]

    Util.time( "Gen Graph", {
      synth = genGraph(seed, seedDists)
      println("Vertices #: " + synth.numVertices + ", Edges #: " + synth.numEdges)
    } )

    if (withProperties) {
      Util.time( "Gen Properties", {
        synth = genProperties(synth, seedDists)
        println("Vertices #: " + synth.numVertices + ", Edges #: " + synth.numEdges)
      } )
    }

    synth
  }
} 
Example 32
Source File: PageRank.scala    From MaxCompute-Spark   with Apache License 2.0 5 votes vote down vote up
package com.aliyun.odps.spark.examples.graphx

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object PageRank {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("PageRank")
      .getOrCreate()
    val sc = spark.sparkContext

    // build vertices
    val users: RDD[(VertexId, Array[String])] = sc.parallelize(List(
      "1,BarackObama,Barack Obama",
      "2,ladygaga,Goddess of Love",
      "3,jeresig,John Resig",
      "4,justinbieber,Justin Bieber",
      "6,matei_zaharia,Matei Zaharia",
      "7,odersky,Martin Odersky",
      "8,anonsys"
    ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail)))

    // build edges
    val followers: RDD[Edge[Double]] = sc.parallelize(Array(
      Edge(2L, 1L, 1.0),
      Edge(4L, 1L, 1.0),
      Edge(1L, 2L, 1.0),
      Edge(6L, 3L, 1.0),
      Edge(7L, 3L, 1.0),
      Edge(7L, 6L, 1.0),
      Edge(6L, 7L, 1.0),
      Edge(3L, 7L, 1.0)
    ))

    // build graph
    val followerGraph: Graph[Array[String], Double] = Graph(users, followers)

    // restrict the graph to users with usernames and names
    val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2)

    // compute PageRank
    val pageRankGraph = subgraph.pageRank(0.001)

    // get attributes of the top pagerank users
    val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) {
      case (uid, attrList, Some(pr)) => (pr, attrList.toList)
      case (uid, attrList, None) => (0.0, attrList.toList)
    }

    println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n"))
  }
} 
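The pageRank(0.001) call above iterates until the per-vertex change drops below the given tolerance. GraphX also provides a fixed-iteration variant; continuing from the example above, a brief sketch of swapping it in (20 iterations is an arbitrary choice for illustration):

    // Fixed number of iterations instead of a convergence tolerance
    val staticRanks = subgraph.staticPageRank(20)

    // Same result shape: a graph whose vertex attributes are the ranks
    staticRanks.vertices.take(5).foreach(println)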
Example 33
Source File: PageRank.scala    From MaxCompute-Spark   with Apache License 2.0 5 votes vote down vote up
package com.aliyun.odps.spark.examples.graphx

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object PageRank {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PageRank")
    val sc = new SparkContext(conf)

    // build vertices
    val users: RDD[(VertexId, Array[String])] = sc.parallelize(List(
      "1,BarackObama,Barack Obama",
      "2,ladygaga,Goddess of Love",
      "3,jeresig,John Resig",
      "4,justinbieber,Justin Bieber",
      "6,matei_zaharia,Matei Zaharia",
      "7,odersky,Martin Odersky",
      "8,anonsys"
    ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail)))

    // build edges
    val followers: RDD[Edge[Double]] = sc.parallelize(Array(
      Edge(2L, 1L, 1.0),
      Edge(4L, 1L, 1.0),
      Edge(1L, 2L, 1.0),
      Edge(6L, 3L, 1.0),
      Edge(7L, 3L, 1.0),
      Edge(7L, 6L, 1.0),
      Edge(6L, 7L, 1.0),
      Edge(3L, 7L, 1.0)
    ))

    // build graph
    val followerGraph: Graph[Array[String], Double] = Graph(users, followers)

    // restrict the graph to users with usernames and names
    val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2)

    // compute PageRank
    val pageRankGraph = subgraph.pageRank(0.001)

    // get attributes of the top pagerank users
    val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) {
      case (uid, attrList, Some(pr)) => (pr, attrList.toList)
      case (uid, attrList, None) => (0.0, attrList.toList)
    }

    println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n"))
  }
} 
Example 34
Source File: GodwinTest.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.timeseries.graph

import io.gzet.test.SparkFunSuite
import org.apache.log4j.{Logger, Level}
import org.apache.spark.graphx.{Graph, Edge}
import org.apache.spark.rdd.RDD

import scala.io.Source

class GodwinTest extends SparkFunSuite {

  Logger.getLogger("akka").setLevel(Level.OFF)
  Logger.getLogger("org").setLevel(Level.OFF)

  def buildEdges() = {
    Source.fromInputStream(getClass.getResourceAsStream("/edges.csv")).getLines().drop(1).map(s => {
      val Array(source, target, weight) = s.split(",")
      Edge(source.toLong, target.toLong, weight.toDouble)
    }).toList
  }

  localTest("Test Random Walks") { sc =>
    val edges: RDD[Edge[Double]] = sc.parallelize(buildEdges(), 1)
    val godwin = new Godwin(Seq(16))
    val walks = godwin.randomWalks(Graph.fromEdges(edges, 0L), 4).collect().sortBy(_._2)
    println(walks.map(_._1).mkString(" -> "))
    walks.last._1 should be(16)
  }

} 
Example 35
Source File: GzetCommunitiesTest.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.community

import io.gzet.community.clustering.wcc.WCCDetection
import io.gzet.test.SparkFunSuite
import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx.{Graph, Edge}

import scala.io.Source

class GzetCommunitiesTest extends SparkFunSuite {

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  localTest("WCC communities") { spark =>

    val lines = Source.fromInputStream(getClass.getResourceAsStream("/local-edges.csv")).getLines().zipWithIndex.filter(_._2 > 0).map(_._1).toSeq
    val sc = spark.sparkContext
    val edges = sc.parallelize(lines).map({ line =>
      val a = line.split(",").map(_.toLong).sorted
      Edge(a.head, a.last, 1L)
    }).distinct()

    val graph = Graph.fromEdges(edges, 0L)

    graph.triplets.take(2).foreach(println)
    val communities = new WCCDetection(1).run(graph, sc)
    communities.map(_._2 -> 1).reduceByKey(_+_).collectAsMap() should be(Map(5L -> 5, 15L -> 6, 21L -> 5))
  }
} 
Example 36
Source File: StoryBatchDedup.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.story

import io.gzet.story.model.{Content, Article}
import org.apache.spark.graphx.{Graph, Edge}
import org.apache.spark.{Logging, SparkConf, SparkContext}
import io.gzet.story.util.SimhashUtils._
import com.datastax.spark.connector._

object StoryBatchDedup extends SimpleConfig with Logging {

  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf().setAppName("Story Extractor")
    val sc = new SparkContext(sparkConf)

    val simhashRDD = sc.cassandraTable[Article]("gzet", "articles").zipWithIndex().map({ case (a, id) =>
      ((id, Content(a.url, a.title, a.body)), a.hash)
    })
    Set(0) // no-op expression; it has no effect and is kept only because it appears in the original source

    val duplicateTupleRDD = simhashRDD.flatMap({ case ((id, content), simhash) =>
      searchmasks.map({ mask =>
        (simhash ^ mask, id)
      })
    }).groupByKey()

    val edgeRDD = duplicateTupleRDD.values.flatMap({ it =>
      val list = it.toList
      for (x <- list; y <- list) yield (x, y)
    }).filter({ case (x, y) =>
      x != y
    }).distinct().map({case (x, y) =>
      Edge(x, y, 0)
    })

    val duplicateRDD = Graph.fromEdges(edgeRDD, 0L)
      .connectedComponents()
      .vertices
      .join(simhashRDD.keys)
      .values

    duplicateRDD.sortBy(_._1).collect().foreach({ case (story, content) =>
      println(story + "\t" + content.title)
    })

  }

} 
Example 37
Source File: PlaylistBuilder.scala    From Mastering-Spark-for-Data-Science   with MIT License 5 votes vote down vote up
package io.gzet.recommender

import com.datastax.spark.connector._
import com.typesafe.config.Config
import io.gzet.recommender.Config._
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import spark.jobserver._

object PlaylistBuilder extends SparkJob with NamedRddSupport {

  override def runJob(sc: SparkContext, conf: Config): Any = {

    val recordRDD = sc.cassandraTable[Record](KEYSPACE, TABLE_RECORD)
    val hashRDD = sc.cassandraTable[Hash](KEYSPACE, TABLE_HASH)

    val minSimilarityB = sc.broadcast(MIN_SIMILARITY)
    val songIdsB = sc.broadcast(recordRDD.map(r => (r.id, r.name)).collectAsMap())

    implicit class Crossable[X](xs: Traversable[X]) {
      def cross[Y](ys: Traversable[Y]) = for { x <- xs; y <- ys } yield (x, y)
    }

    val songHashRDD = hashRDD flatMap { hash =>
      hash.songs map { song =>
        ((hash, song), 1)
      }
    }

    val songTfRDD = songHashRDD map { case ((hash, songId), count) =>
      (songId, count)
    } reduceByKey(_+_)

    val songTfB = sc.broadcast(songTfRDD.collectAsMap())

    val crossSongRDD = songHashRDD.keys.groupByKey().values flatMap { songIds =>
      songIds cross songIds filter { case (from, to) =>
        from != to
      } map(_ -> 1)
    } reduceByKey(_+_) map { case ((from, to), count) =>
      val weight = count.toDouble / songTfB.value.getOrElse(from, 1)
      org.apache.spark.graphx.Edge(from, to, weight)
    } filter { edge =>
      edge.attr > minSimilarityB.value
    }

    val graph = Graph.fromEdges(crossSongRDD, 0L)
    val prGraph = graph.pageRank(TOLERANCE, TELEPORT)

    val edges = prGraph.edges.map({ edge =>
      (edge.srcId, (edge.dstId, edge.attr))
    }).groupByKey().map({case (srcId, it) =>
      val dst = it.toList
      val dstIds = dst.map(_._1.toString)
      val weights = dst.map(_._2.toString)
      Edge(srcId, dstIds, weights)
    })

    val vertices = prGraph.vertices.mapPartitions({ vertices =>
      val songIds = songIdsB.value
      vertices map { case (vId, pr) =>
        Node(vId, songIds.getOrElse(vId, "UNKNOWN"), pr)
      }
    })

    edges.saveAsCassandraTable(KEYSPACE, TABLE_EDGE)
    vertices.saveAsCassandraTable(KEYSPACE, TABLE_NODE)

    this.namedRdds.update(RDD_EDGE, edges)
    this.namedRdds.update(RDD_NODE, vertices)

  }

  override def validate(sc: SparkContext, config: Config): SparkJobValidation = {
    SparkJobValid
  }



} 
Example 38
package io.gzet.recommender

import com.typesafe.config.Config
import io.gzet.recommender.Config._
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import spark.jobserver._

object PersonalizedPlaylistBuilder extends SparkJob with NamedRddSupport {

  override def runJob(sc: SparkContext, conf: Config): Any = {

    val id = conf.getLong("song.id")

    val edges = this.namedRdds.get[Edge](RDD_EDGE).get
    val nodes = this.namedRdds.get[Node](RDD_NODE).get

    val edgeRDD = edges.flatMap({e =>
      e.targets.zip(e.weights).map({case (target, weight) =>
        org.apache.spark.graphx.Edge(e.source, target.toLong, weight.toDouble)
      })
    })

    val songIdsB = sc.broadcast(nodes.map(n => (n.id, n.name)).collectAsMap())

    val graph = Graph.fromEdges(edgeRDD, 0L)
    graph.cache()
    val prGraph = graph.personalizedPageRank(id, TOLERANCE, TELEPORT)

    prGraph.vertices.mapPartitions({ it =>
      val songIds = songIdsB.value
      it map { case (vId, pr) =>
        (vId, songIds.getOrElse(vId, "UNKNOWN"), pr)
      }
    }).sortBy(_._3, ascending = false).map(v => List(v._1, v._3, v._2).mkString(",")).collect()

  }

  override def validate(sc: SparkContext, config: Config): SparkJobValidation = {
    if(!config.hasPath("song.id")) return SparkJobInvalid("Missing parameter [song.id]")
    if(this.namedRdds.get[Edge](RDD_EDGE).isEmpty) return SparkJobInvalid("Missing RDD [edges]")
    if(this.namedRdds.get[Node](RDD_NODE).isEmpty) return SparkJobInvalid("Missing RDD [nodes]")
    SparkJobValid
  }

} 
Example 39
Source File: EmployeeRelationship.scala    From spark-dev   with GNU General Public License v3.0 5 votes vote down vote up
package examples.graphx

import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.rdd.RDD
import org.apache.spark.graphx.{ Edge, Graph }


object EmployeeRelationship {
	def main(args: Array[String]): Unit = {
		// vertex format: vertex_id, data
		val vertexArray = Array(
			(1L, ("John", "Software Developer")),
			(2L, ("Robert", "Technical Leader")),
			(3L, ("Charlie", "Software Architect")),
			(4L, ("David", "Software Developer")),
			(5L, ("Edward", "Software Development Manager")),
			(6L, ("Francesca", "Software Development Manager")))

		// edge format: from_vertex_id, to_vertex_id, data
		val edgeArray = Array(
			Edge(2L, 1L, "Technical Mentor"),
			Edge(2L, 4L, "Technical Mentor"),
			Edge(3L, 2L, "Collaborator"),
			Edge(6L, 3L, "Team Member"),
			Edge(4L, 1L, "Peers"),
			Edge(5L, 2L, "Team Member"),
			Edge(5L, 3L, "Team Member"),
			Edge(5L, 6L, "Peers"))

		val sc = new SparkContext(new SparkConf().setAppName("EmployeeRelationshipJob"))

		val vertexRDD: RDD[(Long, (String, String))] = sc.parallelize(vertexArray)

		val edgeRDD: RDD[Edge[String]] = sc.parallelize(edgeArray)

		val graph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD)

		// Vanilla query
		println(">>> Showing the names of people who are Software Developers")
		graph.vertices.filter { case (id, (name, designation)) => designation.equals("Software Developer") }
			.collect()
			.foreach { case (id, (name, designation)) => println(s"... Name: $name, Designation: $designation") }

		// Connection analysis
		println(">>> People connected to Robert (Technical Leader) -> ")
		graph.triplets.filter(_.srcId == 2).collect()
			.foreach { item => println("... " + item.dstAttr._1 + ", " + item.dstAttr._2) }

		println(">>> Robert (Technical Leader) connected to -> ")
		graph.triplets.filter(_.dstId == 2).collect()
			.foreach { item => println("... " + item.srcAttr._1 + ", " + item.srcAttr._2) }

		println(">>> Technical Mentoring Analysis -> ")
		graph.triplets.filter(_.attr.equals("Technical Mentor")).collect()
			.foreach { item => println("... " + item.srcAttr._1 + " mentoring " + item.dstAttr._1) }
	}
} 
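The triplet queries above answer who mentors whom; a related aggregate, how many people each person mentors, can be sketched with aggregateMessages on the same graph. This counting step is an illustrative addition, not part of the original example.

		// Count outgoing "Technical Mentor" edges per vertex and join the names back in
		val menteeCounts = graph.aggregateMessages[Int](
			ctx => if (ctx.attr == "Technical Mentor") ctx.sendToSrc(1),
			_ + _)

		menteeCounts.join(vertexRDD).collect()
			.foreach { case (_, (count, (name, _))) => println(s"... $name mentors $count people") }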
Example 40
Source File: SSSPExample.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.graphx

// $example on$
import org.apache.spark.graphx.{Graph, VertexId}
import org.apache.spark.graphx.util.GraphGenerators
// $example off$
import org.apache.spark.sql.SparkSession


object SSSPExample {
  def main(args: Array[String]): Unit = {
    // Creates a SparkSession.
    val spark = SparkSession
      .builder
      .appName(s"${this.getClass.getSimpleName}")
      .getOrCreate()
    val sc = spark.sparkContext

    // $example on$
    // A graph with edge attributes containing distances
    val graph: Graph[Long, Double] =
      GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble)
    val sourceId: VertexId = 42 // The ultimate source
    // Initialize the graph such that all vertices except the root have distance infinity.
    val initialGraph = graph.mapVertices((id, _) =>
        if (id == sourceId) 0.0 else Double.PositiveInfinity)
    val sssp = initialGraph.pregel(Double.PositiveInfinity)(
      (id, dist, newDist) => math.min(dist, newDist), // Vertex Program
      triplet => {  // Send Message
        if (triplet.srcAttr + triplet.attr < triplet.dstAttr) {
          Iterator((triplet.dstId, triplet.srcAttr + triplet.attr))
        } else {
          Iterator.empty
        }
      },
      (a, b) => math.min(a, b) // Merge Message
    )
    println(sssp.vertices.collect.mkString("\n"))
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
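The Pregel loop above records only the shortest distance. A common extension keeps the path alongside the distance so the route itself can be read off afterwards; below is a minimal sketch of that variant, continuing from the same graph and sourceId (the (distance, path) vertex attribute is an illustrative choice, not part of the original example).

    // Vertex attribute becomes a (distance from source, path from source) pair
    val initialPathGraph = graph.mapVertices((id, _) =>
      if (id == sourceId) (0.0, List[VertexId](sourceId))
      else (Double.PositiveInfinity, List[VertexId]()))

    val ssspWithPaths = initialPathGraph.pregel((Double.PositiveInfinity, List[VertexId]()))(
      (id, attr, msg) => if (msg._1 < attr._1) msg else attr, // keep the shorter distance
      triplet => { // relax edges exactly as above, but carry the extended path
        if (triplet.srcAttr._1 + triplet.attr < triplet.dstAttr._1) {
          Iterator((triplet.dstId, (triplet.srcAttr._1 + triplet.attr, triplet.srcAttr._2 :+ triplet.dstId)))
        } else {
          Iterator.empty
        }
      },
      (a, b) => if (a._1 < b._1) a else b // merge: keep the shorter candidate
    )
    println(ssspWithPaths.vertices.collect.mkString("\n"))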
Example 41
Source File: FindInfluencer.scala    From spark-graphx-twitter   with Apache License 2.0 5 votes vote down vote up
package com.knoldus.spark.graphx.example

import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object FindInfluencer {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Twittter Influencer").setMaster("local[*]")
    val sparkContext = new SparkContext(conf)
    sparkContext.setLogLevel("ERROR")

    val twitterData = sparkContext.textFile("src/main/resources/twitter-graph-data.txt")

    val followeeVertices: RDD[(VertexId, String)] = twitterData.map(_.split(",")).map { arr =>
      val user = arr(0).replace("((", "")
      val id = arr(1).replace(")", "")
      (id.toLong, user)
    }

    val followerVertices: RDD[(VertexId, String)] = twitterData.map(_.split(",")).map { arr =>
      val user = arr(2).replace("(", "")
      val id = arr(3).replace("))", "")
      (id.toLong, user)
    }

    val vertices = followeeVertices.union(followerVertices)
    val edges: RDD[Edge[String]] = twitterData.map(_.split(",")).map { arr =>
      val followeeId = arr(1).replace(")", "").toLong
      val followerId = arr(3).replace("))", "").toLong
      Edge(followeeId, followerId, "follow")
    }

    val defaultUser = ("")
    val graph = Graph(vertices, edges, defaultUser)

    val subGraph = graph.pregel("", 2, EdgeDirection.In)((_, attr, msg) =>
      attr + "," + msg,
      triplet => Iterator((triplet.srcId, triplet.dstAttr)),
      (a, b) => (a + "," + b))

    val lengthRDD = subGraph.vertices.map(vertex => (vertex._1, vertex._2.split(",").distinct.length - 2)).max()(new Ordering[Tuple2[VertexId, Int]]() {
      override def compare(x: (VertexId, Int), y: (VertexId, Int)): Int =
        Ordering[Int].compare(x._2, y._2)
    })

    val userId = graph.vertices.filter(_._1 == lengthRDD._1).map(_._2).collect().head
    println(userId + " has maximum influence on network with " + lengthRDD._2 + " influencers.")

    sparkContext.stop()
  }
} 
Example 42
Source File: ZombieExample.scala    From spark_training   with Apache License 2.0 5 votes vote down vote up
package com.malaska.spark.training.graph

import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, _}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession


object ZombieExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val vertexJsonFile = args(0)
    val edgeJsonFile = args(1)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val vectorDs = sparkSession.read.json(vertexJsonFile).as[JsonVertex]
    val edgeDs = sparkSession.read.json(edgeJsonFile).as[JsonEdge]

    val vectorRdd:RDD[(VertexId, ZombieStats)] = vectorDs.rdd.map(r => {
      (r.vertex_id.toLong, new ZombieStats(r.is_zombie.equals("yes"), r.time_alive))
    })

    val edgeRdd = edgeDs.rdd.map(r => {
      new Edge[String](r.src, r.dst, r.edge_type)
    })

    val defaultUser = new ZombieStats(false, 0)

    val graph = Graph(vectorRdd, edgeRdd, defaultUser)

    val zombieResults = graph.pregel[Long](0, 30, EdgeDirection.Either)(
      (vertexId, zombieState, message) => {
        if (message > 0 && !zombieState.isZombie) {
          new ZombieStats(true, message)
        } else {
          zombieState
        }
      }, triplet => {
        if (triplet.srcAttr.isZombie && !triplet.dstAttr.isZombie) {
          Iterator((triplet.dstId, triplet.srcAttr.lengthOfLife + 1l))
        } else if (triplet.dstAttr.isZombie && !triplet.srcAttr.isZombie) {
          Iterator((triplet.srcId, triplet.dstAttr.lengthOfLife + 1l))
        } else {
          Iterator.empty
        }
      }, (a, b) => Math.min(a, b))

    println("ZombieBite")
    zombieResults.vertices.collect().sortBy(r => r._1).foreach(r => {
      println("vertexId:" + r._1 + ",ZobmieStat:" + r._2)
    })

    sparkSession.stop()
  }
}

case class ZombieStats (isZombie:Boolean, lengthOfLife:Long) 
Example 43
Source File: PeriodicGraphCheckpointer.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.impl

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.storage.StorageLevel



private[mllib] class PeriodicGraphCheckpointer[VD, ED](
    checkpointInterval: Int,
    sc: SparkContext)
  extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) {

  override protected def checkpoint(data: Graph[VD, ED]): Unit = data.checkpoint()

  override protected def isCheckpointed(data: Graph[VD, ED]): Boolean = data.isCheckpointed

  override protected def persist(data: Graph[VD, ED]): Unit = {
    if (data.vertices.getStorageLevel == StorageLevel.NONE) {
      data.vertices.persist()
    }
    if (data.edges.getStorageLevel == StorageLevel.NONE) {
      data.edges.persist()
    }
  }

  override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false)

  override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = {
    data.getCheckpointFiles
  }
} 
Example 44
Source File: SSSPExample.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.graphx

// $example on$
import org.apache.spark.graphx.{Graph, VertexId}
import org.apache.spark.graphx.util.GraphGenerators
// $example off$
import org.apache.spark.sql.SparkSession


object SSSPExample {
  def main(args: Array[String]): Unit = {
    // Creates a SparkSession.
    val spark = SparkSession
      .builder
      .appName(s"${this.getClass.getSimpleName}")
      .getOrCreate()
    val sc = spark.sparkContext

    // $example on$
    // A graph with edge attributes containing distances
    val graph: Graph[Long, Double] =
      GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble)
    val sourceId: VertexId = 42 // The ultimate source
    // Initialize the graph such that all vertices except the root have distance infinity.
    val initialGraph = graph.mapVertices((id, _) =>
        if (id == sourceId) 0.0 else Double.PositiveInfinity)
    val sssp = initialGraph.pregel(Double.PositiveInfinity)(
      (id, dist, newDist) => math.min(dist, newDist), // Vertex Program
      triplet => {  // Send Message
        if (triplet.srcAttr + triplet.attr < triplet.dstAttr) {
          Iterator((triplet.dstId, triplet.srcAttr + triplet.attr))
        } else {
          Iterator.empty
        }
      },
      (a, b) => math.min(a, b) // Merge Message
    )
    println(sssp.vertices.collect.mkString("\n"))
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 45
Source File: L10-9Graph.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.Graph.graphToGraphOps
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.json4s.DefaultFormats
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

object UserRankApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: UserRankApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    ssc.socketTextStream(hostname, port.toInt)
      .map(r => {
        implicit val formats = DefaultFormats
        parse(r)
      })
      .foreachRDD(rdd => {
        val edges = rdd.map(jvalue => {
          implicit val formats = DefaultFormats
          ((jvalue \ "user_id").extract[String], (jvalue \ "friends").extract[Array[String]])
        })
          .flatMap(r => r._2.map(f => Edge(r._1.hashCode.toLong, f.hashCode.toLong, 1.0)))

        val vertices = rdd.map(jvalue => {
          implicit val formats = DefaultFormats
          ((jvalue \ "user_id").extract[String])
        })
          .map(r => (r.hashCode.toLong, r))

        val tolerance = 0.0001
        val graph = Graph(vertices, edges, "defaultUser")
          .subgraph(vpred = (id, idStr) => idStr != "defaultUser")
        val pr = graph.pageRank(tolerance).cache

        graph.outerJoinVertices(pr.vertices) {
          (userId, attrs, rank) => (rank.getOrElse(0.0).asInstanceOf[Number].doubleValue, attrs)
        }.vertices.top(10) {
          Ordering.by(_._2._1)
        }.foreach(rec => println("User id: %s, Rank: %f".format(rec._2._2, rec._2._1)))
      })

    ssc.start()
    ssc.awaitTermination()

  }

} 
Example 46
Source File: EdgeAPI.scala    From Hands-On-Big-Data-Analytics-with-PySpark   with MIT License 5 votes vote down vote up
package com.tomekl007.chapter_7

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite

class EdgeAPI extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("Should use Edge API") {
    //given
    val users: RDD[(VertexId, (String))] =
      spark.parallelize(Array(
        (1L, "a"),
        (2L, "b"),
        (3L, "c"),
        (4L, "d")
      ))


    val relationships =
      spark.parallelize(Array(
        Edge(1L, 2L, "friend"),
        Edge(1L, 3L, "friend"),
        Edge(2L, 4L, "wife")
      ))

    val graph = Graph(users, relationships)

    //when
    val res = graph.mapEdges(e => e.attr.toUpperCase)

    println(res.edges.collect().toList)
  }

} 
Example 47
Source File: VertexAPI.scala    From Hands-On-Big-Data-Analytics-with-PySpark   with MIT License 5 votes vote down vote up
package com.tomekl007.chapter_7

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite

class VertexAPI extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("Should use Vertex API") {
    //given
    val users: RDD[(VertexId, (String))] =
      spark.parallelize(Array(
        (1L, "a"),
        (2L, "b"),
        (3L, "c"),
        (4L, "d")
      ))


    val relationships =
      spark.parallelize(Array(
        Edge(1L, 2L, "friend"),
        Edge(1L, 3L, "friend"),
        Edge(2L, 4L, "wife")
      ))

    val graph = Graph(users, relationships)

    //when
    val res = graph.mapVertices((_, att) => att.toUpperCase())
    res.vertices.collect().toList
  }

} 
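As a follow-up to the two tests above, here is a short sketch (not part of the original file) that joins each vertex of the same tiny graph with its total degree; it assumes the same users/relationships setup.

    //when: attach each vertex's total degree (isolated vertices get 0)
    val withDegrees = graph.outerJoinVertices(graph.degrees) {
      (_, name, degOpt) => (name, degOpt.getOrElse(0))
    }

    //then: e.g. (1,(a,2)), (2,(b,2)), (3,(c,1)), (4,(d,1))
    println(withDegrees.vertices.collect().toList)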
Example 48
Source File: PeriodicGraphCheckpointer.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.impl

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.storage.StorageLevel



private[mllib] class PeriodicGraphCheckpointer[VD, ED](
    checkpointInterval: Int,
    sc: SparkContext)
  extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) {

  override protected def checkpoint(data: Graph[VD, ED]): Unit = data.checkpoint()

  override protected def isCheckpointed(data: Graph[VD, ED]): Boolean = data.isCheckpointed

  override protected def persist(data: Graph[VD, ED]): Unit = {
    if (data.vertices.getStorageLevel == StorageLevel.NONE) {
      data.vertices.persist()
    }
    if (data.edges.getStorageLevel == StorageLevel.NONE) {
      data.edges.persist()
    }
  }

  override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false)

  override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = {
    data.getCheckpointFiles
  }
} 
Example 49
Source File: LocalRunner.scala    From spark-betweenness   with Apache License 2.0 5 votes vote down vote up
package com.centrality.kBC

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.VertexId
import org.apache.spark.rdd.RDD

object MainRunner 
{
  def main(args: Array[String])
  {
    // Create spark context
    val appName="kBC"
    val sparkMode="local"
    val conf = new SparkConf().setAppName(appName).setMaster(sparkMode);
    val sc = new SparkContext(conf);
    
    // Create sample graph
    //
    // Create an RDD for vertices
    val users: RDD[(VertexId, (String, String))] =
    sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
                         (5L, ("franklin", "prof")), (2L, ("istoica", "prof"))))
    // Create an RDD for edges
    val relationships: RDD[Edge[String]] =
      sc.parallelize(Array(Edge(3L, 7L, "collab"),    Edge(5L, 3L, "advisor"),
                           Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi")))
    // Define a default user in case there are relationship with missing user
    val defaultUser = ("John Doe", "Missing")
    // Build the initial Graph
    val graph = Graph(users, relationships, defaultUser)
    
    val kBCGraph = 
      KBetweenness.run(graph, 3)
  }
} 
Example 50
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
//import org.graphframes._


object GraphFramesExample extends App {

    val conf = new SparkConf()
      .setAppName("RDD graph")
      .setMaster("local[4]")
    val sc = new SparkContext(conf)


    val vertices: RDD[(VertexId, String)] = sc.parallelize(
      Array((1L, "Anne"),
        (2L, "Bernie"),
        (3L, "Chris"),
        (4L, "Don"),
        (5L, "Edgar")))

    val edges: RDD[Edge[String]] = sc.parallelize(
      Array(Edge(1L, 2L, "likes"),
        Edge(2L, 3L, "trusts"),
        Edge(3L, 4L, "believes"),
        Edge(4L, 5L, "worships"),
        Edge(1L, 3L, "loves"),
        Edge(4L, 1L, "dislikes")))

    val friendGraph: Graph[String, String] = Graph(vertices, edges)

//    val friendGraphFrame = GraphFrame.fromGraphX(friendGraph)
//
//    friendGraphFrame.find("(v1)-[e1]->(v2); (v2)-[e2]->(v3)").filter(
//      "e1.attr = 'trusts' OR v3.attr = 'Chris'"
//    ).collect.foreach(println)

} 
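Since the GraphFrame motif query is commented out, a rough GraphX-only approximation of the same two-hop pattern, (v1)-[e1]->(v2); (v2)-[e2]->(v3) filtered on e1 = 'trusts' or v3 = 'Chris', can be sketched by joining the triplets on the middle vertex. This rewrite is an illustration, not the original motif API, and continues from friendGraph above.

    // Key the first hop by its destination and the second hop by its source, then join on the middle vertex
    val twoHops = friendGraph.triplets.map(t => (t.dstId, (t.srcAttr, t.attr)))
      .join(friendGraph.triplets.map(t => (t.srcId, (t.attr, t.dstAttr))))
      .filter { case (_, ((_, e1), (_, v3))) => e1 == "trusts" || v3 == "Chris" }

    twoHops.collect().foreach { case (middleId, ((v1, e1), (e2, v3))) =>
      println(s"$v1 -[$e1]-> (vertex $middleId) -[$e2]-> $v3")
    }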
Example 51
Source File: Gephi.scala    From Mastering-Machine-Learning-with-Spark-2.x   with MIT License 5 votes vote down vote up
package com.github.maxpumperla.ml_spark.utils

import org.apache.spark.graphx.Graph


object Gephi {

  def toGexf[VD, ED](g: Graph[VD, ED]): String = {
    val header =
      """<?xml version="1.0" encoding="UTF-8"?>
        |<gexf xmlns="http://www.gexf.net/1.2draft" version="1.2">
        |  <meta>
        |    <description>A gephi graph in GEXF format</description>
        |  </meta>
        |    <graph mode="static" defaultedgetype="directed">
      """.stripMargin

    // Use single-quoted interpolated strings so the \" and \n escapes are actually
    // processed (inside triple-quoted strings they would be emitted literally)
    val vertices = "<nodes>\n" + g.vertices.map(
      v => s"<node id=\"${v._1}\" label=\"${v._2}\"/>\n"
    ).collect.mkString + "</nodes>\n"

    val edges = "<edges>\n" + g.edges.map(
      e => s"<edge source=\"${e.srcId}\" target=\"${e.dstId}\" label=\"${e.attr}\"/>\n"
    ).collect.mkString + "</edges>\n"

    val footer = "</graph>\n</gexf>"

    header + vertices + edges + footer
  }
} 
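A quick usage sketch for toGexf, writing the generated markup to a local file that Gephi can open; the toy graph and the graph.gexf output path are assumptions made for illustration.

import java.io.PrintWriter

import com.github.maxpumperla.ml_spark.utils.Gephi
import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.{SparkConf, SparkContext}

object GephiExportSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("GephiExportSketch").setMaster("local[*]"))

    val edges = sc.parallelize(Seq(Edge(1L, 2L, "likes"), Edge(2L, 3L, "trusts")))
    val graph = Graph.fromEdges(edges, defaultValue = "person")

    // Render the graph to GEXF and write it to disk
    val writer = new PrintWriter("graph.gexf")
    try writer.write(Gephi.toGexf(graph)) finally writer.close()

    sc.stop()
  }
}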
Example 52
Source File: CCGraphXDriver.scala    From connected-component   with MIT License 5 votes vote down vote up
package com.kwartile.lib.cc

import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.{SparkConf, SparkContext}

import scala.annotation.tailrec



object CCGraphXDriver {

  @tailrec
  private def buildEdges(node: Long, neighbors:List[Long], partialPairs: List[Edge[Int]]) : List[Edge[Int]] = {
    if (neighbors.length == 0) {
      if (partialPairs != null)
        List(Edge(node, node, 1)) ::: partialPairs
      else
        List(Edge(node, node, 1))
    } else if (neighbors.length == 1) {
      val neighbor = neighbors(0)
      if (node > neighbor)
        if (partialPairs != null) List(Edge(node, neighbor, 1)) ::: partialPairs else List(Edge(node, neighbor, 1))
      else
      if (partialPairs != null) List(Edge(neighbor, node, 1)) ::: partialPairs else List(Edge(neighbor, node, 1))
    } else {
      val newPartialPairs = neighbors.map(neighbor => {
        if (node > neighbor)
          List(Edge(node, neighbor, 1))
        else
          List(Edge(neighbor, node, 1))
      }).flatMap(x=>x)

      if (partialPairs != null)
        buildEdges(neighbors.head, neighbors.tail, newPartialPairs ::: partialPairs)
      else
        buildEdges(neighbors.head, neighbors.tail, newPartialPairs)
    }
  }

  private def buildEdges(nodes:List[Long]) :  List[Edge[Int]] = {
    buildEdges(nodes.head, nodes.tail, null.asInstanceOf[List[Edge[Int]]])
  }

  def main(args: Array[String]) = {
    val sparkConf = new SparkConf().setAppName("GraphXConnectedComponent")

    val sc = new SparkContext(sparkConf)

    val cliqueFile = args(0)
    val cliquesRec = sc.textFile(cliqueFile)
    val cliques = cliquesRec.map(x => {
      val nodes = x.split("\\s+").map(y => y.toLong).toList
      nodes
    })

    val edges = cliques.map(aClique => {
      buildEdges(aClique)
    }).flatMap(x=>x)

    val graph = Graph.fromEdges(edges, 1)
    val cc = graph.connectedComponents().vertices
    println ("Count of Connected component: " + cc.count)
  }
} 
Example 53
Source File: PipeClusteringStrongestPath.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.clustering

import scala.Iterator

import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.VertexRDD

import de.unihamburg.vsis.sddf.reading.Tuple


class PipeClusteringStrongestPath extends PipeClusteringTransitiveClosure {
  
  override def manipulateGraph(graph: Graph[Tuple, Double]): Graph[_, Double] = {

    val cGraph = graph.mapVertices((vid, tuple) => (vid, Double.MinPositiveValue))

    // attach the max adjacent edge attribute to each vertice
    val verticesMaxEdgeAttributes: VertexRDD[Double] = cGraph.mapReduceTriplets(
      edge => {
        Iterator((edge.dstId, edge.attr), (edge.srcId, edge.attr))
      },
      (a: Double, b: Double) => math.max(a, b)
    )

    // join the resulting vertice attributes with the graph
    val maxGraph: Graph[(Tuple, Double), Double] =
      graph.outerJoinVertices(verticesMaxEdgeAttributes)((id, tuple, simOpt) =>
        simOpt match {
          case Some(sim) => (tuple, sim)
          case None      => (tuple, 0D)
        }
      )
      
    // remove edges whose weight is less than both the src and dst maxima
    val resultGraph = maxGraph.subgraph(edge => {
      if (edge.attr < edge.srcAttr._2 && edge.attr < edge.dstAttr._2) {
        false
      } else {
        true
      }
    })
    resultGraph
  }

}

object PipeClusteringStrongestPath {
  
  def apply() = new PipeClusteringStrongestPath()

} 
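mapReduceTriplets, used above to attach the largest adjacent edge weight to every vertex, has been deprecated since Spark 1.2 in favour of aggregateMessages. Below is a sketch of the same computation against the newer API, intended as a drop-in for the verticesMaxEdgeAttributes step (an alternative formulation, not the author's code).

    // Equivalent of the mapReduceTriplets call above, written with aggregateMessages
    val verticesMaxEdgeAttributes: VertexRDD[Double] = cGraph.aggregateMessages[Double](
      ctx => {
        ctx.sendToDst(ctx.attr)
        ctx.sendToSrc(ctx.attr)
      },
      (a: Double, b: Double) => math.max(a, b)
    )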
Example 54
Source File: AbstractPipeClusteringGraph.scala    From sddf   with GNU General Public License v3.0 5 votes vote down vote up
package de.unihamburg.vsis.sddf.clustering

import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.VertexId
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.similarity.aggregator.Mean
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable


abstract class AbstractPipeClusteringGraph
  extends PipeElement[RDD[(SymPair[Tuple], Array[Double])], RDD[Set[Tuple]]]
  with Serializable {
  
  def cluster(graph: Graph[Tuple, Double]): RDD[Set[Tuple]]

  def step(input: RDD[(SymPair[Tuple], Array[Double])])(implicit pipeContext: AbstractPipeContext): RDD[Set[Tuple]] = {
    
    val duplicatePairsWithSimilarity = input.map(
      pair => (pair._1, Mean.agrSimilarity(pair._2))
    )
    
    val edges: RDD[Edge[Double]] = duplicatePairsWithSimilarity.map(
      pair => { Edge(pair._1._1.id, pair._1._2.id, pair._2) }
    )

    // TODO optimize: it would be nice to build the graph only by using edge triplets
    // but as far as I know that's not possible
    val verticesNotUnique: RDD[(VertexId, Tuple)] = duplicatePairsWithSimilarity.map(_._1).flatMap(
      tuplePair => Seq(tuplePair._1, tuplePair._2)
    ).map(tuple => (tuple.id, tuple))

    // delete all duplicate vertices
    val vertices = verticesNotUnique.distinct()

    // The default vertex attribute is null because every vertex is supplied explicitly above
    val graph: Graph[Tuple, Double] = Graph.apply(vertices, edges, null)
    
    cluster(graph)
  }

} 
Example 55
Source File: AffinityPropagationSuite.scala    From SparkAffinityPropagation   with MIT License 5 votes vote down vote up
package org.viirya.spark.ml

import scala.collection.mutable

import org.scalatest.{BeforeAndAfterAll, FunSuite, Suite}

import org.viirya.spark.ml.AffinityPropagation._

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}

class AffinityPropagationSuite extends FunSuite with BeforeAndAfterAll { self: Suite =>
  @transient var sc: SparkContext = _

  override def beforeAll() {
    super.beforeAll()
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("AffinityPropagationUnitTest")
    sc = new SparkContext(conf)
  }

  override def afterAll() {
    try {
      if (sc != null) {
        sc.stop()
      }
      sc = null
    } finally {
      super.afterAll()
    }
  }  

  test("affinity propagation") {
    
    val similarities = Seq[(Long, Long, Double)](
      (0, 1, 1.0), (1, 0, 1.0), (0, 2, 1.0), (2, 0, 1.0), (0, 3, 1.0), (3, 0, 1.0),
      (1, 2, 1.0), (2, 1, 1.0), (2, 3, 1.0), (3, 2, 1.0))
    val expected = Array(
      Array(0.0,     1.0/3.0, 1.0/3.0, 1.0/3.0),
      Array(1.0/2.0,     0.0, 1.0/2.0,     0.0),
      Array(1.0/3.0, 1.0/3.0,     0.0, 1.0/3.0),
      Array(1.0/2.0,     0.0, 1.0/2.0,     0.0))
    val s = constructGraph(sc.parallelize(similarities, 2), true, false)
    s.edges.collect().foreach { case Edge(i, j, x) =>
      assert(math.abs(x.similarity - expected(i.toInt)(j.toInt)) < 1e-14)
    }
  }
} 
Example 56
Source File: PeriodicGraphCheckpointer.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.mllib.impl

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.storage.StorageLevel



private[mllib] class PeriodicGraphCheckpointer[VD, ED](
    checkpointInterval: Int,
    sc: SparkContext)
  extends PeriodicCheckpointer[Graph[VD, ED]](checkpointInterval, sc) {

  override protected def checkpoint(data: Graph[VD, ED]): Unit = data.checkpoint()

  override protected def isCheckpointed(data: Graph[VD, ED]): Boolean = data.isCheckpointed

  override protected def persist(data: Graph[VD, ED]): Unit = {
    if (data.vertices.getStorageLevel == StorageLevel.NONE) {
      data.vertices.persist()
    }
    if (data.edges.getStorageLevel == StorageLevel.NONE) {
      data.edges.persist()
    }
  }

  override protected def unpersist(data: Graph[VD, ED]): Unit = data.unpersist(blocking = false)

  override protected def getCheckpointFiles(data: Graph[VD, ED]): Iterable[String] = {
    data.getCheckpointFiles
  }
} 
Example 57
Source File: BasicLinkPredictor.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.operators.algorithms.link

import ml.sparkling.graph.api.operators.algorithms.link.MeasureBasedLnkPredictor
import ml.sparkling.graph.api.operators.measures.EdgeMeasure
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag


object BasicLinkPredictor extends MeasureBasedLnkPredictor {

  override def predictLinks[V: ClassTag, E: ClassTag, EV: ClassTag, EO: ClassTag](graph: Graph[V, E],
                                                                                  edgeMeasure: EdgeMeasure[EO, EV],
                                                                                  threshold: EO,
                                                                                  treatAsUndirected:Boolean=false)(implicit num: Numeric[EO]) = {
    val preprocessedGraph=edgeMeasure.preprocess(graph,treatAsUndirected)
    val allPossibleEdges = preprocessedGraph.vertices.cartesian(preprocessedGraph.vertices).filter{
      case ((vId1,data1),(vId2,data2))=>vId1!=vId2
    }
    val edgesAboveThreshold=allPossibleEdges.map{
      case ((vId1,data1),(vId2,data2))=>(edgeMeasure.computeValue(data1,data2,treatAsUndirected),(vId1,vId2))
    }.filter(t=>num.gt(t._1,threshold)).map(t=>(t._2,0))
    val exsistingEdgesTuples=graph.edges.map(e=>((e.srcId,e.dstId),0))
    val newEdges=edgesAboveThreshold.leftOuterJoin(exsistingEdgesTuples).filter{
      case (k,(_,option))=>option.isEmpty
    }.map(_._1)
    if(treatAsUndirected){
      newEdges.map{
        case (vId1,vId2)=>(Math.min(vId1,vId2),Math.max(vId1,vId2))
      }.distinct()
    }else{
      newEdges
    }
  }

} 
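A usage sketch for the predictor, pairing it with the CommonNeighbours measure shown elsewhere in this project. The toy graph is an assumption, and the threshold of 0 assumes CommonNeighbours scores pairs with an integer common-neighbour count, so every unconnected pair sharing at least one neighbour is predicted.

import ml.sparkling.graph.operators.algorithms.link.BasicLinkPredictor
import ml.sparkling.graph.operators.measures.edge.CommonNeighbours
import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.{SparkConf, SparkContext}

object LinkPredictionSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("LinkPredictionSketch").setMaster("local[*]"))

    // Vertices 2 and 4 both neighbour 3 but share no edge, so they are a candidate pair
    val edges = sc.parallelize(Seq(Edge(1L, 2L, 1), Edge(2L, 3L, 1), Edge(1L, 3L, 1), Edge(3L, 4L, 1)))
    val graph = Graph.fromEdges(edges, defaultValue = 1)

    // Assumption: the threshold type for CommonNeighbours is an Int count
    val predicted = BasicLinkPredictor.predictLinks(graph, CommonNeighbours, 0, treatAsUndirected = true)
    predicted.collect().foreach(println)

    sc.stop()
  }
}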
Example 58
Source File: BetweennessEdmonds$Test.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.operators.measures.vertex.betweenness.edmonds

import java.nio.file.Files

import ml.sparkling.graph.operators.MeasureTest
import org.apache.commons.io.FileUtils
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Graph, VertexRDD}


class BetweennessEdmonds$Test(implicit sc: SparkContext) extends MeasureTest {
  val tempDir = Files.createTempDirectory("spark-checkpoint")

  override def beforeAll() = {
    sc.setCheckpointDir(tempDir.toAbsolutePath.toString)
  }

  override def afterAll() = {
    FileUtils.deleteDirectory(tempDir.toFile)
  }

  "Edmonds betweenness centrality for random graph" should "be correctly calculated" in {
    Given("graph")
    val filePath = getClass.getResource("/graphs/graph_ER_15")
    val graph: Graph[Int, Int] = loadGraph(filePath.toString)
    When("Computes betweenness")
    val result = EdmondsBC.computeBC(graph)
    Then("Should calculate betweenness correctly")
    val bcFile = getClass.getResource("/graphs/graph_ER_15_bc")
    val bcCorrectValues = sc.textFile(bcFile.getPath)
      .filter(_.nonEmpty)
      .map(l => { val t = l.split("\t", 2); (t(0).toInt, t(1).toDouble) })
      .sortBy({ case (vId, data) => vId })
      .map({ case (vId, data) => data}).collect()
    val bcValues = result.sortBy({ case (vId, data) => vId })
      .map({ case (vId, data) => data }).collect()
    bcCorrectValues.zip(bcValues).foreach({ case (a, b) =>
      a should be(b +- 1e-5)
    })

    result.unpersist(false)
  }

} 
Example 59
Source File: SSSPExample.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
// scalastyle:off println
package org.apache.spark.examples.graphx

// $example on$
import org.apache.spark.graphx.{Graph, VertexId}
import org.apache.spark.graphx.util.GraphGenerators
// $example off$
import org.apache.spark.sql.SparkSession


object SSSPExample {
  def main(args: Array[String]): Unit = {
    // Creates a SparkSession.
    val spark = SparkSession
      .builder
      .appName(s"${this.getClass.getSimpleName}")
      .getOrCreate()
    val sc = spark.sparkContext

    // $example on$
    // A graph with edge attributes containing distances
    val graph: Graph[Long, Double] =
      GraphGenerators.logNormalGraph(sc, numVertices = 100).mapEdges(e => e.attr.toDouble)
    val sourceId: VertexId = 42 // The ultimate source
    // Initialize the graph such that all vertices except the root have distance infinity.
    val initialGraph = graph.mapVertices((id, _) =>
        if (id == sourceId) 0.0 else Double.PositiveInfinity)
    val sssp = initialGraph.pregel(Double.PositiveInfinity)(
      (id, dist, newDist) => math.min(dist, newDist), // Vertex Program
      triplet => {  // Send Message
        if (triplet.srcAttr + triplet.attr < triplet.dstAttr) {
          Iterator((triplet.dstId, triplet.srcAttr + triplet.attr))
        } else {
          Iterator.empty
        }
      },
      (a, b) => math.min(a, b) // Merge Message
    )
    println(sssp.vertices.collect.mkString("\n"))
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 60
Source File: EigenvectorCentrality$Test.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.operators.measures.vertex.eigenvector

import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration
import ml.sparkling.graph.operators.MeasureTest
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import ml.sparkling.graph.operators.OperatorsDSL._

import scala.util.Random

class EigenvectorCentrality$Test(implicit sc:SparkContext)   extends MeasureTest  {




  "Eigenvector  for line graph" should "be correctly calculated" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/5_nodes_directed")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes eigenvector")
    val result=EigenvectorCentrality.compute(graph)
    Then("Should calculate eigenvector correctly")
    result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array(
      0d, 0d, 0d, 0d, 0d
    )).foreach{case (a,b)=>{a should be (b +- 1e-5 )}}
    graph.unpersist(true)
  }

  "Eigenvector  for line graph" should "be correctly calculated using DSL" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/5_nodes_directed")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes eigenvector")
    val result=graph.eigenvectorCentrality()
    Then("Should calculate eigenvector correctly")
    result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array(
      0d, 0d, 0d, 0d, 0d
    )).foreach{case (a,b)=>{a should be (b +- 1e-5 )}}
    graph.unpersist(true)
  }

  "Eigenvector  for full 4 node directed graph" should "be correctly calculated" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/4_nodes_full")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes eigenvector")
    val result=EigenvectorCentrality.compute(graph)
    Then("Should calculate eigenvector correctly")
    result.vertices.collect().sortBy{case (vId,data)=>vId}.map{case (vId,data)=>data}.zip(Array(
      0.32128186442503776, 0.5515795539542094, 0.6256715148839718, 0.44841176915201825
    )).foreach{case (a,b)=>{a should be (b +- 1e-5 )}}
    graph.unpersist(true)
  }

  "Eigenvector  for full 4 node undirected graph" should "be correctly calculated" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/4_nodes_full")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes eigenvector")
    val result=EigenvectorCentrality.compute(graph,VertexMeasureConfiguration[Int,Int](true))
    Then("Should calculate eigenvector correctly")
    result.vertices.collect().sortBy{case (vId,data)=>vId} should equal (Array(
      (1,0.5), (2,0.5), (3,0.5), (4,0.5)
    ))
    graph.unpersist(true)
  }

  "Eigenvector " should " take edge weight into account" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/4_nodes_full")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    val graphWeighted=graph.mapEdges(edge=>{
      1.0/(edge.srcId+edge.dstId)
    })
    When("Computes eigenvector")
    val resultUnweighted=EigenvectorCentrality.compute(graph,VertexMeasureConfiguration[Int,Int](true))
    val resultWeighted=EigenvectorCentrality.compute(graphWeighted,VertexMeasureConfiguration[Int,Double](true))
    Then("Should calculate eigenvector correctly")
    resultUnweighted.vertices.collect().sortBy{case (vId,data)=>vId} should not equal (
      resultWeighted.vertices.collect().sortBy{case (vId,data)=>vId})
    graph.unpersist(true)
    resultUnweighted.unpersist(true)
    resultWeighted.unpersist(true)
  }



} 
Example 61
Source File: AdamicAdar$Test.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.operators.measures.edge

import ml.sparkling.graph.operators.MeasureTest
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import ml.sparkling.graph.operators.OperatorsDSL._

class AdamicAdar$Test(implicit sc:SparkContext) extends MeasureTest {


   "Adamic/Adar for star graph" should "be 0 for each node" in{
     Given("graph")
     val filePath = getClass.getResource("/graphs/6_nodes_star")
     val graph:Graph[Int,Int]=loadGraph(filePath.toString)
     When("Computes Adamic/Adar")
     val result=AdamicAdar.computeWithPreprocessing(graph)
     Then("Should calculate Adamic/Adar")
     val resultValues=result.edges.map(_.attr).distinct().collect()
     resultValues(0) should equal(0)
     resultValues.size should equal(1)
     graph.unpersist(true)
   }

   "Adamic/Adar for full graph using DSL" should "be 1.8205 for each node" in{
     Given("graph")
     val filePath = getClass.getResource("/graphs/4_nodes_full")
     val graph:Graph[Int,Int]=loadGraph(filePath.toString)
     When("Computes Adamic/Adar")
     val result=graph.adamicAdar(true)
     Then("Should calculate Adamic/Adar")
     val resultValues=result.edges.map(_.attr).distinct().collect()
     resultValues(0) should equal(1.82047 +- 1e-5)
     resultValues.size should equal(1)
     graph.unpersist(true)
   }


 } 
Example 62
Source File: CommonNeighbours$Test.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.operators.measures.edge

import ml.sparkling.graph.operators.MeasureTest
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import ml.sparkling.graph.operators.OperatorsDSL._

class CommonNeighbours$Test (implicit sc:SparkContext)   extends MeasureTest {


  "Common neighbours for star graph" should "be 0 for each node" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/6_nodes_star")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes common neighbours")
    val result=CommonNeighbours.computeWithPreprocessing(graph)
    Then("Should calculate common neighbours")
    val resultValues=result.edges.map(_.attr).distinct().collect()
    resultValues(0) should equal(0)
    resultValues.size should equal(1)
  }

  "Common neighbours for full graph using DSL" should "be 2 for each node" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/4_nodes_full")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes common neighbours")
    val result=graph.commonNeighbours(true)
    Then("Should calculate common neighbours")
    val resultValues=result.edges.map(_.attr).distinct().collect()
    resultValues(0) should equal(2)
    resultValues.size should equal(1)
  }


} 
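
The same counting explains the common-neighbours expectations: in the complete 4 node graph the endpoints of any edge share exactly the two remaining vertices, while in the star graph a leaf and the hub share none. As a trivial check:

// In K_n the endpoints of every edge share the other n - 2 vertices.
val n = 4
val expectedCommonNeighbours = n - 2   // = 2
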
Example 63
Source File: NeighborhoodConnectivity$Test.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.operators.measures

import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration
import ml.sparkling.graph.operators.MeasureTest
import ml.sparkling.graph.operators.measures.vertex.NeighborhoodConnectivity
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import ml.sparkling.graph.operators.OperatorsDSL._

class NeighborhoodConnectivity$Test(implicit sc:SparkContext)  extends MeasureTest {



  "Neighbor connectivity for directed line graph" should "be correctly calculated" in {
    Given("graph")
    val filePath = getClass.getResource("/graphs/5_nodes_directed")
    val graph: Graph[Int, Int] = loadGraph(filePath.toString)
    When("Computes Neighbor connectivity ")
    val result = NeighborhoodConnectivity.compute(graph)
    Then("Should calculate Neighbor connectivity  correctly")
    val verticesSortedById=result.vertices.collect().sortBy{case (vId,data)=>vId}
    verticesSortedById .map{case (vId,data)=>data} should equal (Array(
      1d,1d,1d,0d,0d
    ))
    graph.unpersist(true)
  }

  "Neighbor connectivity for directed line graph" should "be correctly calculated when using DSL" in {
    Given("graph")
    val filePath = getClass.getResource("/graphs/5_nodes_directed")
    val graph: Graph[Int, Int] = loadGraph(filePath.toString)
    When("Computes Neighbor connectivity ")
    val result = graph.neighborhoodConnectivity()
    Then("Should calculate Neighbor connectivity  correctly")
    val verticesSortedById=result.vertices.collect().sortBy{case (vId,data)=>vId}
    verticesSortedById .map{case (vId,data)=>data} should equal (Array(
      1d,1d,1d,0d,0d
    ))
    graph.unpersist(true)
  }

  "Neighbor connectivity for undirected line graph" should "be correctly calculated" in {
    Given("graph")
    val filePath = getClass.getResource("/graphs/5_nodes_directed")
    val graph: Graph[Int, Int] = loadGraph(filePath.toString)
    When("Computes Neighbor connectivity ")
    val result = NeighborhoodConnectivity.compute(graph,VertexMeasureConfiguration[Int,Int](true))
    Then("Should calculate Neighbor connectivity  correctly")
    val verticesSortedById=result.vertices.collect().sortBy{case (vId,data)=>vId}
    verticesSortedById .map{case (vId,data)=>data} should equal (Array(
      2d,1.5,2d,1.5,2d
    ))
    graph.unpersist(true)
  }

  "Neighbor connectivity for full 4 node directed graph" should "be correctly calculated" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/4_nodes_full")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes Neighbor connectivity")
    val result=NeighborhoodConnectivity.compute(graph)
    Then("Should calculate Neighbor connectivity correctly")
    val verticesSortedById=result.vertices.collect().sortBy{case (vId,data)=>vId}
    verticesSortedById .map{case (vId,data)=>data} should equal (Array(
      1d,1d,2d,1.5
    ))
    graph.unpersist(true)
  }

  "Neighbor connectivity for full 4 node undirected graph" should "be correctly calculated" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/4_nodes_full")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Computes Neighbor connectivity")
    val result=NeighborhoodConnectivity.compute(graph,VertexMeasureConfiguration[Int,Int](true))
    Then("Should calculate Neighbor connectivity correctly")
    val verticesSortedById=result.vertices.collect().sortBy{case (vId,data)=>vId}
    verticesSortedById .map{case (vId,data)=>data} should equal (Array(
      3d,3d,3d,3d
    ))
    graph.unpersist(true)
  }

} 
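
Neighborhood connectivity, as exercised above, is the average degree of a vertex's neighbours (out-neighbours in the directed case). That is why the complete undirected 4 node graph yields 3.0 everywhere (three neighbours, each of degree 3) and why the sink end of the directed 5 node line gets 0. A small hand computation for the undirected complete graph, using hypothetical adjacency data in plain Scala:

// Average neighbour degree in K4: every vertex has 3 neighbours of degree 3.
val neighbours = Map(1 -> Seq(2, 3, 4), 2 -> Seq(1, 3, 4), 3 -> Seq(1, 2, 4), 4 -> Seq(1, 2, 3))
val degree = neighbours.map { case (v, ns) => v -> ns.size }
val connectivity = neighbours.map { case (v, ns) => v -> ns.map(degree).sum.toDouble / ns.size }
// connectivity == Map(1 -> 3.0, 2 -> 3.0, 3 -> 3.0, 4 -> 3.0)
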
Example 64
Source File: CommunityBasedPartitioning$Test.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.operators.partitioning

import ml.sparkling.graph.loaders.csv.CSVLoader
import ml.sparkling.graph.operators.MeasureTest
import ml.sparkling.graph.operators.algorithms.community.pscan.PSCAN
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import ml.sparkling.graph.operators.OperatorsDSL._

class CommunityBasedPartitioning$Test(implicit sc:SparkContext) extends MeasureTest {


  "One component graph " should  " have one partition" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/4_nodes_full")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Partition using PSCAN")
    val partitionedGraph: Graph[Int, Int] = CommunityBasedPartitioning.partitionGraphUsing(graph,PSCAN)
    Then("Should compute partitions correctly")
    partitionedGraph.edges.partitions.size  should equal (1)
    graph.unpersist(false)
  }

  "One component graph " should  " have one partition when calculated using DSL" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/4_nodes_full")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Partition using PSCAN")
    val partitionedGraph: Graph[Int, Int] =graph.partitionBy(PSCAN,1)
    Then("Should compute partitions correctly")
    partitionedGraph.edges.partitions.size  should equal (1)
    graph.unpersist(false)
  }

  "Five component graph " should  " have five partitions" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/5_nodes_directed")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Partition using PSCAN")
    val partitionedGraph: Graph[Int, Int] = CommunityBasedPartitioning.partitionGraphUsing(graph,PSCAN,5)
    Then("Should compute partitions correctly")
    partitionedGraph.edges.partitions.size  should equal (5)
    graph.unpersist(false)
  }

  "Three component graph " should  " have five partitions" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/coarsening_to_3")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Partition using PSCAN")
    val partitionedGraph: Graph[Int, Int] = CommunityBasedPartitioning.partitionGraphUsing(graph,PSCAN,3)
    Then("Should compute partitions correctly")
    partitionedGraph.edges.partitions.size  should equal (3)
    graph.unpersist(false)
  }

  "Change of community method parammeters" should  " be possible" in{
    Given("graph")
    val filePath = getClass.getResource("/graphs/5_nodes_directed")
    val graph:Graph[Int,Int]=loadGraph(filePath.toString)
    When("Partition using PSCAN")
    val partitionedGraph: Graph[Int, Int] = CommunityBasedPartitioning.partitionGraphBy(graph,PSCAN.computeConnectedComponents(_,epsilon = 0),1)
    Then("Should compute partitions correctly")
    partitionedGraph.edges.partitions.size  should equal (1)
    graph.unpersist(false)
  }
} 
Example 65
Source File: PSCANConnectedComponents.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.operators.algorithms.community.pscan

import org.apache.spark.graphx.{EdgeTriplet, Graph, Pregel, VertexId}


class PSCANConnectedComponents(minWeight:Double) extends Serializable{


  def run[VD,ED](graph:Graph[VertexId,Double], maxIterations:Int=Int.MaxValue):Graph[VertexId,Double]={
    val initialMessage = Long.MaxValue
    Pregel(graph, initialMessage,maxIterations = maxIterations)(
    vprog = (_, attr, msg) => math.min(attr, msg),
    sendMsg = sendMessage,
    mergeMsg = (a, b) => math.min(a, b))
  }

  def sendMessage(edge: EdgeTriplet[VertexId, Double]): Iterator[(VertexId, VertexId)] = {
    if(edge.attr > minWeight){
      if(edge.srcAttr<edge.dstAttr){
        Iterator((edge.dstId,edge.srcAttr))
      }else if(edge.dstAttr<edge.srcAttr){
        Iterator((edge.srcId,edge.dstAttr))
      }else{
        Iterator.empty
      }
    }else{
      Iterator.empty
    }
  }
} 
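
PSCANConnectedComponents is a thresholded connected-components pass: each vertex starts labelled with its own id, and Pregel repeatedly pushes the smaller label across every edge whose weight is strictly greater than minWeight, so a vertex ends up with the minimum vertex id of its component in the thresholded graph. A minimal usage sketch on a hypothetical two-edge toy graph (assumes a SparkContext and the class above on the classpath):

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Edge, Graph, VertexId}

def example(sc: SparkContext): Graph[VertexId, Double] = {
  // Vertices start labelled with their own id; the 3-4 edge falls below the threshold.
  val vertices = sc.parallelize(Seq(1L, 2L, 3L, 4L).map(id => (id, id)))
  val edges = sc.parallelize(Seq(Edge(1L, 2L, 1.0), Edge(3L, 4L, 0.2)))
  val graph: Graph[VertexId, Double] = Graph(vertices, edges)
  // Expected labels: 1 -> 1, 2 -> 1, 3 -> 3, 4 -> 4
  new PSCANConnectedComponents(minWeight = 0.5).run(graph)
}
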
Example 66
Source File: BetweennessHua$Test.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.operators.measures.vertex.betweenness.hua

import java.nio.file.Files

import ml.sparkling.graph.operators.MeasureTest
import ml.sparkling.graph.operators.measures.vertex.betweenness.edmonds.EdmondsBC
import org.apache.commons.io.FileUtils
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.scalatest.tagobjects.Slow


class BetweennessHua$Test (implicit sc: SparkContext) extends MeasureTest {
  val tempDir = Files.createTempDirectory("spark-checkpoint")

  override def beforeAll() = {
    sc.setCheckpointDir(tempDir.toAbsolutePath.toString)
  }

  override def afterAll() = {
    FileUtils.deleteDirectory(tempDir.toFile)
  }

  "Hua betweenness centrality for random graph" should "be correctly calculated" in {
    Given("graph")
    val filePath = getClass.getResource("/graphs/graph_ER_15")
    val graph: Graph[Int, Int] = loadGraph(filePath.toString)
    When("Computes betweenness")
    val result = HuaBC.computeBC(graph)
    Then("Should calculate betweenness correctly")
    val bcFile = getClass.getResource("/graphs/graph_ER_15_bc")
    val bcCorrectValues = sc.textFile(bcFile.getPath)
      .filter(_.nonEmpty)
      .map(l => { val t = l.split("\t", 2); (t(0).toInt, t(1).toDouble) })
      .sortBy({ case (vId, data) => vId })
      .map({ case (vId, data) => data}).collect()
    val bcValues = result.sortBy({ case (vId, data) => vId })
      .map({ case (vId, data) => data }).collect()
    bcCorrectValues.zip(bcValues).foreach({ case (a, b) =>
      a should be(b +- 1e-5)
    })

    result.unpersist(false)
  }

  "Hua betweenness centrality for random graph" should "take no longer then Edmonds" taggedAs(Slow) in {
    Given("graph")
    val filePath = getClass.getResource("/graphs/graph_ER_15")
    val graph: Graph[Int, Int] = loadGraph(filePath.toString)
    When("computes betwenness centrality")
    val (_, edmondsTime) = time("Edmonds algorithm for betweenness centrality")(EdmondsBC.computeBC(graph))
    val (_, huaTime) = time("Hua algorithm for betweenness centrality")(HuaBC.computeBC(graph))
    Then("Hua algorithm should be faster")
    huaTime should be <= edmondsTime
  }

} 
Example 67
Source File: Modularity.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.operators.measures.graph

import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.ComponentID
import ml.sparkling.graph.api.operators.measures.{VertexDependentGraphMeasure, GraphIndependentMeasure}
import org.apache.spark.graphx.{EdgeTriplet, VertexRDD, Graph}
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag


object Modularity extends VertexDependentGraphMeasure[Double,ComponentID]{

   def compute[V<:ComponentID:ClassTag,E:ClassTag](graph: Graph[V, E]): Double = {
     val edgesNum=graph.numEdges.toDouble;
     val edgesCounts: RDD[(V, (Int, Int))] = graph.triplets.flatMap(triplet => {
       if (triplet.srcAttr == triplet.dstAttr) {
         Iterator((triplet.srcAttr, (1, 0)),(triplet.srcAttr, (1, 0)))
       } else {
         Iterator((triplet.srcAttr, (0, 1)),(triplet.dstAttr,(0,1)))
       }
     })
     edgesCounts.aggregateByKey((0,0))(
       (agg:(Int,Int),data:(Int,Int))=>
         (agg,data) match{
           case ((a1,b1),(a2,b2))=>(a1+a2,b1+b2)
         },
     (agg1:(Int,Int),agg2:(Int,Int))=>{
       (agg1,agg2) match{
         case ((a1,b1),(a2,b2))=>(a1+a2,b1+b2)
       }
     }
     ).treeAggregate(0.0)(
       (agg:Double,data:(V,(Int,Int)))=>{
         data match{
           case (_,(edgesFull,edgesSome))=>
             agg+(edgesFull/(2.0*edgesNum))-Math.pow((edgesSome+edgesFull)/(2.0*edgesNum),2)
         }
       },
       (agg1,agg2)=>agg1+agg2
     )

  }

} 
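
In compute, an intra-community edge contributes (1, 0) twice to its community, while a cross-community edge contributes (0, 1) to each of the two communities it touches. Per community, edgesFull is therefore twice the number of internal edges and edgesFull + edgesSome is the community's total degree, so the final aggregate is the usual modularity Q = sum over communities of (l_c / m - (d_c / (2m))^2), where m is the number of edges, l_c the internal edges and d_c the total degree of community c. A minimal usage sketch via the DSL (assumes an already loaded graph: Graph[Int, Int] and that PSCAN produces a community-labelled graph accepted by modularity()):

import ml.sparkling.graph.operators.OperatorsDSL._
import org.apache.spark.graphx.Graph

// Detect communities with PSCAN, then score the assignment with Modularity.
def scoreCommunities(graph: Graph[Int, Int]): Double =
  graph.PSCAN(epsilon = 0.1).modularity()
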
Example 68
Source File: CommunityBasedPartitioning.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.operators.partitioning


import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.{CommunityDetectionAlgorithm, CommunityDetectionMethod, ComponentID}
import ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.DefaultPartitionOperator
import org.apache.log4j.Logger
import org.apache.spark.{Partitioner, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, PartitionID, PartitionStrategy, VertexId}

import scala.reflect.ClassTag


object CommunityBasedPartitioning {
  @transient
  val logger=Logger.getLogger(CommunityBasedPartitioning.getClass())

  def partitionGraphBy[VD:ClassTag,ED:ClassTag](graph:Graph[VD,ED],communityDetectionMethod:CommunityDetectionMethod[VD,ED],numParts:Int= -1)(implicit sc:SparkContext): Graph[VD, ED] ={
    val numberOfPartitions=if (numParts== -1) sc.defaultParallelism else numParts
    val communities: Graph[ComponentID, ED] = communityDetectionMethod(graph)
    val numberOfCommunities=communities.vertices.values.countApproxDistinct()
    val (coarsedVertexMap,coarsedNumberOfPartitions) = ParallelPartitioningUtils.coarsePartitions(numberOfPartitions,numberOfCommunities,communities.vertices)
    val strategy=ByComponentIdPartitionStrategy(coarsedVertexMap,coarsedNumberOfPartitions, DefaultPartitionOperator)
    logger.info(s"Partitioning graph using coarsed map with ${coarsedVertexMap.size} entries  and ${coarsedNumberOfPartitions} partitions")
    val out=graph.partitionBy(strategy,numberOfCommunities.toInt).cache()
    out.edges.foreachPartition((_)=>{})
    out.vertices.foreachPartition((_)=>{})
    out
  }


  def partitionGraphUsing[VD:ClassTag,ED:ClassTag](graph:Graph[VD,ED],communityDetectionMethod:CommunityDetectionAlgorithm,numParts:Int= -1)(implicit sc:SparkContext): Graph[VD, ED] ={
    partitionGraphBy(graph,communityDetectionMethod.detectCommunities[VD,ED](_),numParts)
  }



} 
Example 69
Source File: PSCANBasedPartitioning.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.operators.partitioning

import java.util.UUID

import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.ComponentID
import ml.sparkling.graph.operators.algorithms.community.pscan.PSCAN
import ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.{DefaultPartitionOperator, logger}
import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Graph, VertexId}

import scala.collection.mutable
import scala.reflect.ClassTag


object PSCANBasedPartitioning {

  @transient
  val logger=Logger.getLogger(PSCANBasedPartitioning.getClass())

  def partitionGraphBy[VD:ClassTag,ED:ClassTag](graph:Graph[VD,ED],numberOfPartitions:Int, maxIterations:Int = Int.MaxValue)(implicit sc:SparkContext): Graph[VD, ED] ={
    val (numberOfCommunities: VertexId,  coarsedVertexMap: Map[VertexId, Int], coarsedNumberOfPartitions: Int, strategy: ByComponentIdPartitionStrategy) = buildPartitioningStrategy(graph, numberOfPartitions, maxIterations = maxIterations)
    logger.info(s"Partitioning graph using coarsed map with ${coarsedVertexMap.size} entries and ${coarsedNumberOfPartitions} partitions (before ${numberOfCommunities})")
    val out=graph.partitionBy(strategy,numberOfPartitions).cache()
    out.edges.foreachPartition((_)=>{})
    out.triplets.foreachPartition((_)=>{})
    out.vertices.foreachPartition((_)=>{})
    out
  }


  def buildPartitioningStrategy[ED: ClassTag, VD: ClassTag](graph: Graph[VD, ED], numberOfPartitions: Int, maxIterations:Int = Int.MaxValue)(implicit sc:SparkContext) = {
    val (numberOfCommunities: VertexId, coarsedVertexMap: Map[VertexId, Int], coarsedNumberOfPartitions: Int) = precomputePartitions(graph, numberOfPartitions, maxIterations = maxIterations)
    logger.info(s"Requested $numberOfPartitions partitions, computed $coarsedNumberOfPartitions")
    val strategy = ByComponentIdPartitionStrategy(coarsedVertexMap, numberOfPartitions, DefaultPartitionOperator)
    (numberOfCommunities, coarsedVertexMap, coarsedNumberOfPartitions, strategy)
  }

  def precomputePartitions[ED: ClassTag, VD: ClassTag](graph: Graph[VD, ED], numberOfPartitions: Int, maxIterations:Int = Int.MaxValue)(implicit sc:SparkContext) = {
    logger.info("Computing components using PSCAN")
    val (communities, numberOfCommunities): (Graph[ComponentID, ED], VertexId) = PSCAN.computeConnectedComponentsUsing(graph, numberOfPartitions, maxIterations = maxIterations)
    val computationData=communities.vertices.map(t=>t).localCheckpoint()
    logger.info("Components computed!")
    val (coarsedVertexMap, coarsedNumberOfPartitions) = ParallelPartitioningUtils.coarsePartitions(numberOfPartitions, numberOfCommunities, computationData)
    (numberOfCommunities, coarsedVertexMap, coarsedNumberOfPartitions)
  }
} 
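
A minimal sketch of driving PSCANBasedPartitioning directly (assumes an already loaded graph: Graph[Int, Int] and an implicit SparkContext in scope; the partition count of 4 is arbitrary):

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph

// Repartition so that edges of the same PSCAN community share a partition.
def repartition(graph: Graph[Int, Int])(implicit sc: SparkContext): Graph[Int, Int] =
  PSCANBasedPartitioning.partitionGraphBy(graph, numberOfPartitions = 4)
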
Example 70
Source File: VertexMeasureConfigurationTest.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.api.operators.measures

import ml.sparkling.graph.api.operators.IterativeComputation.BucketSizeProvider
import org.apache.spark.graphx.Graph
import org.scalatest.{FlatSpec, GivenWhenThen}



class VertexMeasureConfigurationTest extends FlatSpec with GivenWhenThen  {

  "Creation without parameters" should "be possible" in{
    VertexMeasureConfiguration()
  }

  "Creation with undirected flag" should "be possible" in{
    Given("Directed flag")
    val flag=false
    When("Configuration creation")
    VertexMeasureConfiguration(treatAsUndirected = flag )
  }

  "Creation with bucket size provider" should "be possible" in{
    Given("Bucker size provider")
    val provider:BucketSizeProvider[Long,Long]=(g:Graph[Long,Long])=>1l
    When("Configuration creation")
    VertexMeasureConfiguration(bucketSizeProvider = provider)
  }

  "Creation with bucket size provider and directed flag" should "be possible" in{
    Given("Bucker size provider")
    val provider:BucketSizeProvider[Long,Long]=(g:Graph[Long,Long])=>1l
    When("Configuration creation")
    VertexMeasureConfiguration( false, provider)
  }
} 
Example 71
Source File: GraphLoading.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.api.loaders

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph

import scala.reflect.ClassTag


object GraphLoading {
  trait GraphLoader[VD,ED]{
    def load(parameters:List[Parameter])(implicit sc:SparkContext):Graph[VD,ED]
  }

  trait TypedGraphLoader[VD2,ED2] extends GraphLoader[VD2,ED2]{
    def load[VD:ClassTag,ED:ClassTag](parameters:List[Parameter])(implicit sc:SparkContext):Graph[VD,ED]
  }
  
  trait FromPathLoader[VD,ED] {
    def apply(path:String):GraphLoader[VD,ED]
  }
  
  object LoadGraph{
    def from[VD:ClassTag,ED:ClassTag](graphLoader: GraphLoader[VD,ED]):GraphLoaderConfigurator[VD,ED]={
      GraphLoaderConfigurator(List.empty,graphLoader)
    }
  }
  
  case  class GraphLoaderConfigurator[VD:ClassTag,ED:ClassTag](parameters:List[Parameter], loader:GraphLoader[_,_]){
    def using(parameter:Parameter)={
      GraphLoaderConfigurator[VD,ED](parameter::parameters,loader)
    }
    
    def load[VD:ClassTag,ED:ClassTag]()(implicit sc:SparkContext): Graph[VD,ED] ={
      loader match{
        case typed:TypedGraphLoader[_,_]=>typed.load[VD,ED](parameters)
        case normal:GraphLoader[VD @unchecked,ED @unchecked] => normal.load(parameters)
      }
    }

  }

  trait Parameter
  
  trait WithValueParameter[V] extends Parameter{
    def value:V
  }


} 
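
LoadGraph is a small builder around the GraphLoader trait: from(loader) starts the configuration, using(parameter) accumulates parameters, and load() dispatches to the loader. A minimal sketch with a hypothetical in-memory loader (not part of the library; real loaders such as the CSV loader read from external sources):

import ml.sparkling.graph.api.loaders.GraphLoading.{GraphLoader, LoadGraph, Parameter}
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Edge, Graph}

// Hypothetical loader that ignores parameters and builds a tiny graph in memory.
object InMemoryLoader extends GraphLoader[Int, Int] {
  def load(parameters: List[Parameter])(implicit sc: SparkContext): Graph[Int, Int] =
    Graph.fromEdges(sc.parallelize(Seq(Edge(1L, 2L, 1), Edge(2L, 3L, 1))), defaultValue = 0)
}

def example(implicit sc: SparkContext): Graph[Int, Int] =
  LoadGraph.from(InMemoryLoader).load[Int, Int]()
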
Example 72
Source File: ShortestPathLengthsFromCSV.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.examples

import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes
import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes._
import ml.sparkling.graph.operators.algorithms.shortestpaths.ShortestPathsAlgorithm
import ml.sparkling.graph.operators.algorithms.shortestpaths.pathprocessors.fastutils.FastUtilWithDistance.DataMap
import ml.sparkling.graph.operators.predicates.AllPathPredicate
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, VertexId}

import scala.collection.JavaConversions._

object ShortestPathLengthsFromCSV extends ExampleApp {
def body()={
  val shortestPaths =if(bucketSize == -1l)
    ShortestPathsAlgorithm.computeShortestPathsLengths(partitionedGraph,AllPathPredicate,treatAsUndirected)
  else
    ShortestPathsAlgorithm.computeShortestPathsLengthsIterative(partitionedGraph,(g:Graph[_,_])=>bucketSize,treatAsUndirected)
  val size: Broadcast[VertexId] =ctx.broadcast(partitionedGraph.numVertices)
  partitionedGraph.outerJoinVertices(shortestPaths.vertices)(Util.dataTransformFunction(size) _).vertices.values.saveAsTextFile(out)
  ctx.stop()
}
}


private object Util{
  def dataTransformFunction(size: Broadcast[VertexId])(vId: VertexId,oldValue: String,pathsOption: Option[_ >: DataMap <: JMap[JLong, JDouble]])={
    pathsOption.flatMap((paths)=>{
      var entries=paths.entrySet().toList.sortBy(_.getKey)
      val out=new StringBuilder()
      out++=s"${oldValue},"
      var a = 0l
      while (a < size.value) {
        if (entries.size > 0 && a == entries.head.getKey) {
          out ++= s"${entries.head.getValue},"
          entries = entries.drop(1)
        }
        else {
          out ++= "0,"
        }
        a += 1l
      }
      out.setLength(out.length - 1)
      Option(out.toString())
    }).getOrElse(oldValue)
  }
} 
Example 73
Source File: GraphDescriptionFromCSV.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package ml.sparkling.graph.examples

import ml.sparkling.graph.api.operators.measures.VertexMeasureConfiguration
import ml.sparkling.graph.experiments.describe.GraphDescriptor._
import org.apache.log4j.Logger
import org.apache.spark.graphx.Graph



object GraphDescriptionFromCSV extends ExampleApp {
  def body()={
  val configuration = if (bucketSize == -1l) {
    val graphSize=1000l
    logger.info(s"BUCKET SIZE WILL BE EQUAL TO 1000!!")
    VertexMeasureConfiguration[String,Double](treatAsUndirected,(g:Graph[String,Double])=>graphSize)
  }
  else
    VertexMeasureConfiguration[String,Double](treatAsUndirected,(g:Graph[String,Double])=>bucketSize)
    val groupedGraph=partitionedGraph.groupEdges((a,b)=>a)
    groupedGraph.describeGraphToDirectory(out, configuration)
  ctx.stop()
  }
}