org.apache.spark.graphx.Edge Scala Examples

The following examples show how to use org.apache.spark.graphx.Edge. Each example comes from an open-source project; the source file, project, and license are noted above the code.
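For orientation, here is a minimal, self-contained sketch of the Edge type before the project examples (the object name EdgeMinimalExample and the sample edge data are illustrative only, not taken from any project below): an Edge[ED] pairs a source and a destination VertexId with an attribute of type ED, and Graph.fromEdges builds a Graph from an edge RDD, deriving the vertex set and assigning each vertex a default attribute.

import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object EdgeMinimalExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("EdgeMinimalExample").setMaster("local[*]"))

    // Each Edge[ED] carries a source VertexId, a destination VertexId and an attribute of type ED.
    val edges: RDD[Edge[String]] = sc.parallelize(Seq(
      Edge(1L, 2L, "follows"),
      Edge(2L, 3L, "follows")))

    // Graph.fromEdges derives the vertex set from the edges and gives every vertex the default attribute (0 here).
    val graph: Graph[Int, String] = Graph.fromEdges(edges, 0)
    graph.triplets.collect().foreach(println)

    sc.stop()
  }
}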
Example 1
Source File: FindInfluencer.scala    From spark-graphx-twitter   with Apache License 2.0
package com.knoldus.spark.graphx.example

import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object FindInfluencer {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("Twitter Influencer").setMaster("local[*]")
    val sparkContext = new SparkContext(conf)
    sparkContext.setLogLevel("ERROR")

    val twitterData = sparkContext.textFile("src/main/resources/twitter-graph-data.txt")

    val followeeVertices: RDD[(VertexId, String)] = twitterData.map(_.split(",")).map { arr =>
      val user = arr(0).replace("((", "")
      val id = arr(1).replace(")", "")
      (id.toLong, user)
    }

    val followerVertices: RDD[(VertexId, String)] = twitterData.map(_.split(",")).map { arr =>
      val user = arr(2).replace("(", "")
      val id = arr(3).replace("))", "")
      (id.toLong, user)
    }

    val vertices = followeeVertices.union(followerVertices)
    val edges: RDD[Edge[String]] = twitterData.map(_.split(",")).map { arr =>
      val followeeId = arr(1).replace(")", "").toLong
      val followerId = arr(3).replace("))", "").toLong
      Edge(followeeId, followerId, "follow")
    }

    val defaultUser = ""
    val graph = Graph(vertices, edges, defaultUser)

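    // Two Pregel supersteps along incoming edges: each message carries the destination
    // vertex's attribute to the source vertex, so every followee accumulates the names
    // of its followers (and of their followers after the second superstep).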
    val subGraph = graph.pregel("", 2, EdgeDirection.In)((_, attr, msg) =>
      attr + "," + msg,
      triplet => Iterator((triplet.srcId, triplet.dstAttr)),
      (a, b) => (a + "," + b))

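    // Pick the vertex that accumulated the most distinct names; the "- 2" discounts
    // the vertex's own name and the empty initial message.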
    val lengthRDD = subGraph.vertices.map(vertex => (vertex._1, vertex._2.split(",").distinct.length - 2)).max()(new Ordering[Tuple2[VertexId, Int]]() {
      override def compare(x: (VertexId, Int), y: (VertexId, Int)): Int =
        Ordering[Int].compare(x._2, y._2)
    })

    val userId = graph.vertices.filter(_._1 == lengthRDD._1).map(_._2).collect().head
    println(userId + " has the maximum influence on the network with " + lengthRDD._2 + " followers.")

    sparkContext.stop()
  }
} 
Example 2
Source File: SparkPersistence.scala    From csb   with GNU General Public License v3.0
package edu.msstate.dasi.csb.persistence

import java.io.File

import edu.msstate.dasi.csb.model.{EdgeData, VertexData}
import edu.msstate.dasi.csb.sc
import edu.msstate.dasi.csb.util.Util
import org.apache.hadoop.fs.FileUtil
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.storage.StorageLevel

object SparkPersistence extends GraphPersistence {
  private val vertices_suffix = "_vertices"
  private val edges_suffix = "_edges"

  
  def saveAsText(graph: Graph[VertexData, EdgeData], graphName: String, overwrite: Boolean = false): Unit = {
    val verticesPath = graphName + vertices_suffix
    val verticesTmpPath = "__" + verticesPath
    val edgesPath = graphName + edges_suffix
    val edgesTmpPath = "__" + edgesPath

    if (overwrite) {
      FileUtil.fullyDelete(new File(verticesPath))
      FileUtil.fullyDelete(new File(edgesPath))
    }

    graph.vertices.saveAsTextFile(verticesTmpPath)
    Util.merge(verticesTmpPath, verticesPath)
    FileUtil.fullyDelete(new File(verticesTmpPath))

    graph.edges.saveAsTextFile(edgesTmpPath)
    Util.merge(edgesTmpPath, edgesPath)
    FileUtil.fullyDelete(new File(edgesTmpPath))
  }
} 
Example 3
Source File: PageRank.scala    From MaxCompute-Spark   with Apache License 2.0
package com.aliyun.odps.spark.examples.graphx

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object PageRank {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("PageRank")
      .getOrCreate()
    val sc = spark.sparkContext

    // build vertices
    val users: RDD[(VertexId, Array[String])] = sc.parallelize(List(
      "1,BarackObama,Barack Obama",
      "2,ladygaga,Goddess of Love",
      "3,jeresig,John Resig",
      "4,justinbieber,Justin Bieber",
      "6,matei_zaharia,Matei Zaharia",
      "7,odersky,Martin Odersky",
      "8,anonsys"
    ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail)))

    // build edges
    val followers: RDD[Edge[Double]] = sc.parallelize(Array(
      Edge(2L, 1L, 1.0),
      Edge(4L, 1L, 1.0),
      Edge(1L, 2L, 1.0),
      Edge(6L, 3L, 1.0),
      Edge(7L, 3L, 1.0),
      Edge(7L, 6L, 1.0),
      Edge(6L, 7L, 1.0),
      Edge(3L, 7L, 1.0)
    ))

    // build graph
    val followerGraph: Graph[Array[String], Double] = Graph(users, followers)

    // restrict the graph to users with usernames and names
    val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2)

    // compute PageRank
    val pageRankGraph = subgraph.pageRank(0.001)

    // get attributes of the top pagerank users
    val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) {
      case (uid, attrList, Some(pr)) => (pr, attrList.toList)
      case (uid, attrList, None) => (0.0, attrList.toList)
    }

    println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n"))
  }
} 
Example 4
Source File: PageRank.scala    From MaxCompute-Spark   with Apache License 2.0
package com.aliyun.odps.spark.examples.graphx

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object PageRank {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PageRank")
    val sc = new SparkContext(conf)

    // build vertices
    val users: RDD[(VertexId, Array[String])] = sc.parallelize(List(
      "1,BarackObama,Barack Obama",
      "2,ladygaga,Goddess of Love",
      "3,jeresig,John Resig",
      "4,justinbieber,Justin Bieber",
      "6,matei_zaharia,Matei Zaharia",
      "7,odersky,Martin Odersky",
      "8,anonsys"
    ).map(line => line.split(",")).map(parts => (parts.head.toLong, parts.tail)))

    // build edges
    val followers: RDD[Edge[Double]] = sc.parallelize(Array(
      Edge(2L, 1L, 1.0),
      Edge(4L, 1L, 1.0),
      Edge(1L, 2L, 1.0),
      Edge(6L, 3L, 1.0),
      Edge(7L, 3L, 1.0),
      Edge(7L, 6L, 1.0),
      Edge(6L, 7L, 1.0),
      Edge(3L, 7L, 1.0)
    ))

    // build graph
    val followerGraph: Graph[Array[String], Double] = Graph(users, followers)

    // restrict the graph to users with usernames and names
    val subgraph = followerGraph.subgraph(vpred = (vid, attr) => attr.size == 2)

    // compute PageRank
    val pageRankGraph = subgraph.pageRank(0.001)

    // get attributes of the top pagerank users
    val userInfoWithPageRank = subgraph.outerJoinVertices(pageRankGraph.vertices) {
      case (uid, attrList, Some(pr)) => (pr, attrList.toList)
      case (uid, attrList, None) => (0.0, attrList.toList)
    }

    println(userInfoWithPageRank.vertices.top(5)(Ordering.by(_._2._1)).mkString("\n"))
  }
} 
Example 5
Source File: GodwinTest.scala    From Mastering-Spark-for-Data-Science   with MIT License
package io.gzet.timeseries.graph

import io.gzet.test.SparkFunSuite
import org.apache.log4j.{Logger, Level}
import org.apache.spark.graphx.{Graph, Edge}
import org.apache.spark.rdd.RDD

import scala.io.Source

class GodwinTest extends SparkFunSuite {

  Logger.getLogger("akka").setLevel(Level.OFF)
  Logger.getLogger("org").setLevel(Level.OFF)

  def buildEdges() = {
    Source.fromInputStream(getClass.getResourceAsStream("/edges.csv")).getLines().drop(1).map(s => {
      val Array(source, target, weight) = s.split(",")
      Edge(source.toLong, target.toLong, weight.toDouble)
    }).toList
  }

  localTest("Test Random Walks") { sc =>
    val edges: RDD[Edge[Double]] = sc.parallelize(buildEdges(), 1)
    val godwin = new Godwin(Seq(16))
    val walks = godwin.randomWalks(Graph.fromEdges(edges, 0L), 4).collect().sortBy(_._2)
    println(walks.map(_._1).mkString(" -> "))
    walks.last._1 should be(16)
  }

} 
Example 6
Source File: GzetCommunitiesTest.scala    From Mastering-Spark-for-Data-Science   with MIT License
package io.gzet.community

import io.gzet.community.clustering.wcc.WCCDetection
import io.gzet.test.SparkFunSuite
import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx.{Graph, Edge}

import scala.io.Source

class GzetCommunitiesTest extends SparkFunSuite {

  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  localTest("WCC communities") { spark =>

    val lines = Source.fromInputStream(getClass.getResourceAsStream("/local-edges.csv")).getLines().zipWithIndex.filter(_._2 > 0).map(_._1).toSeq
    val sc = spark.sparkContext
    val edges = sc.parallelize(lines).map({ line =>
      val a = line.split(",").map(_.toLong).sorted
      Edge(a.head, a.last, 1L)
    }).distinct()

    val graph = Graph.fromEdges(edges, 0L)

    graph.triplets.take(2).foreach(println)
    val communities = new WCCDetection(1).run(graph, sc)
    communities.map(_._2 -> 1).reduceByKey(_+_).collectAsMap() should be(Map(5L -> 5, 15L -> 6, 21L -> 5))
  }
} 
Example 7
Source File: StoryBatchDedup.scala    From Mastering-Spark-for-Data-Science   with MIT License
package io.gzet.story

import io.gzet.story.model.{Content, Article}
import org.apache.spark.graphx.{Graph, Edge}
import org.apache.spark.{Logging, SparkConf, SparkContext}
import io.gzet.story.util.SimhashUtils._
import com.datastax.spark.connector._

object StoryBatchDedup extends SimpleConfig with Logging {

  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf().setAppName("Story Extractor")
    val sc = new SparkContext(sparkConf)

    val simhashRDD = sc.cassandraTable[Article]("gzet", "articles").zipWithIndex().map({ case (a, id) =>
      ((id, Content(a.url, a.title, a.body)), a.hash)
    })

    val duplicateTupleRDD = simhashRDD.flatMap({ case ((id, content), simhash) =>
      searchmasks.map({ mask =>
        (simhash ^ mask, id)
      })
    }).groupByKey()

    val edgeRDD = duplicateTupleRDD.values.flatMap({ it =>
      val list = it.toList
      for (x <- list; y <- list) yield (x, y)
    }).filter({ case (x, y) =>
      x != y
    }).distinct().map({case (x, y) =>
      Edge(x, y, 0)
    })

    val duplicateRDD = Graph.fromEdges(edgeRDD, 0L)
      .connectedComponents()
      .vertices
      .join(simhashRDD.keys)
      .values

    duplicateRDD.sortBy(_._1).collect().foreach({ case (story, content) =>
      println(story + "\t" + content.title)
    })

  }

} 
Example 8
Source File: Neo4jGraphScalaTSE.scala    From neo4j-spark-connector   with Apache License 2.0
package org.neo4j.spark

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.junit.Assert._
import org.junit._

import scala.collection.JavaConverters._

object Neo4jGraphScalaTSE {

}


class Neo4jGraphScalaTSE extends SparkConnectorScalaBaseTSE {
  val FIXTURE: String = "CREATE (s:A {a:0})-[r:REL {foo:'bar'}]->(t:B {b:1}) RETURN id(s) AS source, id(t) AS target"

  private var source: Long = _
  private var target: Long = _

  @Before
  @throws[Exception]
  def setUp {
    val map = SparkConnectorScalaSuiteIT.session().run(FIXTURE).single()
      .asMap()
    source = map.get("source").asInstanceOf[Long]
    target = map.get("target").asInstanceOf[Long]
  }

  private def assertGraph(graph: Graph[_, _], expectedNodes: Long, expectedRels: Long) = {
    assertEquals(expectedNodes, graph.vertices.count)
    assertEquals(expectedRels, graph.edges.count)
  }

  @Test def runCypherQueryWithParams {
    val data = List(Map("id"->1,"name"->"Test").asJava).asJava
    Executor.execute(sc, "UNWIND $data as row CREATE (n:Test {id:row.id}) SET n.name = row.name", Map(("data",data)))
  }
  @Test def runMatrixQuery {
    val graph = Neo4jGraph.loadGraph(sc, "A", Seq.empty, "B")
    assertGraph(graph, 2, 1)
  }

  @Test def saveGraph {
    val edges : RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(source,target,42L)))
    val graph = Graph.fromEdges(edges,-1)
    assertGraph(graph, 2, 1)
    Neo4jGraph.saveGraph(sc,graph,null,("REL","test"))
    assertEquals(42L, SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL]->(:B) RETURN rel.test as prop").single().get("prop").asLong())
  }

  @Test def saveGraphMerge {
    val edges : RDD[Edge[Long]] = sc.makeRDD(Seq(Edge(source,target,42L)))
    val graph = Graph.fromEdges(edges,13L)
    assertGraph(graph, 2, 1)
    Neo4jGraph.saveGraph(sc,graph,"value",("FOOBAR","test"),Option(("Foo","id")),Option(("Bar","id")),merge = true)
    assertEquals(Map("fid"->source,"bid"->target,"rv"->42L,"fv"->13L,"bv"->13L).asJava,SparkConnectorScalaSuiteIT.session().run("MATCH (foo:Foo)-[rel:FOOBAR]->(bar:Bar) RETURN {fid: foo.id, fv:foo.value, rv:rel.test,bid:bar.id,bv:bar.value} as data").single().get("data").asMap())
  }
  @Test def saveGraphByNodeLabel {
    val edges : RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(0,1,42L)))
    val graph = Graph.fromEdges(edges,-1)
    assertGraph(graph, 2, 1)
    Neo4jGraph.saveGraph(sc,graph,null,("REL","test"),Option(("A","a")),Option(("B","b")))
    assertEquals(42L,SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL]->(:B) RETURN rel.test as prop").single().get("prop").asLong())
  }
  @Test def mergeGraphByNodeLabel {
    val edges : RDD[Edge[VertexId]] = sc.makeRDD(Seq(Edge(source,target,42L)))
    val graph = Graph.fromEdges(edges,-1)
    assertGraph(graph, 2, 1)
    Neo4jGraph.saveGraph(sc,graph,null,("REL2","test"),merge = true)
    assertEquals(42L,SparkConnectorScalaSuiteIT.session().run("MATCH (:A)-[rel:REL2]->(:B) RETURN rel.test as prop").single().get("prop").asLong())
  }

  @Test def saveGraphNodes {
    val nodes : RDD[(VertexId, Long)] = sc.makeRDD(Seq((source,10L),(target,20L)))
    val edges : RDD[Edge[Long]] = sc.makeRDD(Seq())
    val graph = Graph[Long,Long](nodes,edges,-1)
    assertGraph(graph, 2, 0)
    Neo4jGraph.saveGraph(sc,graph,"prop")
    assertEquals(10L,SparkConnectorScalaSuiteIT.session().run(s"MATCH (a:A) WHERE id(a) = $source RETURN a.prop as prop").single().get("prop").asLong())
    assertEquals(20L,SparkConnectorScalaSuiteIT.session().run(s"MATCH (b:B) WHERE id(b) = $target RETURN b.prop as prop").single().get("prop").asLong())
  }
} 
Example 9
Source File: PairwiseBPSuite.scala    From sandpiper   with Apache License 2.0
package sparkle.graph

import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.rdd.RDD
import org.scalatest.FunSuite
import sparkle.util.LocalSparkContext

class PairwiseBPSuite  extends FunSuite with LocalSparkContext {

  test("Pairwise BP test") {
    // Test from the lectures of EECS course 6.869 (Bill Freeman and Antonio Torralba),
    // Chapter 7.3.5, numerical example.

    withSpark { sc =>
      val vertices: RDD[(Long, PVertex)] = sc.parallelize(Seq(
        (1L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 1.0).map(math.log)))),
        (2L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 1.0).map(math.log)))),
        (3L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 1.0).map(math.log)))),
        (4L, PVertex(Variable(Array(0.0, 0.0)), Variable(Array(1.0, 0.0).map(math.log)))))
      )
      val edges = sc.parallelize(Seq(
        Edge(1L, 2L, PEdge(Factor(Array(2, 2), Array(1.0, 0.9, 0.9, 1.0).map(math.log)), Variable(Array(0.0, 0.0)), Variable(Array(0.0, 0.0)))),
        Edge(2L, 3L, PEdge(Factor(Array(2, 2), Array(0.1, 1.0, 1.0, 0.1).map(math.log)), Variable(Array(0.0, 0.0)), Variable(Array(0.0, 0.0)))),
        Edge(2L, 4L, PEdge(Factor(Array(2, 2), Array(1.0, 0.1, 0.1, 1.0).map(math.log)), Variable(Array(0.0, 0.0)), Variable(Array(0.0, 0.0))))
      ))
      val graph = Graph(vertices, edges)
      val bpGraph = PairwiseBP(graph)
      val trueProbabilities = Seq(
        1L -> (1.0 / 2.09 * 1.09, 1.0 / 2.09 * 1.0),
        2L -> (1.0 / 1.1 * 1.0, 1.0 / 1.1 * 0.1),
        3L -> (1.0 / 1.21 * 0.2, 1.0 / 1.21 * 1.01),
        4L -> (1.0, 0.0)).sortBy { case (vid, _) => vid }
      val calculatedProbabilities = bpGraph.vertices.collect().sortBy { case (vid, _) => vid }
      val eps = 10e-5
      calculatedProbabilities.zip(trueProbabilities).foreach {
        case ((_, vertex), (_, (trueP0, trueP1))) =>
          assert(trueP0 - vertex.belief.exp().cloneValues(0) < eps && trueP1 - vertex.belief.exp().cloneValues(1) < eps)
      }
    }

  }

  test("Pariwise BP test with file") {
    withSpark { sc =>
      val graph = PairwiseBP.loadPairwiseGraph(sc, "data/vertex4.txt", "data/edge4.txt")
      val bpGraph = PairwiseBP(graph)
      val trueProbabilities = Seq(
        1L -> (1.0 / 2.09 * 1.09, 1.0 / 2.09 * 1.0),
        2L -> (1.0 / 1.1 * 1.0, 1.0 / 1.1 * 0.1),
        3L -> (1.0 / 1.21 * 0.2, 1.0 / 1.21 * 1.01),
        4L -> (1.0, 0.0)).sortBy { case (vid, _) => vid }
      val calculatedProbabilities = bpGraph.vertices.collect().sortBy { case (vid, _) => vid }
      val eps = 10e-5
      calculatedProbabilities.zip(trueProbabilities).foreach {
        case ((_, vertex), (_, (trueP0, trueP1))) =>
          assert(trueP0 - vertex.belief.exp().cloneValues(0) < eps && trueP1 - vertex.belief.exp().cloneValues(1) < eps)
      }
    }
  }
} 
Example 10
Source File: EdgeProviders.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.loaders.csv.providers

import ml.sparkling.graph.loaders.csv.types.CSVTypes.EdgeAttributeExtractor
import ml.sparkling.graph.loaders.csv.types.Types.ToVertexId
import ml.sparkling.graph.loaders.csv.types.{CSVTypes, Types}
import ml.sparkling.graph.loaders.csv.utils.DefaultTransformers
import ml.sparkling.graph.loaders.csv.utils.DefaultTransformers.{defaultEdgeAttribute, numberToVertexId}
import org.apache.spark.graphx.Edge
import org.apache.spark.sql.Row

import scala.reflect.ClassTag


object EdgeProviders {

  type TwoColumnsMakeEdgeProvider[VD,ED]=(Int,Int,Row, ToVertexId[VD], EdgeAttributeExtractor[ED])=>Seq[Edge[ED]]

  def twoColumnsMakesEdge[VD:ClassTag,ED:ClassTag](id1:Int,
                          id2:Int,row:Row,
                          columnToId:ToVertexId[VD],
                          edgeAttributeProvider:EdgeAttributeExtractor[ED]):Seq[Edge[ED]]={
   Seq(Edge(columnToId(row.getAs(id1)),columnToId(row.getAs(id2)),edgeAttributeProvider(row)))
  }

  def twoColumnsMakesEdge[VD:ClassTag](id1:Int,
                                 id2:Int,
                                 row:Row):Seq[Edge[Double]]={
    twoColumnsMakesEdge(id1,id2,row,numberToVertexId _,defaultEdgeAttribute _)
  }

} 
Example 11
Source File: GraphProviders.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.loaders.csv.providers

import ml.sparkling.graph.loaders.csv.types.Types
import ml.sparkling.graph.loaders.csv.types.Types.ToVertexId
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.SparkSession
import scala.reflect.ClassTag


object GraphProviders {
  val defaultStorageLevel=StorageLevel.MEMORY_ONLY
  def simpleGraphBuilder[VD: ClassTag, ED: ClassTag](defaultVertex: Option[VD]=None,
                                                     vertexProvider: Row => Seq[(VertexId, VD)],
                                                     edgeProvider: Row => Seq[Edge[ED]],
                                                     edgeStorageLevel: StorageLevel = defaultStorageLevel,
                                                     vertexStorageLevel: StorageLevel =defaultStorageLevel)
                                                    (dataFrame: DataFrame): Graph[VD, ED] = {

    def mapRows[MT: ClassTag](mappingFunction: (Row) => Seq[MT]): RDD[MT] = {
      dataFrame.rdd.mapPartitionsWithIndex((id, rowIterator) => {
        rowIterator.flatMap { case row => mappingFunction(row) }
      })
    }

    val vertices: RDD[(VertexId, VD)] = mapRows(vertexProvider)
    val edges: RDD[Edge[ED]] = mapRows(edgeProvider)
    defaultVertex match{
      case None => Graph(vertices,edges,edgeStorageLevel=edgeStorageLevel,vertexStorageLevel=vertexStorageLevel)
      case Some(defaultVertexValue)=> Graph(vertices,edges,defaultVertexValue,edgeStorageLevel,vertexStorageLevel)
    }

  }

  def indexedGraphBuilder[VD:ClassTag, ED: ClassTag](defaultVertex: Option[VD]=None,
                                                      vertexProvider: (Row, ToVertexId[VD]) => Seq[(VertexId, VD)],
                                                      edgeProvider: (Row, ToVertexId[VD]) => Seq[Edge[ED]],
                                                      columnsToIndex: Seq[Int],
                                                      edgeStorageLevel: StorageLevel = defaultStorageLevel,
                                                      vertexStorageLevel: StorageLevel = defaultStorageLevel)
                                                     (dataFrame: DataFrame): Graph[VD, ED] = {
    val index = dataFrame.rdd.flatMap(row => columnsToIndex.map(row(_))).distinct().zipWithUniqueId().collect().toMap
    def extractIdFromIndex(vertex: VD) = index(vertex)
    simpleGraphBuilder(defaultVertex,
      vertexProvider(_: Row, extractIdFromIndex _),
      edgeProvider(_: Row, extractIdFromIndex _),
      edgeStorageLevel,
      vertexStorageLevel)(dataFrame)

  }
} 
Example 12
Source File: GraphMLLoader.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.loaders.graphml

import com.databricks.spark.xml._
import ml.sparkling.graph.loaders.graphml.GraphMLFormat._
import ml.sparkling.graph.loaders.graphml.GraphMLTypes.TypeHandler
import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext, SparkSession}

import scala.collection.mutable
import scala.util.Try


object GraphMLLoader {
  // Vertex and edge attributes are exposed as attribute-name -> value maps
  type ValuesMap = Map[String, Any]

  def loadGraphFromML(path: String)(implicit sc: SparkContext): Graph[ValuesMap, ValuesMap] = {
    val sparkSession=SparkSession.builder().getOrCreate();

    val graphDataFrame = sparkSession.sqlContext.read
      .format("com.databricks.spark.xml")
      .option("attributePrefix","@")
      .option("valueTag","#VALUE")
      .option("rowTag",graphTag).load(path).rdd

    val keys =sparkSession.sqlContext.read
      .format("com.databricks.spark.xml")
      .option("attributePrefix","@")
      .option("valueTag","#VALUE")
      .option("rowTag",graphMLTag).load(path).rdd
      .flatMap(r => Try(r.getAs[mutable.WrappedArray[Row]](keyTag).toArray).getOrElse(Array.empty))

    val nodesKeys = keys
      .filter(r => r.getAs[String](forAttribute) == nodeTag)
    val edgeKeys = keys
      .filter(r => r.getAs[String](forAttribute) == edgeTag)

    val nodeAttrHandlers = createAttrHandlersFor(nodesKeys)
    val edgeAttrHandlers = createAttrHandlersFor(edgeKeys)

    val verticesWithData = graphDataFrame.flatMap(r => r.getAs[Any](nodeTag) match {
      case data: mutable.WrappedArray[Row@unchecked] => data.array
      case data: Row => Array(data)
    })

    val verticesIndex = verticesWithData.map(r => r.getAs[String](idAttribute)).zipWithUniqueId().collect().toMap

    val vertices: RDD[(VertexId, Map[String, Any])] = verticesWithData
      .map(
        r => (verticesIndex(r.getAs[String](idAttribute)), extractAttributesMap(nodeAttrHandlers, r))
      )

    val edgesRows = graphDataFrame.flatMap(r => r.getAs[Any](edgeTag) match {
      case data: mutable.WrappedArray[Row@unchecked] => data.array
      case data: Row => Array(data)
    })
      .map(r => Edge(
        verticesIndex(r.getAs[String](sourceAttribute)),
        verticesIndex(r.getAs[String](targetAttribute)),
        extractAttributesMap(edgeAttrHandlers, r)
      ))
    Graph(vertices, edgesRows)
  }

  def extractAttributesMap(attrHandlers: Map[String, GraphMLAttribute], r: Row): Map[String, Any] = {
    Try(r.getAs[mutable.WrappedArray[Row]](dataTag)).toOption.map(
      _.map(r => {
        val attribute = attrHandlers(r.getAs[String](keyAttribute))
        (attribute.name, attribute.handler(r.getAs[String](tagValue)))
      }).toMap
    ).getOrElse(Map.empty) + ("id" -> r.getAs[String](idAttribute))
  }

  def createAttrHandlersFor(keys: RDD[Row]): Map[String, GraphMLAttribute] = {
    keys
      .map(r => (r.getAs[String](idAttribute), GraphMLAttribute(r.getAs[String](nameAttribute), GraphMLTypes(r.getAs[String](typeAttribute)))))
      .collect().toMap
  }
} 
Example 13
Source File: AffinityPropagationSuite.scala    From SparkAffinityPropagation   with MIT License
package org.viirya.spark.ml

import scala.collection.mutable

import org.scalatest.{BeforeAndAfterAll, FunSuite, Suite}

import org.viirya.spark.ml.AffinityPropagation._

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph}

class AffinityPropagationSuite extends FunSuite with BeforeAndAfterAll { self: Suite =>
  @transient var sc: SparkContext = _

  override def beforeAll() {
    super.beforeAll()
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("AffinityPropagationUnitTest")
    sc = new SparkContext(conf)
  }

  override def afterAll() {
    try {
      if (sc != null) {
        sc.stop()
      }
      sc = null
    } finally {
      super.afterAll()
    }
  }  

  test("affinity propagation") {
    
    val similarities = Seq[(Long, Long, Double)](
      (0, 1, 1.0), (1, 0, 1.0), (0, 2, 1.0), (2, 0, 1.0), (0, 3, 1.0), (3, 0, 1.0),
      (1, 2, 1.0), (2, 1, 1.0), (2, 3, 1.0), (3, 2, 1.0))
    val expected = Array(
      Array(0.0,     1.0/3.0, 1.0/3.0, 1.0/3.0),
      Array(1.0/2.0,     0.0, 1.0/2.0,     0.0),
      Array(1.0/3.0, 1.0/3.0,     0.0, 1.0/3.0),
      Array(1.0/2.0,     0.0, 1.0/2.0,     0.0))
    val s = constructGraph(sc.parallelize(similarities, 2), true, false)
    s.edges.collect().foreach { case Edge(i, j, x) =>
      assert(math.abs(x.similarity - expected(i.toInt)(j.toInt)) < 1e-14)
    }
  }
} 
Example 14
Source File: ZombieExample.scala    From spark_training   with Apache License 2.0
package com.malaska.spark.training.graph

import org.apache.log4j.{Level, Logger}
import org.apache.spark.graphx.{Edge, EdgeDirection, Graph, _}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession


object ZombieExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args:Array[String]): Unit = {
    val vertexJsonFile = args(0)
    val edgeJsonFile = args(1)

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host","127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }
    println("---")

    import sparkSession.implicits._

    val vectorDs = sparkSession.read.json(vertexJsonFile).as[JsonVertex]
    val edgeDs = sparkSession.read.json(edgeJsonFile).as[JsonEdge]

    val vectorRdd:RDD[(VertexId, ZombieStats)] = vectorDs.rdd.map(r => {
      (r.vertex_id.toLong, new ZombieStats(r.is_zombie.equals("yes"), r.time_alive))
    })

    val edgeRdd = edgeDs.rdd.map(r => {
      new Edge[String](r.src, r.dst, r.edge_type)
    })

    val defaultUser = new ZombieStats(false, 0)

    val graph = Graph(vectorRdd, edgeRdd, defaultUser)

    val zombieResults = graph.pregel[Long](0, 30, EdgeDirection.Either)(
      (vertexId, zombieState, message) => {
        if (message > 0 && !zombieState.isZombie) {
          new ZombieStats(true, message)
        } else {
          zombieState
        }
      }, triplet => {
        if (triplet.srcAttr.isZombie && !triplet.dstAttr.isZombie) {
          Iterator((triplet.dstId, triplet.srcAttr.lengthOfLife + 1l))
        } else if (triplet.dstAttr.isZombie && !triplet.srcAttr.isZombie) {
          Iterator((triplet.srcId, triplet.dstAttr.lengthOfLife + 1l))
        } else {
          Iterator.empty
        }
      }, (a, b) => Math.min(a, b))

    println("ZombieBite")
    zombieResults.vertices.collect().sortBy(r => r._1).foreach(r => {
      println("vertexId:" + r._1 + ",ZobmieStat:" + r._2)
    })

    sparkSession.stop()
  }
}

case class ZombieStats (isZombie:Boolean, lengthOfLife:Long) 
Example 15
Source File: L10-9Graph.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.Graph.graphToGraphOps
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.json4s.DefaultFormats
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

object UserRankApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: UserRankApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    ssc.socketTextStream(hostname, port.toInt)
      .map(r => {
        implicit val formats = DefaultFormats
        parse(r)
      })
      .foreachRDD(rdd => {
        val edges = rdd.map(jvalue => {
          implicit val formats = DefaultFormats
          ((jvalue \ "user_id").extract[String], (jvalue \ "friends").extract[Array[String]])
        })
          .flatMap(r => r._2.map(f => Edge(r._1.hashCode.toLong, f.hashCode.toLong, 1.0)))

        val vertices = rdd.map(jvalue => {
          implicit val formats = DefaultFormats
          ((jvalue \ "user_id").extract[String])
        })
          .map(r => (r.hashCode.toLong, r))

        val tolerance = 0.0001
        val graph = Graph(vertices, edges, "defaultUser")
          .subgraph(vpred = (id, idStr) => idStr != "defaultUser")
        val pr = graph.pageRank(tolerance).cache

        graph.outerJoinVertices(pr.vertices) {
          (userId, attrs, rank) => (rank.getOrElse(0.0).asInstanceOf[Number].doubleValue, attrs)
        }.vertices.top(10) {
          Ordering.by(_._2._1)
        }.foreach(rec => println("User id: %s, Rank: %f".format(rec._2._2, rec._2._1)))
      })

    ssc.start()
    ssc.awaitTermination()

  }

} 
Example 16
Source File: EdgeAPI.scala    From Hands-On-Big-Data-Analytics-with-PySpark   with MIT License
package com.tomekl007.chapter_7

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite

class EdgeAPI extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("Should use Edge API") {
    //given
    val users: RDD[(VertexId, (String))] =
      spark.parallelize(Array(
        (1L, "a"),
        (2L, "b"),
        (3L, "c"),
        (4L, "d")
      ))


    val relationships =
      spark.parallelize(Array(
        Edge(1L, 2L, "friend"),
        Edge(1L, 3L, "friend"),
        Edge(2L, 4L, "wife")
      ))

    val graph = Graph(users, relationships)

    //when
    val res = graph.mapEdges(e => e.attr.toUpperCase)

    println(res.edges.collect().toList)
  }

} 
Example 17
Source File: VertexAPI.scala    From Hands-On-Big-Data-Analytics-with-PySpark   with MIT License
package com.tomekl007.chapter_7

import org.apache.spark.SparkContext
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite

class VertexAPI extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("Should use Vertex API") {
    //given
    val users: RDD[(VertexId, (String))] =
      spark.parallelize(Array(
        (1L, "a"),
        (2L, "b"),
        (3L, "c"),
        (4L, "d")
      ))


    val relationships =
      spark.parallelize(Array(
        Edge(1L, 2L, "friend"),
        Edge(1L, 3L, "friend"),
        Edge(2L, 4L, "wife")
      ))

    val graph = Graph(users, relationships)

    //when
    val res = graph.mapVertices((_, att) => att.toUpperCase())
    res.vertices.collect().toList
  }

} 
Example 18
Source File: EmployeeRelationship.scala    From spark-dev   with GNU General Public License v3.0
package examples.graphx

import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.rdd.RDD
import org.apache.spark.graphx.{ Edge, Graph }


object EmployeeRelationship {
	def main(args: Array[String]): Unit = {
		// vertex format: vertex_id, data
		val vertexArray = Array(
			(1L, ("John", "Software Developer")),
			(2L, ("Robert", "Technical Leader")),
			(3L, ("Charlie", "Software Architect")),
			(4L, ("David", "Software Developer")),
			(5L, ("Edward", "Software Development Manager")),
			(6L, ("Francesca", "Software Development Manager")))

		// edge format: from_vertex_id, to_vertex_id, data
		val edgeArray = Array(
			Edge(2L, 1L, "Technical Mentor"),
			Edge(2L, 4L, "Technical Mentor"),
			Edge(3L, 2L, "Collaborator"),
			Edge(6L, 3L, "Team Member"),
			Edge(4L, 1L, "Peers"),
			Edge(5L, 2L, "Team Member"),
			Edge(5L, 3L, "Team Member"),
			Edge(5L, 6L, "Peers"))

		val sc = new SparkContext(new SparkConf().setAppName("EmployeeRelationshipJob"))

		val vertexRDD: RDD[(Long, (String, String))] = sc.parallelize(vertexArray)

		val edgeRDD: RDD[Edge[String]] = sc.parallelize(edgeArray)

		val graph: Graph[(String, String), String] = Graph(vertexRDD, edgeRDD)

		// Vanilla query
		println(">>> Showing the names of people who are Software Developers")
		graph.vertices.filter { case (id, (name, designation)) => designation.equals("Software Developer") }
			.collect()
			.foreach { case (id, (name, designation)) => println(s"... Name: $name, Designation: $designation") }

		// Connection analysis
		println(">>> People connected to Robert (Technical Leader) -> ")
		graph.triplets.filter(_.srcId == 2).collect()
			.foreach { item => println("... " + item.dstAttr._1 + ", " + item.dstAttr._2) }

		println(">>> Robert (Technical Leader) connected to -> ")
		graph.triplets.filter(_.dstId == 2).collect()
			.foreach { item => println("... " + item.srcAttr._1 + ", " + item.srcAttr._2) }

		println(">>> Technical Mentoring Analysis -> ")
		graph.triplets.filter(_.attr.equals("Technical Mentor")).collect()
			.foreach { item => println("... " + item.srcAttr._1 + " mentoring " + item.dstAttr._1) }
	}
} 
Example 19
Source File: LocalRunner.scala    From spark-betweenness   with Apache License 2.0
package com.centrality.kBC

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.VertexId
import org.apache.spark.rdd.RDD

object MainRunner 
{
  def main(args: Array[String])
  {
    // Create spark context
    val appName="kBC"
    val sparkMode="local"
    val conf = new SparkConf().setAppName(appName).setMaster(sparkMode)
    val sc = new SparkContext(conf)
    
    // Create sample graph
    //
    // Create an RDD for vertices
    val users: RDD[(VertexId, (String, String))] =
    sc.parallelize(Array((3L, ("rxin", "student")), (7L, ("jgonzal", "postdoc")),
                         (5L, ("franklin", "prof")), (2L, ("istoica", "prof"))))
    // Create an RDD for edges
    val relationships: RDD[Edge[String]] =
      sc.parallelize(Array(Edge(3L, 7L, "collab"),    Edge(5L, 3L, "advisor"),
                           Edge(2L, 5L, "colleague"), Edge(5L, 7L, "pi")))
    // Define a default user in case there are relationship with missing user
    val defaultUser = ("John Doe", "Missing")
    // Build the initial Graph
    val graph = Graph(users, relationships, defaultUser)
    
    val kBCGraph = 
      KBetweenness.run(graph, 3)
  }
} 
Example 20
package com.github.maxpumperla.ml_spark.graphs

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
//import org.graphframes._


object GraphFramesExample extends App {

    val conf = new SparkConf()
      .setAppName("RDD graph")
      .setMaster("local[4]")
    val sc = new SparkContext(conf)


    val vertices: RDD[(VertexId, String)] = sc.parallelize(
      Array((1L, "Anne"),
        (2L, "Bernie"),
        (3L, "Chris"),
        (4L, "Don"),
        (5L, "Edgar")))

    val edges: RDD[Edge[String]] = sc.parallelize(
      Array(Edge(1L, 2L, "likes"),
        Edge(2L, 3L, "trusts"),
        Edge(3L, 4L, "believes"),
        Edge(4L, 5L, "worships"),
        Edge(1L, 3L, "loves"),
        Edge(4L, 1L, "dislikes")))

    val friendGraph: Graph[String, String] = Graph(vertices, edges)

//    val friendGraphFrame = GraphFrame.fromGraphX(friendGraph)
//
//    friendGraphFrame.find("(v1)-[e1]->(v2); (v2)-[e2]->(v3)").filter(
//      "e1.attr = 'trusts' OR v3.attr = 'Chris'"
//    ).collect.foreach(println)

} 
Example 21
Source File: CCGraphXDriver.scala    From connected-component   with MIT License
package com.kwartile.lib.cc

import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.{SparkConf, SparkContext}

import scala.annotation.tailrec



object CCGraphXDriver {

  @tailrec
  private def buildEdges(node: Long, neighbors:List[Long], partialPairs: List[Edge[Int]]) : List[Edge[Int]] = {
    if (neighbors.length == 0) {
      if (partialPairs != null)
        List(Edge(node, node, 1)) ::: partialPairs
      else
        List(Edge(node, node, 1))
    } else if (neighbors.length == 1) {
      val neighbor = neighbors(0)
      if (node > neighbor)
        if (partialPairs != null) List(Edge(node, neighbor, 1)) ::: partialPairs else List(Edge(node, neighbor, 1))
      else
      if (partialPairs != null) List(Edge(neighbor, node, 1)) ::: partialPairs else List(Edge(neighbor, node, 1))
    } else {
      val newPartialPairs = neighbors.map(neighbor => {
        if (node > neighbor)
          List(Edge(node, neighbor, 1))
        else
          List(Edge(neighbor, node, 1))
      }).flatMap(x=>x)

      if (partialPairs != null)
        buildEdges(neighbors.head, neighbors.tail, newPartialPairs ::: partialPairs)
      else
        buildEdges(neighbors.head, neighbors.tail, newPartialPairs)
    }
  }

  private def buildEdges(nodes:List[Long]) :  List[Edge[Int]] = {
    buildEdges(nodes.head, nodes.tail, null.asInstanceOf[List[Edge[Int]]])
  }

  def main(args: Array[String]) = {
    val sparkConf = new SparkConf().setAppName("GraphXConnectedComponent")

    val sc = new SparkContext(sparkConf)

    val cliqueFile = args(0)
    val cliquesRec = sc.textFile(cliqueFile)
    val cliques = cliquesRec.map(x => {
      val nodes = x.split("\\s+").map(y => y.toLong).toList
      nodes
    })

    val edges = cliques.map(aClique => {
      buildEdges(aClique)
    }).flatMap(x=>x)

    val graph = Graph.fromEdges(edges, 1)
    val cc = graph.connectedComponents().vertices
    println ("Count of Connected component: " + cc.count)
  }
} 
Example 22
Source File: InputDataFlow.scala    From spark-graphx   with GNU General Public License v3.0
package com.github.graphx.pregel.social

import org.apache.spark.graphx.{Edge, VertexId}

import scala.collection.mutable.ListBuffer

object InputDataFlow {

  def parseNames(line: String): Option[(VertexId, String)] = {
    val fields = line.split('\t')
    if (fields.length > 1)
      Some((fields(0).trim().toLong, fields(1)))
    else None
  }

  def makeEdges(line: String): List[Edge[Int]] = {
    var edges = new ListBuffer[Edge[Int]]()
    val fields = line.split(" ")
    val origin = fields(0)
    (1 until fields.length)
      .foreach { p =>
        edges += Edge(origin.toLong, fields(p).toLong, 0)
      }
    edges.toList
  }

} 
Example 23
Source File: AbstractPipeClusteringGraph.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.clustering

import org.apache.spark.graphx.Edge
import org.apache.spark.graphx.Graph
import org.apache.spark.graphx.VertexId
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.similarity.aggregator.Mean
import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable


abstract class AbstractPipeClusteringGraph
  extends PipeElement[RDD[(SymPair[Tuple], Array[Double])], RDD[Set[Tuple]]]
  with Serializable {
  
  def cluster(graph: Graph[Tuple, Double]): RDD[Set[Tuple]]

  def step(input: RDD[(SymPair[Tuple], Array[Double])])(implicit pipeContext: AbstractPipeContext): RDD[Set[Tuple]] = {
    
    val duplicatePairsWithSimilarity = input.map(
      pair => (pair._1, Mean.agrSimilarity(pair._2))
    )
    
    val edges: RDD[Edge[Double]] = duplicatePairsWithSimilarity.map(
      pair => { Edge(pair._1._1.id, pair._1._2.id, pair._2) }
    )

    // TODO optimize: it would be nice to build the graph only by using edge triplets
    // but as far as I know that's not possible
    val verticesNotUnique: RDD[(VertexId, Tuple)] = duplicatePairsWithSimilarity.map(_._1).flatMap(
      tuplePair => Seq(tuplePair._1, tuplePair._2)
    ).map(tuple => (tuple.id, tuple))

    // delete all duplicate vertices
    val vertices = verticesNotUnique.distinct()

    // The default vertex attribute is null because every vertex is supplied explicitly above
    val graph: Graph[Tuple, Double] = Graph.apply(vertices, edges, null)
    
    cluster(graph)
  }

}