org.apache.spark.sql.types.DataTypes Scala Examples

The following examples show how to use org.apache.spark.sql.types.DataTypes. Each listing is taken from an open-source project; the source file, project, and license are noted above each example.
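DataTypes exposes singleton instances of the common Spark SQL types (StringType, IntegerType, DoubleType, and so on) together with factory methods such as createDecimalType, createArrayType, and createMapType for parameterized types. As a minimal, self-contained sketch (not drawn from any of the projects below; the field names are purely illustrative), a schema might be assembled like this:

import org.apache.spark.sql.types.{DataTypes, StructField, StructType}

// Build a schema from the DataTypes singletons and factory methods (illustrative field names).
val schema = StructType(Array(
  StructField("name", DataTypes.StringType, nullable = true),
  StructField("age", DataTypes.IntegerType, nullable = true),
  // decimal with precision 20, scale 2
  StructField("balance", DataTypes.createDecimalType(20, 2), nullable = true),
  // array of non-null strings
  StructField("tags", DataTypes.createArrayType(DataTypes.StringType, false), nullable = true)
))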
Example 1
Source File: TestIndexing.scala    From spark-solr   with Apache License 2.0
package com.lucidworks.spark

import java.util.UUID

import com.lucidworks.spark.util.SolrDataFrameImplicits._
import com.lucidworks.spark.util.{ConfigurationConstants, SolrCloudUtil, SolrQuerySupport, SolrSupport}
import org.apache.spark.sql.functions.{concat, lit}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}

class TestIndexing extends TestSuiteBuilder {

  test("Load csv file and index to Solr") {
    val collectionName = "testIndexing-" + UUID.randomUUID().toString
    SolrCloudUtil.buildCollection(zkHost, collectionName, null, 2, cloudClient, sc)
    try {
      val csvFileLocation = "src/test/resources/test-data/nyc_yellow_taxi_sample_1k.csv"
      val csvDF = sparkSession.read.format("com.databricks.spark.csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(csvFileLocation)
      assert(csvDF.count() == 999)

      val solrOpts = Map("zkhost" -> zkHost, "collection" -> collectionName)
      val newDF = csvDF
        .withColumn("pickup_location", concat(csvDF.col("pickup_latitude"), lit(","), csvDF.col("pickup_longitude")))
        .withColumn("dropoff_location", concat(csvDF.col("dropoff_latitude"), lit(","), csvDF.col("dropoff_longitude")))
      newDF.write.option("zkhost", zkHost).option(ConfigurationConstants.GENERATE_UNIQUE_KEY, "true").solr(collectionName)

      // Explicit commit to make sure all docs are visible
      val solrCloudClient = SolrSupport.getCachedCloudClient(zkHost)
      solrCloudClient.commit(collectionName, true, true)

      val solrDF = sparkSession.read.format("solr").options(solrOpts).load()
      solrDF.printSchema()
      assert (solrDF.count() == 999)
      solrDF.take(10)
    } finally {
      SolrCloudUtil.deleteCollection(collectionName, cluster)
    }
  }

  test("Solr field types config") {
    val collectionName = "testIndexing-" + UUID.randomUUID().toString
    SolrCloudUtil.buildCollection(zkHost, collectionName, null, 2, cloudClient, sc)
    try {
      val csvFileLocation = "src/test/resources/test-data/simple.csv"
      val csvDF = sparkSession.read.format("com.databricks.spark.csv")
          .option("header", "true")
          .option("inferSchema", "true")
          .load(csvFileLocation)
      val solrOpts = Map("zkhost" -> zkHost, "collection" -> collectionName, ConfigurationConstants.SOLR_FIELD_TYPES -> "ntitle:text_en,nrating:string")
      csvDF.write.options(solrOpts).solr(collectionName)

      // Explicit commit to make sure all docs are visible
      val solrCloudClient = SolrSupport.getCachedCloudClient(zkHost)
      solrCloudClient.commit(collectionName, true, true)

      val solrBaseUrl = SolrSupport.getSolrBaseUrl(zkHost)
      val solrUrl = solrBaseUrl + collectionName + "/"

      val fieldTypes = SolrQuerySupport.getFieldTypes(Set.empty, solrUrl, cloudClient, collectionName)
      assert(fieldTypes("nrating").fieldType === "string")
      assert(fieldTypes("ntitle").fieldType === "text_en")
    } finally {
      SolrCloudUtil.deleteCollection(collectionName, cluster)
    }
  }


  test("Field additions") {
    val insertSchema = StructType(Array(
      StructField("index_only_field", DataTypes.StringType, nullable = true),
      StructField("store_only_field", DataTypes.BooleanType, nullable = true),
      StructField("a_s", DataTypes.StringType, nullable = true),
      StructField("s_b", DataTypes.StringType, nullable = true)
    ))
    val collection = "testFieldAdditions" + UUID.randomUUID().toString.replace("-", "_")
    try {
      SolrCloudUtil.buildCollection(zkHost, collection, null, 2, cloudClient, sc)
      val opts = Map("zkhost" -> zkHost, "collection" -> collection)

      val solrRelation = new SolrRelation(opts, sparkSession)
      val fieldsToAdd = SolrRelation.getFieldsToAdd(insertSchema, solrRelation.conf, solrRelation.solrVersion, solrRelation.dynamicSuffixes)
      assert(fieldsToAdd.isEmpty)
    } finally {
      SolrCloudUtil.deleteCollection(collection, cluster)
    }
  }

} 
Example 2
Source File: Surrogate.scala    From automl   with Apache License 2.0
package com.tencent.angel.spark.automl.tuner.surrogate

import com.tencent.angel.spark.automl.tuner.config.ConfigurationSpace
import org.apache.commons.logging.{Log, LogFactory}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}

import scala.collection.mutable.ArrayBuffer


  // NOTE: excerpt only. The abstract class declaration and its training state
  // (preX, preY and the minimize flag referenced below) are elided in this
  // listing; only the prediction and best-configuration helpers are shown.
  def predict(X: Vector): (Double, Double)

  def stop(): Unit

  def curBest: (Vector, Double) = {
    if (minimize) curMin else curMax
  }

  def curMin: (Vector, Double) = {
    if (preY.isEmpty)
      (null, Double.MaxValue)
    else {
      val maxIdx: Int = preY.zipWithIndex.max._2
      (preX(maxIdx), -preY(maxIdx))
    }
  }

  def curMax: (Vector, Double) = {
    if (preY.isEmpty)
      (null, Double.MinValue)
    else {
      val maxIdx: Int = preY.zipWithIndex.max._2
      (preX(maxIdx), preY(maxIdx))
    }
  }
} 
Example 3
Source File: HousePriceDataBusinessLogic.scala    From bdd-spark   with MIT License
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.json4s._
import org.json4s.jackson.JsonMethods._

object HousePriceDataBusinessLogic {
  import Spark._

  def processHousePrices(housePrices : DataFrame, postcodes : DataFrame) : DataFrame = {
    housePrices.join(postcodes, "Postcode")
  }

  def processHousePricesAndSaveToParquet(housePrices : DataFrame, postcodes : DataFrame, parquetWriter: ParquetWriter) : Unit = {
    parquetWriter.write(housePrices.join(postcodes, "Postcode"), "results.parquet")
  }

  def processDataFromFilesFilterItThenSaveItToParquet(reader: FileReader,
                                                      geoFilename : String,
                                                      priceFilename: String,
                                                      postcodeFileName: String,
                                                      writer: ParquetWriter) : Unit = {
    val joined = loadAndJoin(reader, priceFilename, postcodeFileName)

    // If this was real code, a geoJSON library would be sensible here. Dirty code follows:
    val json = parse(reader.readText(geoFilename)) \\ "coordinates"
    val coords = json match {
      case JArray(outer) => outer.map{ case JArray(inner) => inner }
    }
    val points =
      coords
        .map(c => (c(0), c(1)))
        .map{ case (JDouble(long), JDouble(lat)) => (long, lat) }

    val minLat = Math.min(points(0)._2, points(1)._2)
    val maxLat = Math.max(points(0)._2, points(1)._2)
    val minLong = Math.min(points(0)._1, points(1)._1)
    val maxLong = Math.max(points(0)._1, points(1)._1)

    val filtered = joined
      .filter(s"Latitude >= $minLat and Latitude <= $maxLat")
      .filter(s"Longitude >= $minLong and Longitude <= $maxLong")

    writer.write(filtered, "results.parquet")
  }

  def processDataFromFilesAndSaveToParquet(reader: FileReader, priceFilename: String, postcodeFileName: String, writer: ParquetWriter) : Unit = {
    val joined = loadAndJoin(reader, priceFilename, postcodeFileName)

    writer.write(joined, "results.parquet")
  }

  private def loadAndJoin(reader: FileReader, priceFilename: String, postcodeFileName: String): DataFrame = {
    val priceSchema = StructType(Seq(
      StructField("Price", DataTypes.IntegerType),
      StructField("Postcode", DataTypes.StringType),
      StructField("HouseType", DataTypes.StringType)
    ))

    val prices = reader
      .readLinesToRdd(priceFilename)
      .map(_.split(','))
      .map(row => row.map(_.trim()))
      .map(splits => Row(splits(0).toInt, splits(1), splits(2)))

    val priceDf = spark.createDataFrame(prices, priceSchema)

    val postcodeSchema = StructType(Seq(
      StructField("Postcode", DataTypes.StringType),
      StructField("Latitude", DataTypes.DoubleType),
      StructField("Longitude", DataTypes.DoubleType)
    ))

    val postcodes = reader
      .readLinesToRdd(postcodeFileName)
      .map(_.split(','))
      .map(row => row.map(_.trim()))
      .map(splits => Row(splits(0), splits(1).toDouble, splits(2).toDouble))

    val postcodeDf = spark.createDataFrame(postcodes, postcodeSchema)

    val joined = priceDf.join(postcodeDf, "Postcode")
    joined
  }
} 
Example 4
Source File: DataTypeMapping.scala    From azure-kusto-spark   with Apache License 2.0
package com.microsoft.kusto.spark.utils

import org.apache.spark.sql.types.DataTypes._
import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, MapType, StructType}

object DataTypeMapping {

  val kustoTypeToSparkTypeMap: Map[String, DataType] = Map(
    "string" -> StringType,
    "long" -> LongType,
    "datetime" -> TimestampType,// Kusto datetime is equivalent to TimestampType
    "timespan" -> StringType,
    "bool" -> BooleanType,
    "real" -> DoubleType,
    // Precision and scale can be partitioned differently, but their total must be 34 to match .NET SqlDecimal
    "decimal" -> DataTypes.createDecimalType(20, 14),
    "guid" -> StringType,
    "int" -> IntegerType,
    "dynamic" -> StringType
  )

  val kustoJavaTypeToSparkTypeMap: Map[String, DataType] = Map(
    "string" -> StringType,
    "int64" -> LongType,
    "datetime" -> TimestampType,
    "timespan" -> StringType,
    "sbyte" -> BooleanType,
    "double" -> DoubleType,
    "sqldecimal" -> DataTypes.createDecimalType(20,14),
    "guid" -> StringType,
    "int32" -> IntegerType,
    "object" -> StringType
  )

  val sparkTypeToKustoTypeMap: Map[DataType, String] = Map(
    StringType ->  "string",
    BooleanType -> "bool",
    DateType -> "datetime",
    TimestampType -> "datetime",
    DataTypes.createDecimalType() -> "decimal",
    DoubleType -> "real",
    FloatType -> "real",
    ByteType -> "int",
    IntegerType -> "int",
    LongType ->  "long",
    ShortType ->  "int"
  )

  def getSparkTypeToKustoTypeMap(fieldType: DataType): String = {
    if (fieldType.isInstanceOf[DecimalType]) "decimal"
    else if (fieldType.isInstanceOf[ArrayType] || fieldType.isInstanceOf[StructType] || fieldType.isInstanceOf[MapType]) "dynamic"
    else DataTypeMapping.sparkTypeToKustoTypeMap.getOrElse(fieldType, "string")
  }
} 
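As a hedged usage sketch (the schema and field names below are illustrative, not part of the project), getSparkTypeToKustoTypeMap resolves each Spark field type to a Kusto column type, falling back to string for anything unmapped:

import org.apache.spark.sql.types.{DataTypes, StructField, StructType}

// Illustrative schema covering the three branches of the mapping.
val schema = StructType(Seq(
  StructField("price", DataTypes.createDecimalType(10, 2)),              // -> decimal
  StructField("tags", DataTypes.createArrayType(DataTypes.StringType)),  // -> dynamic
  StructField("name", DataTypes.StringType)                              // -> string
))

schema.fields.foreach { f =>
  println(s"${f.name} -> ${DataTypeMapping.getSparkTypeToKustoTypeMap(f.dataType)}")
}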
Example 5
Source File: ParallelPersonalizedPageRankSuite.scala    From graphframes   with Apache License 2.0
package org.graphframes.lib

import com.github.zafarkhaja.semver.Version

import org.apache.spark.ml.linalg.{SQLDataTypes, SparseVector}
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.DataTypes

import org.graphframes.examples.Graphs
import org.graphframes.{GraphFrameTestSparkContext, SparkFunSuite, TestUtils}

class ParallelPersonalizedPageRankSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  val n = 100

  test("Illegal function call argument setting") {
    val g = Graphs.star(n)
    val vertexIds: Array[Any] = Array(1L, 2L, 3L)

    // Not providing number of iterations
    intercept[IllegalArgumentException] {
      g.parallelPersonalizedPageRank.sourceIds(vertexIds).run()
    }

    // Not providing sourceIds
    intercept[IllegalArgumentException] {
      g.parallelPersonalizedPageRank.maxIter(15).run()
    }

    // Provided empty sourceIds
    intercept[IllegalArgumentException] {
      g.parallelPersonalizedPageRank.maxIter(15).sourceIds(Array()).run()
    }
  }

  test("Star example parallel personalized PageRank") {
    val g = Graphs.star(n)
    val resetProb = 0.15
    val maxIter = 10
    val vertexIds: Array[Any] = Array(1L, 2L, 3L)

    lazy val prc = g.parallelPersonalizedPageRank
      .maxIter(maxIter)
      .sourceIds(vertexIds)
      .resetProbability(resetProb)

    val pr = prc.run()
    TestUtils.testSchemaInvariants(g, pr)
    TestUtils.checkColumnType(pr.vertices.schema, "pageranks", SQLDataTypes.VectorType)
    TestUtils.checkColumnType(pr.edges.schema, "weight", DataTypes.DoubleType)
  }

  // In Spark <2.4, sourceIds must be smaller than Int.MaxValue,
  // which might not be the case for LONG_ID in graph.indexedVertices.
  if (Version.valueOf(org.apache.spark.SPARK_VERSION)
    .greaterThanOrEqualTo(Version.valueOf("2.4.0"))) {
    test("friends graph with parallel personalized PageRank") {
      val g = Graphs.friends
      val resetProb = 0.15
      val maxIter = 10
      val vertexIds: Array[Any] = Array("a")
      lazy val prc = g.parallelPersonalizedPageRank
        .maxIter(maxIter)
        .sourceIds(vertexIds)
        .resetProbability(resetProb)

      val pr = prc.run()
      val prInvalid = pr.vertices
        .select("pageranks")
        .collect()
        .filter { row: Row =>
          vertexIds.size != row.getAs[SparseVector](0).size
        }
      assert(prInvalid.size === 0,
        s"found ${prInvalid.size} entries with invalid number of returned personalized pagerank vector")

      val gRank = pr.vertices
        .filter(col("id") === "g")
        .select("pageranks")
        .first().getAs[SparseVector](0)
      assert(gRank.numNonzeros === 0,
        s"User g (Gabby) doesn't connect with a. So its pagerank should be 0 but we got ${gRank.numNonzeros}.")
    }
  }
} 
Example 6
Source File: PageRankSuite.scala    From graphframes   with Apache License 2.0
package org.graphframes.lib

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.DataTypes

import org.graphframes.examples.Graphs
import org.graphframes.{GraphFrameTestSparkContext, SparkFunSuite, TestUtils}

class PageRankSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  val n = 100

  test("Star example") {
    val g = Graphs.star(n)
    val resetProb = 0.15
    val errorTol = 1.0e-5
    val pr = g.pageRank
      .resetProbability(resetProb)
      .tol(errorTol).run()
    TestUtils.testSchemaInvariants(g, pr)
    TestUtils.checkColumnType(pr.vertices.schema, "pagerank", DataTypes.DoubleType)
    TestUtils.checkColumnType(pr.edges.schema, "weight", DataTypes.DoubleType)
  }

  test("friends graph with personalized PageRank") {
    val results = Graphs.friends.pageRank.resetProbability(0.15).maxIter(10).sourceId("a").run()

    val gRank = results.vertices.filter(col("id") === "g").select("pagerank").first().getDouble(0)
    assert(gRank === 0.0,
      s"User g (Gabby) doesn't connect with a. So its pagerank should be 0 but we got $gRank.")
  }
} 
Example 7
Source File: SVDPlusPlusSuite.scala    From graphframes   with Apache License 2.0
package org.graphframes.lib

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.DataTypes

import org.graphframes.{GraphFrame, GraphFrameTestSparkContext, SparkFunSuite, TestUtils}
import org.graphframes.examples.Graphs


class SVDPlusPlusSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  test("Test SVD++ with mean square error on training set") {
    val svdppErr = 8.0
    val g = Graphs.ALSSyntheticData()

    val v2 = g.svdPlusPlus.maxIter(2).run()
    TestUtils.testSchemaInvariants(g, v2)
    Seq(SVDPlusPlus.COLUMN1, SVDPlusPlus.COLUMN2).foreach { case c =>
      TestUtils.checkColumnType(v2.schema, c,
        DataTypes.createArrayType(DataTypes.DoubleType, false))
    }
    Seq(SVDPlusPlus.COLUMN3, SVDPlusPlus.COLUMN4).foreach { case c =>
      TestUtils.checkColumnType(v2.schema, c, DataTypes.DoubleType)
    }
    val err = v2.select(GraphFrame.ID, SVDPlusPlus.COLUMN4).rdd.map {
      case Row(vid: Long, vd: Double) =>
        if (vid % 2 == 1) vd else 0.0
    }.reduce(_ + _) / g.edges.count()
    assert(err <= svdppErr)
  }
} 
Example 8
Source File: TriangleCountSuite.scala    From graphframes   with Apache License 2.0
package org.graphframes.lib

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.DataTypes

import org.graphframes.{GraphFrameTestSparkContext, GraphFrame, SparkFunSuite, TestUtils}

class TriangleCountSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  test("Count a single triangle") {
    val edges = sqlContext.createDataFrame(Array(0L -> 1L, 1L -> 2L, 2L -> 0L)).toDF("src", "dst")
    val vertices = sqlContext.createDataFrame(Seq((0L, "a"), (1L, "b"), (2L, "c")))
      .toDF("id", "a")
    val g = GraphFrame(vertices, edges)
    val v2 = g.triangleCount.run()
    TestUtils.testSchemaInvariants(g, v2)
    TestUtils.checkColumnType(v2.schema, "count", DataTypes.LongType)
    v2.select("id", "count", "a")
      .collect().foreach { case Row(vid: Long, count: Long, _) => assert(count === 1) }
  }

  test("Count two triangles") {
    val edges = sqlContext.createDataFrame(Array(0L -> 1L, 1L -> 2L, 2L -> 0L) ++
      Array(0L -> -1L, -1L -> -2L, -2L -> 0L)).toDF("src", "dst")
    val g = GraphFrame.fromEdges(edges)
    val v2 = g.triangleCount.run()
    v2.select("id", "count").collect().foreach { case Row(id: Long, count: Long) =>
      if (id == 0) {
        assert(count === 2)
      } else {
        assert(count === 1)
      }
    }
  }

  test("Count one triangles with bi-directed edges") {
    // Note: This is different from GraphX, which double-counts triangles with bidirected edges.
    val triangles = Array(0L -> 1L, 1L -> 2L, 2L -> 0L) ++ Array(0L -> -1L, -1L -> -2L, -2L -> 0L)
    val revTriangles = triangles.map { case (a, b) => (b, a) }
    val edges = sqlContext.createDataFrame(triangles ++ revTriangles).toDF("src", "dst")
    val g = GraphFrame.fromEdges(edges)
    val v2 = g.triangleCount.run()
    v2.select("id", "count").collect().foreach { case Row(id: Long, count: Long) =>
      if (id == 0) {
        assert(count === 2)
      } else {
        assert(count === 1)
      }
    }
  }

  test("Count a single triangle with duplicate edges") {
    val edges = sqlContext.createDataFrame(Array(0L -> 1L, 1L -> 2L, 2L -> 0L) ++
        Array(0L -> 1L, 1L -> 2L, 2L -> 0L)).toDF("src", "dst")
    val g = GraphFrame.fromEdges(edges)
    val v2 = g.triangleCount.run()
    v2.select("id", "count").collect().foreach { case Row(id: Long, count: Long) =>
      assert(count === 1)
    }
  }

  test("no triangle") {
    val edges = sqlContext.createDataFrame(Array(0L -> 1L, 1L -> 2L)).toDF("src", "dst")
    val g = GraphFrame.fromEdges(edges)
    val v2 = g.triangleCount.run()
    v2.select("count").collect().foreach { case Row(count: Long) =>
      assert(count === 0)
    }
  }
} 
Example 9
Source File: LabelPropagationSuite.scala    From graphframes   with Apache License 2.0
package org.graphframes.lib

import org.apache.spark.sql.types.DataTypes

import org.graphframes.{GraphFrameTestSparkContext, SparkFunSuite, TestUtils}
import org.graphframes.examples.Graphs

class LabelPropagationSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  val n = 5

  test("Toy example") {
    val g = Graphs.twoBlobs(n)
    val labels = g.labelPropagation.maxIter(4 * n).run()
    TestUtils.testSchemaInvariants(g, labels)
    TestUtils.checkColumnType(labels.schema, "label", DataTypes.LongType)
    val clique1 =
      labels.filter(s"id < $n").select("label").collect().toSeq.map(_.getLong(0)).toSet
    assert(clique1.size === 1)
    val clique2 =
      labels.filter(s"id >= $n").select("label").collect().toSeq.map(_.getLong(0)).toSet
    assert(clique2.size === 1)
    assert(clique1 !== clique2)
  }
} 
Example 10
Source File: ShortestPathsSuite.scala    From graphframes   with Apache License 2.0
package org.graphframes.lib

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.DataTypes

import org.graphframes._

class ShortestPathsSuite extends SparkFunSuite with GraphFrameTestSparkContext {

  test("Simple test") {
    val edgeSeq = Seq((1, 2), (1, 5), (2, 3), (2, 5), (3, 4), (4, 5), (4, 6)).flatMap {
      case e => Seq(e, e.swap)
    } .map { case (src, dst) => (src.toLong, dst.toLong) }
    val edges = sqlContext.createDataFrame(edgeSeq).toDF("src", "dst")
    val graph = GraphFrame.fromEdges(edges)

    // Ground truth
    val shortestPaths = Set(
      (1, Map(1 -> 0, 4 -> 2)), (2, Map(1 -> 1, 4 -> 2)), (3, Map(1 -> 2, 4 -> 1)),
      (4, Map(1 -> 2, 4 -> 0)), (5, Map(1 -> 1, 4 -> 1)), (6, Map(1 -> 3, 4 -> 1)))

    val landmarks = Seq(1, 4).map(_.toLong)
    val v2 = graph.shortestPaths.landmarks(landmarks).run()

    TestUtils.testSchemaInvariants(graph, v2)
    TestUtils.checkColumnType(v2.schema, "distances",
      DataTypes.createMapType(v2.schema("id").dataType, DataTypes.IntegerType, false))
    val newVs = v2.select("id", "distances").collect().toSeq
    val results = newVs.map {
      case Row(id: Long, spMap: Map[Long, Int] @unchecked) => (id, spMap)
    }
    assert(results.toSet === shortestPaths)
  }

  test("friends graph") {
    val friends = examples.Graphs.friends
    val v = friends.shortestPaths.landmarks(Seq("a", "d")).run()
    val expected = Set[(String, Map[String, Int])](("a", Map("a" -> 0, "d" -> 2)), ("b", Map.empty),
      ("c", Map.empty), ("d", Map("a" -> 1, "d" -> 0)), ("e", Map("a" -> 2, "d" -> 1)),
      ("f", Map.empty), ("g", Map.empty))
    val results = v.select("id", "distances").collect().map {
      case Row(id: String, spMap: Map[String, Int] @unchecked) =>
        (id, spMap)
    }.toSet
    assert(results === expected)
  }

} 
Example 11
Source File: StronglyConnectedComponentsSuite.scala    From graphframes   with Apache License 2.0
package org.graphframes.lib

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.DataTypes

import org.graphframes.{GraphFrameTestSparkContext, GraphFrame, SparkFunSuite, TestUtils}

class StronglyConnectedComponentsSuite extends SparkFunSuite with GraphFrameTestSparkContext {
  test("Island Strongly Connected Components") {
    val vertices = sqlContext.createDataFrame(Seq(
      (1L, "a"),
      (2L, "b"),
      (3L, "c"),
      (4L, "d"),
      (5L, "e"))).toDF("id", "value")
    val edges = sqlContext.createDataFrame(Seq.empty[(Long, Long)]).toDF("src", "dst")
    val graph = GraphFrame(vertices, edges)
    val c = graph.stronglyConnectedComponents.maxIter(5).run()
    TestUtils.testSchemaInvariants(graph, c)
    TestUtils.checkColumnType(c.schema, "component", DataTypes.LongType)
    for (Row(id: Long, component: Long, _)
         <- c.select("id", "component", "value").collect()) {
      assert(id === component)
    }
  }
} 
Example 12
Source File: UnaryTransformerExample.scala    From Spark-2.3.1   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.DoubleParam
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{DataType, DataTypes}
import org.apache.spark.util.Utils
// $example off$


  // NOTE: excerpt only. The enclosing example object and the MyTransformer class
  // (a UnaryTransformer that adds a configurable shift to a Double input column)
  // are elided in this listing.
  object MyTransformer extends DefaultParamsReadable[MyTransformer]
  // $example off$

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder()
      .appName("UnaryTransformerExample")
      .getOrCreate()

    // $example on$
    val myTransformer = new MyTransformer()
      .setShift(0.5)
      .setInputCol("input")
      .setOutputCol("output")

    // Create data, transform, and display it.
    val data = spark.range(0, 5).toDF("input")
      .select(col("input").cast("double").as("input"))
    val result = myTransformer.transform(data)
    println("Transformed by adding constant value")
    result.show()

    // Save and load the Transformer.
    val tmpDir = Utils.createTempDir()
    val dirName = tmpDir.getCanonicalPath
    myTransformer.write.overwrite().save(dirName)
    val sameTransformer = MyTransformer.load(dirName)

    // Transform the data to show the results are identical.
    println("Same transform applied from loaded model")
    val sameResult = sameTransformer.transform(data)
    sameResult.show()

    Utils.deleteRecursively(tmpDir)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
Example 13
Source File: Util.scala    From mimir   with Apache License 2.0
package mimir.exec.spark.datasource.google.spreadsheet

import com.google.api.services.sheets.v4.model.{ExtendedValue, CellData, RowData}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DataTypes, StructType}

import scala.collection.JavaConverters._

object Util {
  def convert(schema: StructType, row: Row): Map[String, Object] =
    schema.iterator.zipWithIndex.map { case (f, i) => f.name -> row(i).asInstanceOf[AnyRef]}.toMap

  def toRowData(row: Row): RowData =
      new RowData().setValues(
        row.schema.fields.zipWithIndex.map { case (f, i) =>
          new CellData()
            .setUserEnteredValue(
              f.dataType match {
                case DataTypes.StringType => new ExtendedValue().setStringValue(row.getString(i))
                case DataTypes.LongType => new ExtendedValue().setNumberValue(row.getLong(i).toDouble)
                case DataTypes.IntegerType => new ExtendedValue().setNumberValue(row.getInt(i).toDouble)
                case DataTypes.FloatType => new ExtendedValue().setNumberValue(row.getFloat(i).toDouble)
                case DataTypes.BooleanType => new ExtendedValue().setBoolValue(row.getBoolean(i))
                case DataTypes.DateType => new ExtendedValue().setStringValue(row.getDate(i).toString)
                case DataTypes.ShortType => new ExtendedValue().setNumberValue(row.getShort(i).toDouble)
                case DataTypes.TimestampType => new ExtendedValue().setStringValue(row.getTimestamp(i).toString)
                case DataTypes.DoubleType => new ExtendedValue().setNumberValue(row.getDouble(i))
              }
            )
        }.toList.asJava
      )

} 
Example 14
Source File: LOFSuite.scala    From spark-lof   with Apache License 2.0
package org.apache.spark.ml.outlier

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.apache.spark.sql.functions._

object LOFSuite {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("LOFExample")
      .master("local[4]")
      .getOrCreate()

    val schema = new StructType(Array(
      new StructField("col1", DataTypes.DoubleType),
      new StructField("col2", DataTypes.DoubleType)))
    val df = spark.read.schema(schema).csv("data/outlier.csv")

    val assembler = new VectorAssembler()
      .setInputCols(df.columns)
      .setOutputCol("features")
    val data = assembler.transform(df).repartition(4)

    val startTime = System.currentTimeMillis()
    val result = new LOF()
      .setMinPts(5)
      .transform(data)
    val endTime = System.currentTimeMillis()
    result.count()

    // Outliers have much higher LOF value than normal data
    result.sort(desc(LOF.lof)).head(10).foreach { row =>
      println(row.get(0) + " | " + row.get(1) + " | " + row.get(2))
    }
    println("Total time = " + (endTime - startTime) / 1000.0 + "s")
  }
} 
Example 15
Source File: Util.scala    From spark-google-spreadsheets   with Apache License 2.0
package com.github.potix2.spark.google.spreadsheets

import com.google.api.services.sheets.v4.model.{ExtendedValue, CellData, RowData}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DataTypes, StructType}

import scala.collection.JavaConverters._

object Util {
  def convert(schema: StructType, row: Row): Map[String, Object] =
    schema.iterator.zipWithIndex.map { case (f, i) => f.name -> row(i).asInstanceOf[AnyRef]}.toMap

  def toRowData(row: Row): RowData =
      new RowData().setValues(
        row.schema.fields.zipWithIndex.map { case (f, i) =>
          new CellData()
            .setUserEnteredValue(
              f.dataType match {
                case DataTypes.StringType => new ExtendedValue().setStringValue(row.getString(i))
                case DataTypes.LongType => new ExtendedValue().setNumberValue(row.getLong(i).toDouble)
                case DataTypes.IntegerType => new ExtendedValue().setNumberValue(row.getInt(i).toDouble)
                case DataTypes.FloatType => new ExtendedValue().setNumberValue(row.getFloat(i).toDouble)
                case DataTypes.BooleanType => new ExtendedValue().setBoolValue(row.getBoolean(i))
                case DataTypes.DateType => new ExtendedValue().setStringValue(row.getDate(i).toString)
                case DataTypes.ShortType => new ExtendedValue().setNumberValue(row.getShort(i).toDouble)
                case DataTypes.TimestampType => new ExtendedValue().setStringValue(row.getTimestamp(i).toString)
                case DataTypes.DoubleType => new ExtendedValue().setNumberValue(row.getDouble(i))
              }
            )
        }.toList.asJava
      )

} 
Example 16
Source File: SparkSpreadsheetServiceWriteSuite.scala    From spark-google-spreadsheets   with Apache License 2.0
package com.github.potix2.spark.google.spreadsheets

import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService.SparkSpreadsheet
import com.google.api.services.sheets.v4.model.{ExtendedValue, CellData, RowData}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.scalatest.{BeforeAndAfter, FlatSpec}

import scala.collection.JavaConverters._

class SparkSpreadsheetServiceWriteSuite extends FlatSpec with BeforeAndAfter {
  private val serviceAccountId = "53797494708-ds5v22b6cbpchrv2qih1vg8kru098k9i@developer.gserviceaccount.com"
  private val testCredentialPath = "src/test/resources/spark-google-spreadsheets-test-eb7b191d1e1d.p12"
  private val TEST_SPREADSHEET_NAME = "WriteSuite"
  private val TEST_SPREADSHEET_ID = "163Ja2OWUephWjIa-jpwTlvGcg8EJwCFCfxrF7aI117s"

  private val context: SparkSpreadsheetService.SparkSpreadsheetContext =
    SparkSpreadsheetService.SparkSpreadsheetContext(Some(serviceAccountId), new java.io.File(testCredentialPath))

  var spreadsheet: SparkSpreadsheet = null
  var worksheetName: String = ""

  def definedSchema: StructType = {
    new StructType()
      .add(new StructField("col_1", DataTypes.StringType))
      .add(new StructField("col_2", DataTypes.LongType))
      .add(new StructField("col_3", DataTypes.StringType))
  }

  case class Elem(col_1: String, col_2: Long, col_3: String)

  def extractor(e: Elem): RowData =
    new RowData().setValues(
      List(
        new CellData().setUserEnteredValue(
          new ExtendedValue().setStringValue(e.col_1)
        ),
        new CellData().setUserEnteredValue(
          new ExtendedValue().setNumberValue(e.col_2.toDouble)
        ),
        new CellData().setUserEnteredValue(
          new ExtendedValue().setStringValue(e.col_3)
        )
      ).asJava
    )

  before {
    spreadsheet = context.findSpreadsheet(TEST_SPREADSHEET_ID)
    worksheetName = scala.util.Random.alphanumeric.take(16).mkString
    val data = List(
      Elem("a", 1L, "x"),
      Elem("b", 2L, "y"),
      Elem("c", 3L, "z")
    )

    spreadsheet.addWorksheet(worksheetName, definedSchema, data, extractor)
  }

  after {
    spreadsheet.deleteWorksheet(worksheetName)
  }

  behavior of "A Spreadsheet"
  it should "find the new worksheet" in {
    val newWorksheet = spreadsheet.findWorksheet(worksheetName)
    assert(newWorksheet.isDefined)
    assert(newWorksheet.get.name == worksheetName)
    assert(newWorksheet.get.headers == Seq("col_1", "col_2", "col_3"))

    val rows = newWorksheet.get.rows
    assert(rows.head == Map("col_1" -> "a", "col_2" -> "1", "col_3" -> "x"))
  }

  behavior of "SparkWorksheet#updateCells"
  it should "update values in a worksheet" in {
    val newWorksheet = spreadsheet.findWorksheet(worksheetName)
    assert(newWorksheet.isDefined)

    val newData = List(
      Elem("f", 5L, "yy"),
      Elem("e", 4L, "xx"),
      Elem("c", 3L, "z"),
      Elem("b", 2L, "y"),
      Elem("a", 1L, "x")
    )

    newWorksheet.get.updateCells(definedSchema, newData, extractor)

    val rows = newWorksheet.get.rows
    assert(rows.head == Map("col_1" -> "f", "col_2" -> "5", "col_3" -> "yy"))
    assert(rows.last == Map("col_1" -> "a", "col_2" -> "1", "col_3" -> "x"))
  }
}