org.apache.spark.sql.DataFrameReader Scala Examples

The following examples show how to use org.apache.spark.sql.DataFrameReader. Each example is taken from an open-source project; the source file, project, and license are noted above the code.
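Most of the examples below follow the same underlying pattern: configure a DataFrameReader with format, option/options, and optionally schema, then call load() to obtain a DataFrame. A minimal sketch of that pattern, assuming a local SparkSession and a hypothetical CSV path:

import org.apache.spark.sql.{DataFrame, SparkSession}

val spark = SparkSession.builder()
  .appName("DataFrameReaderDemo")
  .master("local[*]")
  .getOrCreate()

// Configure the reader, then call load() to produce a DataFrame.
val df: DataFrame = spark.read
  .format("csv")                    // data source name (built-in or third-party)
  .option("header", "true")         // source-specific options are plain string key/value pairs
  .option("inferSchema", "true")
  .load("/path/to/data.csv")        // hypothetical path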
Example 1
Source File: SolrDataFrameImplicits.scala    From spark-solr   with Apache License 2.0
package com.lucidworks.spark.util

import org.apache.spark.sql.{DataFrameReader, DataFrameWriter, Row, SaveMode}


object SolrDataFrameImplicits {

  implicit class SolrReader(reader: DataFrameReader) {
    def solr(collection: String, query: String = "*:*") =
      reader.format("solr").option("collection", collection).option("query", query).load()
    def solr(collection: String, options: Map[String, String]) =
      reader.format("solr").option("collection", collection).options(options).load()
  }

  implicit class SolrWriter(writer: DataFrameWriter[Row]) {
    def solr(collectionName: String, softCommitSecs: Int = 10, overwrite: Boolean = false, format: String = "solr") = {
      writer
        .format(format)
        .option("collection", collectionName)
        .option("soft_commit_secs", softCommitSecs.toString)
        .mode(if(overwrite) SaveMode.Overwrite else SaveMode.Append)
        .save()
    }
  }
} 
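With these implicits in scope, Solr reads and writes become one-liners on the standard reader and writer. A usage sketch, assuming spark-solr's zkhost option points at your Solr cloud and the collection name is a placeholder:

import com.lucidworks.spark.util.SolrDataFrameImplicits._

// Read the whole collection (the query defaults to "*:*")
val solrDf = spark.read.option("zkhost", "localhost:9983").solr("my-collection")

// Write it back, overwriting the collection contents
solrDf.write.option("zkhost", "localhost:9983").solr("my-collection", overwrite = true)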
Example 2
Source File: DataFrameReaderFunctions.scala    From spark-bigquery   with Apache License 2.0
package com.samelamin.spark.bigquery

import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, DataFrameReader}


class DataFrameReaderFunctions(@transient val dfr: DataFrameReader) extends Serializable {

  // `source` names the BigQuery data source format; it is defined elsewhere in this class.
  private def buildFrame(options: Map[String, String] = null, schema: StructType = null): DataFrame = {
    val builder = dfr
      .format(source)
      .schema(schema)

    if (options != null) {
      builder.options(options)
    }

    builder.load()
  }
} 
Example 3
Source File: package.scala    From spark-dynamodb   with Apache License 2.0
package com.github.traviscrawford.spark

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.DataFrameReader

package object dynamodb {

  implicit class DynamoDBDataFrameReader(reader: DataFrameReader) {

    
    def dynamodb(region: String, table: String): DataFrame =
      reader
        .format("com.github.traviscrawford.spark.dynamodb")
        .option("region", region)
        .option("table", table)
        .load
  }
} 
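A usage sketch for the implicit above, assuming a SparkSession named spark and placeholder region and table names:

import com.github.traviscrawford.spark.dynamodb._

val users = spark.read.dynamodb("us-west-2", "users")
users.printSchema()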
Example 4
Source File: streaming_coffee.scala    From odsc-west-streaming-trends   with GNU General Public License v3.0
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.DataFrameReader

///////////////// NOTE: Open another terminal window (nc -lk 9999) FIRST OR THIS WON'T WORK /////////////////

case class Coffee(
  name: String,
  roast:Int,
  region:String,
  bean: String,
  acidity:Int = 1,
  bitterness:Int = 1,
  flavors: Seq[String]
  )

case class CoffeeRating(
  coffeeName: String,
  score: Int,
  notes: Option[String] = None
  )

// In a standalone application, the SparkSession would be created in main():
// val sparkSession = SparkSession.builder.appName("coffeeShop").getOrCreate()

// In the spark-shell, reuse the session that is already provided as `spark`
val sparkSession = spark

val availableCoffee = Seq(
  Coffee(name="folgers", roast=2, region="US", bean="robusta", acidity=7, bitterness=10, flavors=Seq("nutty")),
  Coffee(name="yuban", roast=2, region="Mexico", bean="robusta", acidity=6, bitterness=7, flavors=Seq("nutty")),
  Coffee(name="nespresso", roast=2, region="Cuba", bean="arabica", acidity=5, bitterness=3, flavors=Seq("nutty", "chocolate")),
  Coffee(name="ritual", roast=1, region="Brazil", bean="arabica", acidity=2, bitterness=1, flavors=Seq("fruity", "floral", "chocolate")),
  Coffee(name="four barrel", roast=1, region="Columbia", bean="arabica", flavors=Seq("nutty", "fruity")),
  Coffee(name="french collection", roast=3, region="France", bean="arabica", flavors=Seq("nutty", "fruity"))
  )



import spark.implicits._

def asCoffeeRating(input: String): CoffeeRating = {
    val data = input.split(",")
    val coffeeName = data(0)
    val score = data(1).toInt
    val note = if (data.size > 2) Some(data(2)) else None
    CoffeeRating(coffeeName, score, note)
}

val coffeeStandDF = sparkSession.sparkContext.parallelize(availableCoffee, 3).toDF
val coffeeRatingsReader = sparkSession.readStream.format("socket").option("host", "localhost").option("port", 9999).load()
val rawRatingsData: Dataset[String] = coffeeRatingsReader.as[String]

val coffeeRatingsInput = rawRatingsData.map { asCoffeeRating }.toDF
val coffeeAndRatingsDF = coffeeStandDF.join(coffeeRatingsInput, coffeeStandDF("name") === coffeeRatingsInput("coffeeName"))
val averageRatings = coffeeAndRatingsDF.groupBy(col("name")).agg(avg("score") as "rating").sort(desc("rating"))
val query = averageRatings.writeStream.outputMode("complete").format("console").start()

// nc -lk 9999
//folgers,1
//folgers,2,"gross"
//ritual,5,"awesome"

 
Example 5
Source File: CsvOptions.scala    From seahorse   with Apache License 2.0
package ai.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv

import org.apache.spark.sql.{DataFrameReader, Row, DataFrameWriter}

import ai.deepsense.deeplang.doperations.inout.CsvParameters
import ai.deepsense.deeplang.doperations.inout.CsvParameters.ColumnSeparatorChoice

object CsvOptions {

  def map(
      namesIncluded: Boolean,
      columnSeparator: ColumnSeparatorChoice): Map[String, String] = {
    val headerFlag = if (namesIncluded) "true" else "false"
    Map(
      "header" -> headerFlag,
      "delimiter" -> CsvParameters.determineColumnSeparatorOf(columnSeparator).toString,
      "inferSchema" -> "false"
    )
  }

  // Unfortunately, making an analogous RichDataFrameWriter is awkward, if not impossible,
  // because DataFrameWriter became parameterized between Spark 1.6 and 2.0.
  implicit class RichDataFrameReader(self: DataFrameReader) {
    def setCsvOptions(
        namesIncluded: Boolean,
        columnSeparator: ColumnSeparatorChoice): DataFrameReader = {
      val paramMap = map(namesIncluded, columnSeparator)
      paramMap.foldLeft(self) { case (reader, (key, value)) =>
        reader.option(key, value)
      }
    }
  }


} 
Example 6
Source File: DataFrameReaderConfigurator.scala    From sparkling-graph   with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.loaders.csv

import org.apache.spark.sql.DataFrameReader


object DataFrameReaderConfigurator {

  implicit class addAbilityToConfigureDataFrameReader(reader: DataFrameReader) {
    def applyConfiguration(cSVLoaderConfig: CsvLoaderConfig): DataFrameReader = {
      reader.option("header", cSVLoaderConfig.header.toString)
      reader.option("delimiter", cSVLoaderConfig.delimiter)
      reader.option("quote", cSVLoaderConfig.quote)
      cSVLoaderConfig.schema match {
        case None => reader.option("inferSchema", cSVLoaderConfig.inferSchema.toString)
        case Some(schema) => reader.schema(schema)
      }
      reader
    }
  }

} 
Example 7
Source File: LibSVMRelation.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.ml.source.libsvm

import com.google.common.base.Objects

import org.apache.spark.Logging
import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrameReader, DataFrame, Row, SQLContext}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
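
// LibSVMRelation, the BaseRelation implementation returned below, is defined elsewhere in this file.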


@Since("1.6.0")
class DefaultSource extends RelationProvider with DataSourceRegister {

  @Since("1.6.0")
  override def shortName(): String = "libsvm"

  @Since("1.6.0")
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String])
    : BaseRelation = {
    val path = parameters.getOrElse("path",
      throw new IllegalArgumentException("'path' must be specified"))
    val numFeatures = parameters.getOrElse("numFeatures", "-1").toInt
    val vectorType = parameters.getOrElse("vectorType", "sparse")
    new LibSVMRelation(path, numFeatures, vectorType)(sqlContext)
  }
} 
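Because DefaultSource registers the short name "libsvm", the relation is reachable through the reader API. A usage sketch in the Spark 1.6-era API this example targets (the path and option values are placeholders):

val df = sqlContext.read
  .format("libsvm")
  .option("numFeatures", "780")
  .option("vectorType", "dense")
  .load("data/mllib/sample_libsvm_data.txt")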
Example 8
Source File: package.scala    From spark-iqmulus   with Apache License 2.0
package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame, Row }
import org.apache.spark.sql.types.StructType

package object ply {

  
  implicit class PlyDataFrameReader(reader: DataFrameReader) {
    def ply: String => DataFrame = reader.format("fr.ign.spark.iqmulus.ply").load
  }

  implicit class PlyDataFrame(df: DataFrame) {
    def saveAsPly(location: String, littleEndian: Boolean = true) = {
      val df_id = df.drop("pid").drop("fid")
      val schema = df_id.schema
      val saver = (key: Int, iter: Iterator[Row]) =>
        Iterator(iter.saveAsPly(s"$location/$key.ply", schema, littleEndian))
      df_id.rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class PlyRowIterator(iter: Iterator[Row]) {
    def saveAsPly(
      filename: String,
      schema: StructType,
      littleEndian: Boolean
    ) = {
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val rows = iter.toArray
      val count = rows.size.toLong
      val header = new PlyHeader(filename, littleEndian, Map("vertex" -> ((count, schema))))
      val dos = new java.io.DataOutputStream(f);
      dos.write(header.toString.getBytes)
      val ros = new RowOutputStream(dos, littleEndian, schema)
      rows.foreach(ros.write)
      dos.close
      header
    }
  }
} 
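The reader implicit exposes PLY files through the usual reader API, and saveAsPly writes one PLY file per partition. A usage sketch, assuming a SparkSession named spark and hypothetical paths:

import fr.ign.spark.iqmulus.ply._

val points = spark.read.ply("/data/lidar/scan.ply")   // load a PLY file as a DataFrame
points.saveAsPly("/data/lidar/out")                   // writes /data/lidar/out/<partition>.ply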
Example 9
Source File: package.scala    From spark-iqmulus   with Apache License 2.0
package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame, Row }
import org.apache.spark.sql.types.{ FloatType, StructType }

package object xyz {

  
  implicit class XyzDataFrameReader(reader: DataFrameReader) {
    def xyz: String => DataFrame = reader.format("fr.ign.spark.iqmulus.xyz").load
  }

  implicit class XyzDataFrame(df: DataFrame) {
    def saveAsXyz(location: String) = {
      val df_id = df.drop("id")
      require(df_id.schema.fieldNames.take(3) sameElements Array("x", "y", "z"))
      require(df_id.schema.fields.map(_.dataType).take(3).forall(_ == FloatType))
      val saver = (key: Int, iter: Iterator[Row]) => Iterator(iter.saveXyz(s"$location/$key.xyz"))
      df_id.rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class XyzRowIterator(iter: Iterator[Row]) {
    def saveXyz(filename: String) = {
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val dos = new java.io.DataOutputStream(f)
      var count = 0L
      iter.foreach(row => { count += 1; dos.writeBytes(row.mkString("", "\t", "\n")) })
      dos.close
      (filename, count)
    }
  }
} 
Example 10
Source File: package.scala    From spark-iqmulus   with Apache License 2.0
package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame }
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.Row

package object las {

  
  implicit class LasDataFrameReader(reader: DataFrameReader) {
    def las: String => DataFrame = reader.format("fr.ign.spark.iqmulus.las").load
  }

  implicit class LasDataFrame(df: DataFrame) {
    def saveAsLas(
      location: String,
      formatOpt: Option[Byte] = None,
      version: Version = Version(),
      scale: Array[Double] = Array(0.01, 0.01, 0.01),
      offset: Array[Double] = Array(0, 0, 0)
    ) = {
      val format = formatOpt.getOrElse(LasHeader.formatFromSchema(df.schema))
      val schema = LasHeader.schema(format) // no user types for now
      val cols = schema.fieldNames.intersect(df.schema.fieldNames)
      val saver = (key: Int, iter: Iterator[Row]) =>
        Iterator(iter.saveAsLas(s"$location/$key.las", schema, format, scale, offset, version))
      df.select(cols.head, cols.tail: _*).rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class LasRowIterator(iter: Iterator[Row]) {
    def saveAsLas(
      filename: String, schema: StructType, format: Byte,
      scale: Array[Double], offset: Array[Double], version: Version = Version()
    ) = {
      // materialize the partition to access it in a single pass, TODO workaround that 
      val rows = iter.toArray
      val count = rows.length.toLong
      val pmin = Array.fill[Double](3)(Double.PositiveInfinity)
      val pmax = Array.fill[Double](3)(Double.NegativeInfinity)
      val countByReturn = Array.fill[Long](15)(0)
      rows.foreach { row =>
        val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble
        val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble
        val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble
        val ret = row.getAs[Byte]("flags") & 0x3
        countByReturn(ret) += 1
        pmin(0) = Math.min(pmin(0), x)
        pmin(1) = Math.min(pmin(1), y)
        pmin(2) = Math.min(pmin(2), z)
        pmax(0) = Math.max(pmax(0), x)
        pmax(1) = Math.max(pmax(1), y)
        pmax(2) = Math.max(pmax(2), z)
      }
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val header = new LasHeader(filename, format, count, pmin, pmax, scale, offset,
        version = version, pdr_return_nb = countByReturn)
      val dos = new java.io.DataOutputStream(f);
      header.write(dos)
      val ros = new RowOutputStream(dos, littleEndian = true, schema)
      rows.foreach(ros.write)
      dos.close
      header
    }
  }
} 
Example 11
Source File: DataFrameReaderFunctions.scala    From couchbase-spark-connector   with Apache License 2.0
package com.couchbase.spark.sql

import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, DataFrameReader}

class DataFrameReaderFunctions(@transient val dfr: DataFrameReader) extends Serializable {
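
  // `source` names the Couchbase data source format; it is defined elsewhere in this class.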

  
  private def buildFrame(options: Map[String, String] = null, schema: StructType = null,
    schemaFilter: Option[Filter] = null): DataFrame = {
    val builder = dfr
      .format(source)
      .schema(schema)

    val filter = schemaFilter.map(N1QLRelation.filterToExpression)
    if (filter.isDefined) {
      builder.option("schemaFilter", filter.get)
    }

    if (options != null) {
      builder.options(options)
    }

    builder.load()
  }

} 
Example 12
Source File: AngelTestUtils.scala    From sona   with Apache License 2.0
package com.tencent.angel.sona.ml.util

import com.tencent.angel.sona.core.DriverContext
import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.sql.{DataFrameReader, SparkSession}

class AngelTestUtils extends SparkFunSuite {
  protected var spark: SparkSession = _
  protected var libsvm: DataFrameReader = _
  protected var dummy: DataFrameReader = _
  protected var sparkConf: SparkConf = _
  protected var driverCtx: DriverContext = _

  protected override def beforeAll(): Unit = {
    super.beforeAll()
    spark = SparkSession.builder()
      .master("local[2]")
      .appName("AngelClassification")
      .getOrCreate()

    libsvm = spark.read.format("libsvmex")
    dummy = spark.read.format("dummy")
    sparkConf = spark.sparkContext.getConf

    driverCtx = DriverContext.get(sparkConf)
    driverCtx.startAngelAndPSAgent()
  }

  protected override def afterAll(): Unit = {
    super.afterAll()
    driverCtx.stopAngelAndPSAgent()
  }
} 
Example 13
Source File: package.scala    From spark-athena   with Apache License 2.0
package io.github.tmheo.spark

import java.util.Properties

import com.amazonaws.athena.jdbc.shaded.com.amazonaws.regions.Regions
import org.apache.spark.sql.{DataFrame, DataFrameReader}

import scala.collection.JavaConverters._

package object athena {

  implicit class AthenaDataFrameReader(reader: DataFrameReader) {

    def athena(table: String): DataFrame = {
      reader.format("io.github.tmheo.spark.athena")
        .option(JDBCOptions.JDBC_TABLE_NAME, table)
        .load
    }

    def athena(table: String, region: String, s3StatingDir: String): DataFrame = {
      reader.format("io.github.tmheo.spark.athena")
        .option(JDBCOptions.JDBC_TABLE_NAME, table)
        .option("region", region)
        .option("s3_staging_dir", s3StatingDir)
        .load
    }

    def athena(table: String, s3StatingDir: String): DataFrame = {
      athena(table, Regions.getCurrentRegion.getName, s3StatingDir)
    }

    def athena(table: String, properties: Properties): DataFrame = {
      val options = properties.asScala
      options += (JDBCOptions.JDBC_TABLE_NAME -> table)
      reader.format("io.github.tmheo.spark.athena").options(options).load
    }

  }

} 
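A usage sketch for the overloads above, with hypothetical table, region, and S3 staging-directory values:

import io.github.tmheo.spark.athena._

// Region and staging directory given explicitly
val events = spark.read.athena("my_db.events", "us-east-1", "s3://my-bucket/athena-staging/")

// Or let the connector resolve the current EC2 region
val users = spark.read.athena("my_db.users", "s3://my-bucket/athena-staging/")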
Example 14
Source File: CsvOptions.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv

import org.apache.spark.sql.{DataFrameReader, Row, DataFrameWriter}

import io.deepsense.deeplang.doperations.inout.CsvParameters
import io.deepsense.deeplang.doperations.inout.CsvParameters.ColumnSeparatorChoice

object CsvOptions {

  def map(
      namesIncluded: Boolean,
      columnSeparator: ColumnSeparatorChoice): Map[String, String] = {
    val headerFlag = if (namesIncluded) "true" else "false"
    Map(
      "header" -> headerFlag,
      "delimiter" -> CsvParameters.determineColumnSeparatorOf(columnSeparator).toString,
      "inferSchema" -> "false"
    )
  }

  // Unfortunately, making an analogous RichDataFrameWriter is awkward, if not impossible,
  // because DataFrameWriter became parameterized between Spark 1.6 and 2.0.
  implicit class RichDataFrameReader(self: DataFrameReader) {
    def setCsvOptions(
        namesIncluded: Boolean,
        columnSeparator: ColumnSeparatorChoice): DataFrameReader = {
      val paramMap = map(namesIncluded, columnSeparator)
      paramMap.foldLeft(self) { case (reader, (key, value)) =>
        reader.option(key, value)
      }
    }
  }


}