org.apache.spark.sql.DataFrameWriter Scala Examples

The following examples show how to use org.apache.spark.sql.DataFrameWriter. Each example names its source file, the project it comes from, and that project's license.
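Before the project-specific examples, here is a minimal standalone sketch of the DataFrameWriter fluent API they all build on; the format, options, and output path are illustrative only.

import org.apache.spark.sql.{SaveMode, SparkSession}

object DataFrameWriterBasics {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("writer-basics").master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

    // df.write returns a DataFrameWriter[Row]; format/mode/option/partitionBy all
    // return the same writer, so calls chain until save() triggers the write.
    df.write
      .format("parquet")               // built-in sources: "parquet", "csv", "json", "orc", ...
      .mode(SaveMode.Overwrite)        // Append, Overwrite, ErrorIfExists or Ignore
      .option("compression", "snappy") // source-specific option
      .partitionBy("id")               // directory-style partitioning
      .save("/tmp/writer-basics")      // illustrative output path

    spark.stop()
  }
}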
Example 1
Source File: SolrDataFrameImplicits.scala    From spark-solr   with Apache License 2.0
package com.lucidworks.spark.util

import org.apache.spark.sql.{DataFrameReader, DataFrameWriter, Row, SaveMode}


object SolrDataFrameImplicits {

  implicit class SolrReader(reader: DataFrameReader) {
    def solr(collection: String, query: String = "*:*") =
      reader.format("solr").option("collection", collection).option("query", query).load()
    def solr(collection: String, options: Map[String, String]) =
      reader.format("solr").option("collection", collection).options(options).load()
  }

  implicit class SolrWriter(writer: DataFrameWriter[Row]) {
    def solr(collectionName: String, softCommitSecs: Int = 10, overwrite: Boolean = false, format: String = "solr") = {
      writer
        .format(format)
        .option("collection", collectionName)
        .option("soft_commit_secs", softCommitSecs.toString)
        .mode(if(overwrite) SaveMode.Overwrite else SaveMode.Append)
        .save()
    }
  }
} 
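Usage is a one-liner once the implicits are in scope; a minimal sketch, assuming a reachable Solr/ZooKeeper and placeholder zkhost and collection values:

import com.lucidworks.spark.util.SolrDataFrameImplicits._
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("solr-write").master("local[*]").getOrCreate()
import spark.implicits._

val events = Seq((1, "login"), (2, "logout")).toDF("id", "action")

// SolrWriter enriches DataFrameWriter[Row]: .solr(...) sets the format, collection
// and soft_commit_secs options, picks Append or Overwrite, and calls save().
events.write.option("zkhost", "localhost:9983").solr("events", overwrite = true)

// SolrReader does the same for reads.
val readBack = spark.read.option("zkhost", "localhost:9983").solr("events")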
Example 2
Source File: SparkExtension.scala    From azure-kusto-spark   with Apache License 2.0
package com.microsoft.kusto.spark.sql.extension

import com.microsoft.azure.kusto.data.ClientRequestProperties
import com.microsoft.kusto.spark.datasink.{KustoSinkOptions, SparkIngestionProperties}
import com.microsoft.kusto.spark.datasource.KustoSourceOptions
import org.apache.spark.sql.streaming.DataStreamWriter
import org.apache.spark.sql.{DataFrameWriter, _}

object SparkExtension {

  implicit class DataFrameReaderExtension(df: DataFrameReader) {

    def kusto(kustoCluster: String, database: String, query: String, conf: Map[String, String] = Map.empty[String, String], cpr: Option[ClientRequestProperties] = None): DataFrame = {
      if (cpr.isDefined) {
        df.option(KustoSourceOptions.KUSTO_CLIENT_REQUEST_PROPERTIES_JSON, cpr.get.toString)
      }

      df.format("com.microsoft.kusto.spark.datasource")
        .option(KustoSourceOptions.KUSTO_CLUSTER, kustoCluster)
        .option(KustoSourceOptions.KUSTO_DATABASE, database)
        .option(KustoSourceOptions.KUSTO_QUERY, query)
        .options(conf)
        .load()
    }
  }

  implicit class DataFrameWriterExtension(df: DataFrameWriter[Row]) {
    def kusto(kustoCluster: String, database: String, table: String, conf: Map[String, String] = Map.empty[String, String], sparkIngestionProperties: Option[SparkIngestionProperties] = None): Unit = {
      if (sparkIngestionProperties.isDefined) {
        df.option(KustoSinkOptions.KUSTO_SPARK_INGESTION_PROPERTIES_JSON, sparkIngestionProperties.get.toString)
      }

      df.format("com.microsoft.kusto.spark.datasource")
      .option(KustoSinkOptions.KUSTO_CLUSTER, kustoCluster)
      .option(KustoSinkOptions.KUSTO_DATABASE, database)
      .option(KustoSinkOptions.KUSTO_TABLE, table)
      .options(conf)
      .mode(SaveMode.Append)
      .save()
    }
  }

  implicit class DataStreamWriterExtension(df: DataStreamWriter[Row]) {
    def kusto(kustoCluster: String, database: String, table: String, conf: Map[String, String] = Map.empty[String, String], sparkIngestionProperties: Option[SparkIngestionProperties] = None): Unit = {
      if (sparkIngestionProperties.isDefined) {
        df.option(KustoSinkOptions.KUSTO_SPARK_INGESTION_PROPERTIES_JSON, sparkIngestionProperties.get.toString)
      }

      df.format("com.microsoft.kusto.spark.datasource")
        .option(KustoSinkOptions.KUSTO_CLUSTER, kustoCluster)
        .option(KustoSinkOptions.KUSTO_DATABASE, database)
        .option(KustoSinkOptions.KUSTO_TABLE, table)
        .options(conf)
    }
  }

} 
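A hedged usage sketch of the batch writer extension above; the cluster, database, and table names are placeholders, and real jobs would pass authentication settings through the conf map via KustoSinkOptions keys:

import com.microsoft.kusto.spark.sql.extension.SparkExtension._
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("kusto-write").master("local[*]").getOrCreate()
import spark.implicits._

val readings = Seq(("device-1", 42L), ("device-2", 17L)).toDF("deviceId", "reading")

// DataFrameWriterExtension adds .kusto(...), which fills in the cluster, database
// and table options, applies conf, and saves with SaveMode.Append.
readings.write.kusto(
  kustoCluster = "mycluster.westeurope", // placeholder cluster
  database = "mydb",                     // placeholder database
  table = "Readings",                    // placeholder table
  conf = Map.empty                       // authentication options would go here
)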
Example 3
Source File: Output.scala    From sparta   with Apache License 2.0
package com.stratio.sparta.sdk.pipeline.output

import java.io.{Serializable => JSerializable}

import akka.event.slf4j.SLF4JLogging
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import com.stratio.sparta.sdk.properties.{CustomProperties, Parameterizable}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, DataFrameWriter, Row, SaveMode}

abstract class Output(val name: String, properties: Map[String, JSerializable])
  extends Parameterizable(properties) with SLF4JLogging with CustomProperties {

  val customKey = "saveOptions"
  val customPropertyKey = "saveOptionsKey"
  val customPropertyValue = "saveOptionsValue"
  val propertiesWithCustom = properties ++ getCustomProperties

  def setUp(options: Map[String, String] = Map.empty[String, String]): Unit = {}

  def cleanUp(options: Map[String, String] = Map.empty[String, String]): Unit = {}

  def save(dataFrame: DataFrame, saveMode: SaveModeEnum.Value, options: Map[String, String]): Unit

  def supportedSaveModes: Seq[SaveModeEnum.Value] = SaveModeEnum.allSaveModes

  def validateSaveMode(saveMode: SaveModeEnum.Value): Unit = {
    if (!supportedSaveModes.contains(saveMode))
      log.info(s"Save mode $saveMode selected not supported by the output $name." +
        s" Using the default mode ${SaveModeEnum.Append}"
      )
  }
}

object Output extends SLF4JLogging {

  final val ClassSuffix = "Output"
  final val SparkConfigurationMethod = "getSparkConfiguration"
  final val Separator = "_"
  final val FieldsSeparator = ","
  final val PrimaryKey = "primaryKey"
  final val TableNameKey = "tableName"
  final val PartitionByKey = "partitionBy"
  final val TimeDimensionKey = "timeDimension"
  final val MeasureMetadataKey = "measure"
  final val PrimaryKeyMetadataKey = "pk"

  def getSparkSaveMode(saveModeEnum: SaveModeEnum.Value): SaveMode =
    saveModeEnum match {
      case SaveModeEnum.Append => SaveMode.Append
      case SaveModeEnum.ErrorIfExists => SaveMode.ErrorIfExists
      case SaveModeEnum.Overwrite => SaveMode.Overwrite
      case SaveModeEnum.Ignore => SaveMode.Ignore
      case SaveModeEnum.Upsert => SaveMode.Append
      case _ =>
        log.warn(s"Save Mode $saveModeEnum not supported, using default save mode ${SaveModeEnum.Append}")
        SaveMode.Append
    }

  def getTimeFromOptions(options: Map[String, String]): Option[String] = options.get(TimeDimensionKey).notBlank

  def getPrimaryKeyOptions(options: Map[String, String]): Option[String] = options.get(PrimaryKey).notBlank

  def getTableNameFromOptions(options: Map[String, String]): String =
    options.getOrElse(TableNameKey, {
      log.error("Table name not defined")
      throw new NoSuchElementException("tableName not found in options")
    })

  def applyPartitionBy(options: Map[String, String],
                       dataFrame: DataFrameWriter[Row],
                       schemaFields: Array[StructField]): DataFrameWriter[Row] = {

    options.get(PartitionByKey).notBlank.fold(dataFrame)(partitions => {
      val fieldsInDataFrame = schemaFields.map(field => field.name)
      val partitionFields = partitions.split(",")
      if (partitionFields.forall(field => fieldsInDataFrame.contains(field)))
        dataFrame.partitionBy(partitionFields: _*)
      else {
        log.warn(s"Impossible to execute partition by fields: $partitionFields because the dataFrame not contain all" +
          s" fields. The dataFrame only contains: ${fieldsInDataFrame.mkString(",")}")
        dataFrame
      }
    })
  }

  def defaultTimeStampField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField =
    StructField(fieldName, TimestampType, nullable, metadata)

  def defaultDateField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField =
    StructField(fieldName, DateType, nullable, metadata)

  def defaultStringField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField =
    StructField(fieldName, StringType, nullable, metadata)

  def defaultGeoField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField =
    StructField(fieldName, ArrayType(DoubleType), nullable, metadata)

  def defaultLongField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField =
    StructField(fieldName, LongType, nullable, metadata)
} 
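A sketch of how the companion-object helpers above are typically combined inside a concrete Output's save implementation; the format and table-as-path choice are assumptions:

import com.stratio.sparta.sdk.pipeline.output.{Output, SaveModeEnum}
import org.apache.spark.sql.DataFrame

def writeTable(dataFrame: DataFrame, saveMode: SaveModeEnum.Value, options: Map[String, String]): Unit = {
  val tableName = Output.getTableNameFromOptions(options)   // throws if "tableName" is missing
  val sparkSaveMode = Output.getSparkSaveMode(saveMode)     // maps SaveModeEnum onto Spark's SaveMode
  val writer = Output.applyPartitionBy(                     // adds partitionBy only if every field exists
    options,
    dataFrame.write.mode(sparkSaveMode).options(options),
    dataFrame.schema.fields
  )
  writer.format("parquet").save(tableName)                  // format and target are illustrative
}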
Example 4
Source File: ALSModeling.scala    From Machine-Learning-with-Spark-Second-Edition   with MIT License
package com.spark.recommendation

import java.util

import com.spark.recommendation.FeatureExtraction.{Rating, parseRating}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.sql.{Row, DataFrame, DataFrameWriter}


object ALSModeling {

  def createALSModel() {
    val ratings = FeatureExtraction.getFeatures();

    val Array(training, test) = ratings.randomSplit(Array(0.8, 0.2))
    println(training.first())

    // Build the recommendation model using ALS on the training data
    val als = new ALS()
      .setMaxIter(5)
      .setRegParam(0.01)
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")

    val model = als.fit(training)
    println(model.userFactors.count())
    println(model.itemFactors.count())

    val predictions = model.transform(test)
    predictions.printSchema()

    val evaluator = new RegressionEvaluator()
      .setMetricName("rmse")
      .setLabelCol("rating")
      .setPredictionCol("prediction")
    val rmse = evaluator.evaluate(predictions)

    println(s"Root-mean-square error = $rmse")
  }

  def main(args: Array[String]) {
    createALSModel()
  }

} 
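The example imports DataFrameWriter but never exercises it; a small hedged helper that could be called at the end of createALSModel to persist the predictions (path and format are assumptions):

import org.apache.spark.sql.{DataFrame, SaveMode}

// Hypothetical helper: persist the ALS predictions DataFrame.
def savePredictions(predictions: DataFrame, path: String = "/tmp/als-predictions"): Unit =
  predictions.write
    .mode(SaveMode.Overwrite)   // replace any previous run
    .partitionBy("userId")      // ALS output keeps the userId column
    .parquet(path)              // illustrative output location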
Example 5
Source File: CsvOptions.scala    From seahorse   with Apache License 2.0
package ai.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv

import org.apache.spark.sql.{DataFrameReader, Row, DataFrameWriter}

import ai.deepsense.deeplang.doperations.inout.CsvParameters
import ai.deepsense.deeplang.doperations.inout.CsvParameters.ColumnSeparatorChoice

object CsvOptions {

  def map(
      namesIncluded: Boolean,
      columnSeparator: ColumnSeparatorChoice): Map[String, String] = {
    val headerFlag = if (namesIncluded) "true" else "false"
    Map(
      "header" -> headerFlag,
      "delimiter" -> CsvParameters.determineColumnSeparatorOf(columnSeparator).toString,
      "inferSchema" -> "false"
    )
  }

  // Unfortunately, making analogous RichDataFrameWriter is awkward, if not impossible.
  // This is because between Spark 1.6 and 2.0 DataFrameWriter became parametrized
  implicit class RichDataFrameReader(self: DataFrameReader) {
    def setCsvOptions(
        namesIncluded: Boolean,
        columnSeparator: ColumnSeparatorChoice): DataFrameReader = {
      val paramMap = map(namesIncluded, columnSeparator)
      paramMap.foldLeft(self) { case (reader, (key, value)) =>
        reader.option(key, value)
      }
    }
  }


} 
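The comment above notes that a writer-side analogue is awkward because DataFrameWriter became parameterized between Spark 1.6 and 2.0; targeting Spark 2.x only, a hedged sketch of that analogue (the separator choice and output path are placeholders) could look like this:

import org.apache.spark.sql.{DataFrameWriter, Row}

// Fold the same options map onto a DataFrameWriter[Row], mirroring setCsvOptions.
def setCsvWriterOptions(writer: DataFrameWriter[Row], options: Map[String, String]): DataFrameWriter[Row] =
  options.foldLeft(writer) { case (w, (key, value)) => w.option(key, value) }

// Usage sketch (df and someSeparatorChoice are placeholders):
// setCsvWriterOptions(df.write, CsvOptions.map(namesIncluded = true, columnSeparator = someSeparatorChoice))
//   .csv("/tmp/output.csv")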
Example 6
Source File: package.scala    From spark-iqmulus   with Apache License 2.0
package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame, Row }
import org.apache.spark.sql.types.StructType

package object ply {

  
  implicit class PlyDataFrameReader(reader: DataFrameReader) {
    def ply: String => DataFrame = reader.format("fr.ign.spark.iqmulus.ply").load
  }

  implicit class PlyDataFrame(df: DataFrame) {
    def saveAsPly(location: String, littleEndian: Boolean = true) = {
      val df_id = df.drop("pid").drop("fid")
      val schema = df_id.schema
      val saver = (key: Int, iter: Iterator[Row]) =>
        Iterator(iter.saveAsPly(s"$location/$key.ply", schema, littleEndian))
      df_id.rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class PlyRowIterator(iter: Iterator[Row]) {
    def saveAsPly(
      filename: String,
      schema: StructType,
      littleEndian: Boolean
    ) = {
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val rows = iter.toArray
      val count = rows.size.toLong
      val header = new PlyHeader(filename, littleEndian, Map("vertex" -> ((count, schema))))
      val dos = new java.io.DataOutputStream(f);
      dos.write(header.toString.getBytes)
      val ros = new RowOutputStream(dos, littleEndian, schema)
      rows.foreach(ros.write)
      dos.close
      header
    }
  }
} 
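A brief usage sketch of the ply implicits; the paths are placeholders and an actual run needs the spark-iqmulus source on the classpath:

import fr.ign.spark.iqmulus.ply._
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("ply-example").master("local[*]").getOrCreate()

// PlyDataFrameReader adds .ply, a String => DataFrame backed by the custom source.
val cloud = spark.read.ply("/data/lidar/scene.ply")          // placeholder input path

// PlyDataFrame adds saveAsPly, which writes one .ply file per partition
// and collects the resulting headers back on the driver.
val headers = cloud.saveAsPly("/data/lidar/out", littleEndian = true)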
Example 7
Source File: package.scala    From spark-iqmulus   with Apache License 2.0
package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame, Row }
import org.apache.spark.sql.types.{ FloatType, StructType }

package object xyz {

  
  implicit class XyzDataFrameReader(reader: DataFrameReader) {
    def xyz: String => DataFrame = reader.format("fr.ign.spark.iqmulus.xyz").load
  }

  implicit class XyzDataFrame(df: DataFrame) {
    def saveAsXyz(location: String) = {
      val df_id = df.drop("id")
      require(df_id.schema.fieldNames.take(3) sameElements Array("x", "y", "z"))
      require(df_id.schema.fields.map(_.dataType).take(3).forall(_ == FloatType))
      val saver = (key: Int, iter: Iterator[Row]) => Iterator(iter.saveXyz(s"$location/$key.xyz"))
      df_id.rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class XyzRowIterator(iter: Iterator[Row]) {
    def saveXyz(filename: String) = {
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val dos = new java.io.DataOutputStream(f)
      var count = 0L
      iter.foreach(row => { count += 1; dos.writeBytes(row.mkString("", "\t", "\n")) })
      dos.close
      (filename, count)
    }
  }
} 
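A matching sketch for the xyz implicits; paths are placeholders, and saveAsXyz requires the first three columns (after dropping id) to be x, y, z of FloatType, as the require calls above enforce:

import fr.ign.spark.iqmulus.xyz._
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("xyz-example").master("local[*]").getOrCreate()

// Read through the custom source, then write one .xyz file per partition;
// saveAsXyz returns the (filename, row count) pairs collected from the executors.
val points = spark.read.xyz("/data/points/in.xyz")           // placeholder input path
val written = points.saveAsXyz("/data/points/out")           // placeholder output directory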
Example 8
Source File: package.scala    From spark-iqmulus   with Apache License 2.0
package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame }
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.Row

package object las {

  
  implicit class LasDataFrameReader(reader: DataFrameReader) {
    def las: String => DataFrame = reader.format("fr.ign.spark.iqmulus.las").load
  }

  implicit class LasDataFrame(df: DataFrame) {
    def saveAsLas(
      location: String,
      formatOpt: Option[Byte] = None,
      version: Version = Version(),
      scale: Array[Double] = Array(0.01, 0.01, 0.01),
      offset: Array[Double] = Array(0, 0, 0)
    ) = {
      val format = formatOpt.getOrElse(LasHeader.formatFromSchema(df.schema))
      val schema = LasHeader.schema(format) // no user types for now
      val cols = schema.fieldNames.intersect(df.schema.fieldNames)
      val saver = (key: Int, iter: Iterator[Row]) =>
        Iterator(iter.saveAsLas(s"$location/$key.las", schema, format, scale, offset, version))
      df.select(cols.head, cols.tail: _*).rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class LasRowIterator(iter: Iterator[Row]) {
    def saveAsLas(
      filename: String, schema: StructType, format: Byte,
      scale: Array[Double], offset: Array[Double], version: Version = Version()
    ) = {
      // materialize the partition to access it in a single pass, TODO workaround that 
      val rows = iter.toArray
      val count = rows.length.toLong
      val pmin = Array.fill[Double](3)(Double.PositiveInfinity)
      val pmax = Array.fill[Double](3)(Double.NegativeInfinity)
      val countByReturn = Array.fill[Long](15)(0)
      rows.foreach { row =>
        val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble
        val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble
        val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble
        val ret = row.getAs[Byte]("flags") & 0x3
        countByReturn(ret) += 1
        pmin(0) = Math.min(pmin(0), x)
        pmin(1) = Math.min(pmin(1), y)
        pmin(2) = Math.min(pmin(2), z)
        pmax(0) = Math.max(pmax(0), x)
        pmax(1) = Math.max(pmax(1), y)
        pmax(2) = Math.max(pmax(2), z)
      }
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val header = new LasHeader(filename, format, count, pmin, pmax, scale, offset,
        version = version, pdr_return_nb = countByReturn)
      val dos = new java.io.DataOutputStream(f);
      header.write(dos)
      val ros = new RowOutputStream(dos, littleEndian = true, schema)
      rows.foreach(ros.write)
      dos.close
      header
    }
  }
} 
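And the same pattern for the las implicits; the paths, scale, and offset values are placeholders:

import fr.ign.spark.iqmulus.las._
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("las-example").master("local[*]").getOrCreate()

// LasDataFrameReader adds .las; LasDataFrame adds saveAsLas, which derives the point
// format from the schema unless one is forced through formatOpt.
val cloud = spark.read.las("/data/lidar/in.las")             // placeholder input path
cloud.saveAsLas(
  "/data/lidar/out",                                         // placeholder output directory
  scale = Array(0.001, 0.001, 0.001),
  offset = Array(0.0, 0.0, 0.0)
)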
Example 9
Source File: CsvOptions.scala    From seahorse-workflow-executor   with Apache License 2.0
package io.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv

import org.apache.spark.sql.{DataFrameReader, Row, DataFrameWriter}

import io.deepsense.deeplang.doperations.inout.CsvParameters
import io.deepsense.deeplang.doperations.inout.CsvParameters.ColumnSeparatorChoice

object CsvOptions {

  def map(
      namesIncluded: Boolean,
      columnSeparator: ColumnSeparatorChoice): Map[String, String] = {
    val headerFlag = if (namesIncluded) "true" else "false"
    Map(
      "header" -> headerFlag,
      "delimiter" -> CsvParameters.determineColumnSeparatorOf(columnSeparator).toString,
      "inferSchema" -> "false"
    )
  }

  // Unfortunately, making analogous RichDataFrameWriter is awkward, if not impossible.
  // This is because between Spark 1.6 and 2.0 DataFrameWriter became parametrized
  implicit class RichDataFrameReader(self: DataFrameReader) {
    def setCsvOptions(
        namesIncluded: Boolean,
        columnSeparator: ColumnSeparatorChoice): DataFrameReader = {
      val paramMap = map(namesIncluded, columnSeparator)
      paramMap.foldLeft(self) { case (reader, (key, value)) =>
        reader.option(key, value)
      }
    }
  }


}