org.apache.spark.sql.SaveMode Scala Examples

The following examples show how to use org.apache.spark.sql.SaveMode. Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
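Before diving into the project examples, here is a minimal, self-contained sketch (the output path and session settings are illustrative, not taken from any project below) of how the four SaveMode values behave when writing a DataFrame:

import org.apache.spark.sql.{SaveMode, SparkSession}

object SaveModeBasics {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("SaveModeBasics").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

    // Overwrite: replace whatever already exists at the target.
    df.write.mode(SaveMode.Overwrite).parquet("/tmp/savemode-demo")

    // Append: add these rows to the existing data.
    df.write.mode(SaveMode.Append).parquet("/tmp/savemode-demo")

    // Ignore: silently skip the write because the target already exists.
    df.write.mode(SaveMode.Ignore).parquet("/tmp/savemode-demo")

    // ErrorIfExists (the default): would throw here because the target already exists.
    // df.write.mode(SaveMode.ErrorIfExists).parquet("/tmp/savemode-demo")

    spark.stop()
  }
}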
Example 1
Source File: DefaultSource.scala    From spark-snowflake   with Apache License 2.0
package net.snowflake.spark.snowflake

import net.snowflake.spark.snowflake.streaming.SnowflakeSink
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_SHORT_NAME
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources._
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
import org.slf4j.LoggerFactory


  override def createRelation(sqlContext: SQLContext,
                              saveMode: SaveMode,
                              parameters: Map[String, String],
                              data: DataFrame): BaseRelation = {

    val params = Parameters.mergeParameters(parameters)
    // check spark version for push down
    if (params.autoPushdown) {
      SnowflakeConnectorUtils.checkVersionAndEnablePushdown(
        sqlContext.sparkSession
      )
    }
    // pass parameters to pushdown functions
    pushdowns.setGlobalParameter(params)
    val table = params.table.getOrElse {
      throw new IllegalArgumentException(
        "For save operations you must specify a Snowfake table name with the 'dbtable' parameter"
      )
    }

    def tableExists: Boolean = {
      val conn = jdbcWrapper.getConnector(params)
      try {
        jdbcWrapper.tableExists(conn, table.toString)
      } finally {
        conn.close()
      }
    }

    val (doSave, dropExisting) = saveMode match {
      case SaveMode.Append => (true, false)
      case SaveMode.Overwrite => (true, true)
      case SaveMode.ErrorIfExists =>
        if (tableExists) {
          sys.error(
            s"Table $table already exists! (SaveMode is set to ErrorIfExists)"
          )
        } else {
          (true, false)
        }
      case SaveMode.Ignore =>
        if (tableExists) {
          log.info(s"Table $table already exists -- ignoring save request.")
          (false, false)
        } else {
          (true, false)
        }
    }

    if (doSave) {
      val updatedParams = parameters.updated("overwrite", dropExisting.toString)
      new SnowflakeWriter(jdbcWrapper)
        .save(
          sqlContext,
          data,
          saveMode,
          Parameters.mergeParameters(updatedParams)
        )

    }

    createRelation(sqlContext, parameters)
  }

  override def createSink(sqlContext: SQLContext,
                          parameters: Map[String, String],
                          partitionColumns: Seq[String],
                          outputMode: OutputMode): Sink =
    new SnowflakeSink(sqlContext, parameters, partitionColumns, outputMode)
} 
Example 2
Source File: JdbcRelationProvider.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val url = jdbcOptions.url
    val table = jdbcOptions.table
    val createTableOptions = jdbcOptions.createTableOptions
    val isTruncate = jdbcOptions.isTruncate

    val conn = JdbcUtils.createConnectionFactory(jdbcOptions)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, url, table)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (isTruncate && isCascadingTruncateTable(url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, table)
              saveTable(df, url, table, jdbcOptions)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, table)
              createTable(df.schema, url, table, createTableOptions, conn)
              saveTable(df, url, table, jdbcOptions)
            }

          case SaveMode.Append =>
            saveTable(df, url, table, jdbcOptions)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(df.schema, url, table, createTableOptions, conn)
        saveTable(df, url, table, jdbcOptions)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
} 
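For context on Example 2, here is a hedged sketch of the caller side: the value passed to DataFrameWriter.mode(...) is the SaveMode that arrives as the mode argument of JdbcRelationProvider.createRelation above. The JDBC URL, table name, and credentials are illustrative assumptions.

import org.apache.spark.sql.{SaveMode, SparkSession}

object JdbcWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("JdbcWriteSketch").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

    df.write
      .format("jdbc")
      .option("url", "jdbc:postgresql://localhost:5432/demo") // illustrative connection URL
      .option("dbtable", "public.demo_table")
      .option("user", "demo")
      .option("password", "demo")
      .mode(SaveMode.Append) // Append branch: saveTable without dropping or truncating
      .save()

    spark.stop()
  }
}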
Example 3
Source File: OnErrorSuite.scala    From spark-snowflake   with Apache License 2.0
package net.snowflake.spark.snowflake

import net.snowflake.client.jdbc.SnowflakeSQLException
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME
import org.apache.spark.sql.{DataFrame, Row, SaveMode}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class OnErrorSuite extends IntegrationSuiteBase {
  lazy val table = s"spark_test_table_$randomSuffix"

  lazy val schema = new StructType(
    Array(StructField("var", StringType, nullable = false))
  )

  lazy val df: DataFrame = sparkSession.createDataFrame(
    sc.parallelize(
      Seq(Row("{\"dsadas\nadsa\":12311}"), Row("{\"abc\":334}")) // invalid json key
    ),
    schema
  )

  override def beforeAll(): Unit = {
    super.beforeAll()
    jdbcUpdate(s"create or replace table $table(var variant)")
  }

  override def afterAll(): Unit = {
    jdbcUpdate(s"drop table $table")
    super.afterAll()
  }

  test("continue_on_error off") {

    assertThrows[SnowflakeSQLException] {
      df.write
        .format(SNOWFLAKE_SOURCE_NAME)
        .options(connectorOptionsNoTable)
        .option("dbtable", table)
        .mode(SaveMode.Append)
        .save()
    }
  }

  test("continue_on_error on") {
    df.write
      .format(SNOWFLAKE_SOURCE_NAME)
      .options(connectorOptionsNoTable)
      .option("continue_on_error", "on")
      .option("dbtable", table)
      .mode(SaveMode.Append)
      .save()

    val result = sparkSession.read
      .format(SNOWFLAKE_SOURCE_NAME)
      .options(connectorOptionsNoTable)
      .option("dbtable", table)
      .load()

    assert(result.collect().length == 1)
  }

} 
Example 4
Source File: SFTableNameSuite.scala    From spark-snowflake   with Apache License 2.0
package net.snowflake.spark.snowflake

import net.snowflake.spark.snowflake.DefaultJDBCWrapper.DataBaseOperations
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_SHORT_NAME
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.sql.types.{
  IntegerType,
  StringType,
  StructField,
  StructType
}

class SFTableNameSuite extends IntegrationSuiteBase {
  lazy val tableName = s""""spark_test_table_$randomSuffix""""

  override def afterAll(): Unit = {
    conn.dropTable(tableName)
    super.afterAll()
  }

  test("table name include \"") {
    val schema = StructType(
      List(StructField("num", IntegerType), StructField("str", StringType))
    )

    val data = sc.parallelize(Seq(Row(1, "a"), Row(2, "b")))

    val df = sparkSession.createDataFrame(data, schema)

    df.write
      .format(SNOWFLAKE_SOURCE_SHORT_NAME)
      .options(connectorOptions)
      .option("dbtable", tableName)
      .mode(SaveMode.Overwrite)
      .save()

    val result = sparkSession.read
      .format(SNOWFLAKE_SOURCE_SHORT_NAME)
      .options(connectorOptions)
      .option("dbtable", tableName)
      .load()
      .count()

    assert(result == 2)

  }
} 
Example 5
Source File: BigQueryImporter.scala    From pg2bq   with MIT License
package com.powerspace.pg2bq

import com.google.cloud.bigquery.JobInfo.WriteDisposition
import com.google.cloud.bigquery._
import com.typesafe.scalalogging.LazyLogging
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

class BigQueryImporter(spark: SparkSession, tmpBucket: String, dataset: String) extends LazyLogging with DataImporter {

  val bigquery: BigQuery = BigQueryOptions.getDefaultInstance.getService

  // ensure the dataset exists or create it
  getOrCreateDataset(dataset)

  override def createOrOverride(df: DataFrame, tableName: String): Unit = {
    saveIntoGcs(df, tableName)
    loadFromGcsToBq(tableName)
  }

  private def loadFromGcsToBq(tableName: String): Unit = {
    val configuration = LoadJobConfiguration
      .builder(TableId.of(dataset, tableName), s"gs://$tmpBucket/$tableName/*.avro")
      .setFormatOptions(FormatOptions.avro())
      .setWriteDisposition(WriteDisposition.WRITE_TRUNCATE)
      .build()

    val job = bigquery.create(JobInfo.newBuilder(configuration).build())

    logger.info(s"Importing $tableName from bucket $tmpBucket to dataset $dataset...")
    job.waitFor()
    logger.info(s"$tableName import done!")
  }

  private def saveIntoGcs(df: DataFrame, tableName: String): Unit = {
    df.write
      .mode(SaveMode.Overwrite)
      .format("com.databricks.spark.avro")
      .save(s"gs://$tmpBucket/$tableName")
  }

  def getOrCreateDataset(datasetName: String): Dataset = {
    scala.Option(bigquery.getDataset(datasetName)) match {
      case Some(ds) =>
        logger.info(s"Dataset $datasetName already exist.")
        ds
      case None =>
        logger.info(s"Dataset $datasetName does not exist, creating...")
        val ds = bigquery.create(DatasetInfo.of(datasetName))
        logger.info(s"Dataset $datasetName created!")
        ds
    }
  }

} 
Example 6
Source File: LibSVMRelationSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.source.libsvm

import java.io.File
import java.nio.charset.StandardCharsets

import com.google.common.io.Files

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.util.Utils


class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  // Path for dataset
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data")
    val file = new File(dir, "part-00000")
    Files.write(lines, file, StandardCharsets.UTF_8)
    path = dir.toURI.toString
  }

  override def afterAll(): Unit = {
    try {
      Utils.deleteRecursively(new File(path))
    } finally {
      super.afterAll()
    }
  }

  test("select as sparse vector") {
    val df = spark.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = spark.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data and read it again") {
    val df = spark.read.format("libsvm").load(path)
    val tempDir2 = new File(tempDir, "read_write_test")
    val writepath = tempDir2.toURI.toString
    // TODO: Remove requirement to coalesce by supporting multiple reads.
    df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath)

    val df2 = spark.read.format("libsvm").load(writepath)
    val row1 = df2.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data failed due to invalid schema") {
    val df = spark.read.format("text").load(path)
    intercept[SparkException] {
      df.write.format("libsvm").save(path + "_2")
    }
  }

  test("select features from libsvm relation") {
    val df = spark.read.format("libsvm").load(path)
    df.select("features").rdd.map { case Row(d: Vector) => d }.first
    df.select("features").collect
  }
} 
Example 7
Source File: CarbonDataFrameExample.scala    From CarbonDataLearning   with GNU General Public License v3.0
package org.github.xubo245.carbonDataLearning.example

import org.apache.carbondata.examples.util.ExampleUtils
import org.apache.spark.sql.{SaveMode, SparkSession}

object CarbonDataFrameExample {

  def main(args: Array[String]) {
    val spark = ExampleUtils.createCarbonSession("CarbonDataFrameExample")
    exampleBody(spark)
    spark.close()
  }

  def exampleBody(spark : SparkSession): Unit = {
    // Writes Dataframe to CarbonData file:
    import spark.implicits._
    val df = spark.sparkContext.parallelize(1 to 100)
      .map(x => ("a" + x % 10, "b", x))
      .toDF("c1", "c2", "number")

    // Saves dataframe to carbondata file
    df.write
      .format("carbondata")
      .option("tableName", "carbon_df_table")
      .option("partitionColumns", "c1")  // a list of column names
      .mode(SaveMode.Overwrite)
      .save()

    spark.sql(""" SELECT * FROM carbon_df_table """).show()

    spark.sql("SHOW PARTITIONS carbon_df_table").show()

    // Specify schema
    import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
    val customSchema = StructType(Array(
      StructField("c1", StringType),
      StructField("c2", StringType),
      StructField("number", IntegerType)))

    // Reads carbondata to dataframe
    val carbondf = spark.read
      .format("carbondata")
      .schema(customSchema)
      // .option("dbname", "db_name") the system will use "default" as dbname if not set this option
      .option("tableName", "carbon_df_table")
      .load()

    df.write
      .format("csv")
      .option("tableName", "csv_df_table")
      .option("partitionColumns", "c1") // a list of column names
      //      .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
      .mode(SaveMode.Overwrite)
      .csv("/Users/xubo/Desktop/xubo/git/carbondata3/examples/spark2/target/csv/1.csv")

    // Reads csv to dataframe
    val carbondf2 = spark.read
      .format("csv")
      .schema(customSchema)
      // .option("dbname", "db_name") the system will use "default" as dbname if not set this option
      .option("tableName", "csv_df_table")

      //      .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
      .load("/Users/xubo/Desktop/xubo/git/carbondata3/examples/spark2/target/csv")

    carbondf2.show()


    // Dataframe operations
    carbondf.printSchema()
    carbondf.select($"c1", $"number" + 10).show()
    carbondf.filter($"number" > 31).show()

    spark.sql("DROP TABLE IF EXISTS carbon_df_table")
  }
} 
Example 8
Source File: SFObjectWriter.scala    From spark-salesforce   with Apache License 2.0
package com.springml.spark.salesforce

import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SaveMode}
import com.springml.salesforce.wave.api.APIFactory
import com.springml.salesforce.wave.api.BulkAPI
import com.springml.salesforce.wave.util.WaveAPIConstants
import com.springml.salesforce.wave.model.JobInfo



class SFObjectWriter (
    val username: String,
    val password: String,
    val login: String,
    val version: String,
    val sfObject: String,
    val mode: SaveMode,
    val upsert: Boolean,
    val externalIdFieldName: String,
    val csvHeader: String
    ) extends Serializable {

  @transient val logger = Logger.getLogger(classOf[SFObjectWriter])

  def writeData(rdd: RDD[Row]): Boolean = {
    val csvRDD = rdd.map(row => row.toSeq.map(value => Utils.rowValue(value)).mkString(","))

    val jobInfo = new JobInfo(WaveAPIConstants.STR_CSV, sfObject, operation(mode, upsert))
    jobInfo.setExternalIdFieldName(externalIdFieldName)

    val jobId = bulkAPI.createJob(jobInfo).getId

    csvRDD.mapPartitionsWithIndex {
      case (index, iterator) => {
        val records = iterator.toArray.mkString("\n")
        var batchInfoId : String = null
        if (records != null && !records.isEmpty()) {
          val data = csvHeader + "\n" + records
          val batchInfo = bulkAPI.addBatch(jobId, data)
          batchInfoId = batchInfo.getId
        }

        val success = (batchInfoId != null)
        // Job status will be checked after completing all batches
        List(success).iterator
      }
    }.reduce((a, b) => a & b)

    bulkAPI.closeJob(jobId)
    var i = 1
    while (i < 999999) {
      if (bulkAPI.isCompleted(jobId)) {
        logger.info("Job completed")
        return true
      }

      logger.info("Job not completed, waiting...")
      Thread.sleep(200)
      i = i + 1
    }

    print("Returning false...")
    logger.info("Job not completed. Timeout..." )
    false
  }

  // Create new instance of BulkAPI every time because Spark workers cannot serialize the object
  private def bulkAPI(): BulkAPI = {
    APIFactory.getInstance().bulkAPI(username, password, login, version)
  }

  private def operation(mode: SaveMode, upsert: Boolean): String = {
    if (upsert) {
      "upsert"
    } else if (mode != null && SaveMode.Overwrite.name().equalsIgnoreCase(mode.name())) {
      WaveAPIConstants.STR_UPDATE
    } else if (mode != null && SaveMode.Append.name().equalsIgnoreCase(mode.name())) {
      WaveAPIConstants.STR_INSERT
    } else {
      logger.warn("SaveMode " + mode + " Not supported. Using 'insert' operation")
      WaveAPIConstants.STR_INSERT
    }
  }

} 
Example 9
Source File: DefaultSource.scala    From spark-cdm   with MIT License
package com.microsoft.cdm

import java.util.Optional

import com.microsoft.cdm.read.CDMDataSourceReader
import com.microsoft.cdm.utils.{AADProvider, ADLGen2Provider, Constants, DataConverter}
import com.microsoft.cdm.write.CDMDataSourceWriter
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.sources.v2._
import org.apache.spark.sql.sources.v2.writer.DataSourceWriter
import org.apache.spark.sql.types.StructType


  def createWriter(jobId: String,
                   schema: StructType,
                   mode: SaveMode,
                   options: DataSourceOptions): Optional[DataSourceWriter] = {
    val modelDirectory = options.get("cdmFolder").get()
    val modelName = options.get("cdmModelName").get()
    val entity = options.get("entity").get()

    Optional.of(new CDMDataSourceWriter(jobId,
      schema, mode, getDataStorage(options), modelDirectory, modelName, entity, new DataConverter()))
  }
} 
Example 10
Source File: CDMDataSourceWriter.scala    From spark-cdm   with MIT License
package com.microsoft.cdm.write

import com.microsoft.cdm.utils._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriterFactory, WriterCommitMessage}
import org.apache.spark.sql.types.StructType


class CDMDataSourceWriter(val jobId: String,
                          val schema: StructType,
                          val mode: SaveMode,
                          val adlProvider: ADLGen2Provider,
                          val modelDirectory: String,
                          val modelName: String,
                          val entityName: String,
                          val dataConvert: DataConverter) extends DataSourceWriter {

  def createWriterFactory: DataWriterFactory[InternalRow] = {
    new CDMDataWriterFactory(adlProvider, schema, jobId, modelDirectory, entityName)
  }

  private val createNewModel = (modelUri: String, attributes: Seq[CDMAttribute], partitions: Seq[CDMPartition]) => {
    CDMModel.createNewModel(modelName, entityName, attributes, partitions).toJson
  }

  private val appendExistingModel = (modelUri: String, attributes: Seq[CDMAttribute], partitions: Seq[CDMPartition]) => {
    val existingModel = new CDMModel(adlProvider.getFullFile(modelUri))
    existingModel.appendOrReplaceEntity(entityName, attributes, partitions)
    existingModel.toJson
  }

  def commit(messages: Array[WriterCommitMessage]): Unit = {
    val partitions = messages.map{ message =>
      val csvMsg = message.asInstanceOf[CSVCommitMessage]
      new CDMPartition(name=csvMsg.name, location=csvMsg.csvLocation)
    }

    val attributes = schema.map{ col =>
      new CDMAttribute(col.name, dataConvert.toCdmType(col.dataType))
    }

    // Check if there's an existing model in this directory to append to
    val modelUri = adlProvider.getModelJsonInDirectory(modelDirectory)
    val modelJson = (if(adlProvider.fileExists(modelUri)) appendExistingModel else createNewModel)(modelUri,
      attributes, partitions)

    adlProvider.uploadData(modelJson, modelUri)
  }

  // TODO: error handling
  def abort(messages: Array[WriterCommitMessage]): Unit = {}

} 
Example 11
Source File: PutHiveMode.scala    From piflow   with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.hive

import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.{SaveMode, SparkSession}

class PutHiveMode extends ConfigurableStop {

  val authorEmail: String = "[email protected]"
  val description: String = "Modes for saving data hive"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  var database:String = _
  var table:String = _
  var saveMode:String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val inDF = in.read()

    inDF.write.format("hive").mode(saveMode).saveAsTable(database + "." + table)
  }

  def initialize(ctx: ProcessContext): Unit = {

  }

  def setProperties(map : Map[String, Any]) = {
    database = MapUtil.get(map,"database").asInstanceOf[String]
    table = MapUtil.get(map,"table").asInstanceOf[String]
    saveMode = MapUtil.get(map,"saveMode").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {

    val saveModeOption = Set("append","overwrite","error","ignore")
    var descriptor : List[PropertyDescriptor] = List()

    val database=new PropertyDescriptor()
      .name("database")
      .displayName("DataBase")
      .description("The database name")
      .defaultValue("")
      .required(true)
      .example("test")
    descriptor = database :: descriptor

    val table = new PropertyDescriptor()
      .name("table")
      .displayName("Table")
      .description("The table name")
      .defaultValue("")
      .required(true)
      .example("student")
    descriptor = table :: descriptor

    val saveMode = new PropertyDescriptor()
      .name("saveMode")
      .displayName("SaveMode")
      .description("The save mode for table")
      .allowableValues(saveModeOption)
      .defaultValue("append")
      .required(true)
      .example("append")
    descriptor = saveMode :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/hive/PutHiveMode.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.HiveGroup)
  }


} 
Example 12
Source File: MysqlWrite.scala    From piflow   with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.jdbc

import java.util.Properties

import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.{SaveMode, SparkSession}

import scala.beans.BeanProperty

class MysqlWrite extends ConfigurableStop{

  val authorEmail: String = "[email protected]"
  val description: String = "Write data to mysql database with jdbc"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  var url:String = _
  var user:String = _
  var password:String = _
  var dbtable:String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val jdbcDF = in.read()
    val properties = new Properties()
    properties.put("user", user)
    properties.put("password", password)
    jdbcDF.write.mode(SaveMode.Append).jdbc(url,dbtable,properties)
    out.write(jdbcDF)
  }

  def initialize(ctx: ProcessContext): Unit = {

  }

  override def setProperties(map: Map[String, Any]): Unit = {
    url = MapUtil.get(map,"url").asInstanceOf[String]
    user = MapUtil.get(map,"user").asInstanceOf[String]
    password = MapUtil.get(map,"password").asInstanceOf[String]
    dbtable = MapUtil.get(map,"dbtable").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()

    val url=new PropertyDescriptor()
      .name("url")
      .displayName("Url")
      .description("The Url, for example jdbc:mysql://127.0.0.1/dbname")
      .defaultValue("")
      .required(true)
      .example("jdbc:mysql://127.0.0.1/dbname")
    descriptor = url :: descriptor

    val user=new PropertyDescriptor()
      .name("user")
      .displayName("User")
      .description("The user name of database")
      .defaultValue("")
      .required(true)
      .example("root")
    descriptor = user :: descriptor

    val password=new PropertyDescriptor()
      .name("password")
      .displayName("Password")
      .description("The password of database")
      .defaultValue("")
      .required(true)
      .example("123456")
      .sensitive(true)
    descriptor = password :: descriptor

    val dbtable=new PropertyDescriptor()
      .name("dbtable")
      .displayName("DBTable")
      .description("The table you want to write")
      .defaultValue("")
      .required(true)
      .example("test")
    descriptor = dbtable :: descriptor

    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/jdbc/MysqlWrite.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.JdbcGroup)
  }


} 
Example 13
Source File: JsonSave.scala    From piflow   with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.json

import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SaveMode

import scala.beans.BeanProperty

class JsonSave extends ConfigurableStop{

  val authorEmail: String = "[email protected]"
  val description: String = "Save data into json file"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  var jsonSavePath: String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val jsonDF = in.read()
    jsonDF.write.format("json").mode(SaveMode.Overwrite).save(jsonSavePath)
  }

  def initialize(ctx: ProcessContext): Unit = {

  }

  override def setProperties(map: Map[String, Any]): Unit = {
    jsonSavePath = MapUtil.get(map,"jsonSavePath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val jsonSavePath = new PropertyDescriptor()
      .name("jsonSavePath")
      .displayName("JsonSavePath")
      .description("The save path of the json file")
      .defaultValue("")
      .required(true)
        .example("hdfs://192.168.3.138:8020/work/testJson/test/")

    descriptor = jsonSavePath :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/json/JsonSave.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.JsonGroup)
  }


} 
Example 14
Source File: ReadFromRedisTest.scala    From piflow   with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.redis

import java.net.InetAddress

import cn.piflow.Runner
import cn.piflow.conf.bean.FlowBean
import cn.piflow.conf.util.{FileUtil, OptionUtil}
import cn.piflow.util.{PropertyUtil, ServerIpUtil}
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.h2.tools.Server
import org.junit.Test

import scala.util.parsing.json.JSON

class ReadFromRedisTest {

  @Test
  def testFlow(): Unit ={

    //parse flow json
    val file = "src/main/resources/flow/redis/ReadFromRedis.json"
    val flowJsonStr = FileUtil.fileReader(file)
    val map = OptionUtil.getAny(JSON.parseFull(flowJsonStr)).asInstanceOf[Map[String, Any]]
    println(map)

    //create flow
    val flowBean = FlowBean(map)
    val flow = flowBean.constructFlow()

    val ip = InetAddress.getLocalHost.getHostAddress

    cn.piflow.util.FileUtil.writeFile("server.ip=" + ip, ServerIpUtil.getServerIpFile())
    val h2Server = Server.createTcpServer("-tcp", "-tcpAllowOthers", "-tcpPort", "50001").start()

    //execute flow
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("CsvParserTest")
      .config("spark.driver.memory", "1g")
      .config("spark.executor.memory", "2g")
      .config("spark.cores.max", "2")
      .config("hive.metastore.uris",PropertyUtil.getPropertyValue("hive.metastore.uris"))
      .enableHiveSupport()
      .getOrCreate()

    val process = Runner.create()
      .bind(classOf[SparkSession].getName, spark)
      .bind("checkpoint.path", "")
      .bind("debug.path","")
      .start(flow);

    process.awaitTermination();
    val pid = process.pid();
    println(pid + "!!!!!!!!!!!!!!!!!!!!!")
    spark.close();
  }


} 
Example 15
Source File: ScalaRiakParquetExample.scala    From spark-riak-connector   with Apache License 2.0
package com.basho.riak.spark.examples.parquet
import org.apache.spark.sql.{SaveMode, SQLContext}
import org.apache.spark.{SparkContext, SparkConf}


object ScalaRiakParquetExample {
  case class TSData(site: String, species: String, measurementDate: Long, latitude: Double, longitude: Double, value: Double )
  val startDate = System.currentTimeMillis()
  val endDate = startDate + 100
  val tableName = "parquet_demo"
  val parquetFileName = "riak-ts-data.parquet"

  val testData = Seq(
    TSData("MY7", "PM10", startDate, 51.52254, -0.15459, 41.4),
    TSData("MY7", "PM10", startDate + 10, 51.52254, -0.15459, 41.2),
    TSData("MY7", "PM10", startDate + 20, 51.52254, -0.15459, 39.1),
    TSData("MY7", "PM10", startDate + 30, 51.52254, -0.15459, 39.5),
    TSData("MY7", "PM10", startDate + 40, 51.52254, -0.15459, 29.9),
    TSData("MY7", "PM10", startDate + 50, 51.52254, -0.15459, 34.2),
    TSData("MY7", "PM10", startDate + 60, 51.52254, -0.15459, 28.5),
    TSData("MY7", "PM10", startDate + 70, 51.52254, -0.15459, 39.6),
    TSData("MY7", "PM10", startDate + 80, 51.52254, -0.15459, 29.2),
    TSData("MY7", "PM10", startDate + 90, 51.52254, -0.15459, 31.3)
  )

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("Simple Scala Riak TS Demo")

    setSparkOpt(sparkConf, "spark.master", "local")
    setSparkOpt(sparkConf, "spark.riak.connection.host", "127.0.0.1:8087")
    println(s"Test data start time: $startDate")

    val sc = new SparkContext(sparkConf)
    val sqlCtx = SQLContext.getOrCreate(sc)

    import sqlCtx.implicits._

    val rdd = sc.parallelize(testData)
    rdd.toDF().write.format("org.apache.spark.sql.riak")
      .mode(SaveMode.Append).save(tableName)

    val df = sqlCtx.read.format("org.apache.spark.sql.riak")
      .load(tableName).registerTempTable(tableName)

    val from = (startDate / 1000).toInt
    val query = s"select * from $tableName where measurementDate >= CAST($from AS TIMESTAMP) " +
      s"AND measurementDate <= CAST(${from + 1} AS TIMESTAMP) AND site = 'MY7' AND species = 'PM10'"

    println(s"Query: $query")
    val rows = sqlCtx.sql(query)
    rows.show()
    val schema = rows.schema

    rows.write.mode("overwrite").parquet(parquetFileName)
    println(s"Data was successfully saved to Parquet file: $parquetFileName")

    val parquetFile = sqlCtx.read.parquet(parquetFileName)
    parquetFile.registerTempTable("parquetFile")
    val data = sqlCtx.sql("SELECT MAX(value) max_value FROM parquetFile ")

    println("Maximum value retrieved from Parquet file:")
    data.show()

  }

  private def setSparkOpt(sparkConf: SparkConf, option: String, defaultOptVal: String): SparkConf = {
    val optval = sparkConf.getOption(option).getOrElse(defaultOptVal)
    sparkConf.set(option, optval)
  }
} 
Example 16
Source File: SolrDataFrameImplicits.scala    From spark-solr   with Apache License 2.0
package com.lucidworks.spark.util

import org.apache.spark.sql.{DataFrameReader, DataFrameWriter, Row, SaveMode}


object SolrDataFrameImplicits {

  implicit class SolrReader(reader: DataFrameReader) {
    def solr(collection: String, query: String = "*:*") =
      reader.format("solr").option("collection", collection).option("query", query).load()
    def solr(collection: String, options: Map[String, String]) =
      reader.format("solr").option("collection", collection).options(options).load()
  }

  implicit class SolrWriter(writer: DataFrameWriter[Row]) {
    def solr(collectionName: String, softCommitSecs: Int = 10, overwrite: Boolean = false, format: String = "solr") = {
      writer
        .format(format)
        .option("collection", collectionName)
        .option("soft_commit_secs", softCommitSecs.toString)
        .mode(if(overwrite) SaveMode.Overwrite else SaveMode.Append)
        .save()
    }
  }
} 
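A hedged usage sketch for Example 16: importing the implicits lets reads and writes go through the solr(...) extension methods, and the overwrite flag selects SaveMode.Overwrite versus the default SaveMode.Append. The SparkSession setup, collection name, and zkhost value are illustrative assumptions.

import com.lucidworks.spark.util.SolrDataFrameImplicits._
import org.apache.spark.sql.SparkSession

object SolrImplicitsUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("SolrImplicitsUsage").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

    // overwrite = true maps to .mode(SaveMode.Overwrite) inside SolrWriter.solr
    df.write.option("zkhost", "localhost:9983").solr("demo_collection", overwrite = true)

    // the default maps to .mode(SaveMode.Append)
    df.write.option("zkhost", "localhost:9983").solr("demo_collection")

    // read back through the SolrReader extension method
    spark.read.option("zkhost", "localhost:9983").solr("demo_collection").show()

    spark.stop()
  }
}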
Example 17
Source File: DefaultSource.scala    From spark-solr   with Apache License 2.0
package solr

import com.lucidworks.spark.{SolrRelation, SolrStreamWriter}
import com.lucidworks.spark.util.Constants
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.streaming.OutputMode

class DefaultSource extends RelationProvider with CreatableRelationProvider with StreamSinkProvider with DataSourceRegister {

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = {
    try {
      new SolrRelation(parameters, sqlContext.sparkSession)
    } catch {
      case re: RuntimeException => throw re
      case e: Exception => throw new RuntimeException(e)
    }
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    try {
      // TODO: What to do with the saveMode?
      val solrRelation: SolrRelation = new SolrRelation(parameters, Some(df), sqlContext.sparkSession)
      solrRelation.insert(df, overwrite = true)
      solrRelation
    } catch {
      case re: RuntimeException => throw re
      case e: Exception => throw new RuntimeException(e)
    }
  }

  override def shortName(): String = Constants.SOLR_FORMAT

  override def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new SolrStreamWriter(sqlContext.sparkSession, parameters, partitionColumns, outputMode)
  }
} 
Example 18
Source File: HDFSBase.scala    From daf   with BSD 3-Clause "New" or "Revised" License
package daf.util

import better.files.{ File, _ }
import daf.util.DataFrameClasses.{ Address, Person }
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.hdfs.{ HdfsConfiguration, MiniDFSCluster }
import org.apache.hadoop.test.PathUtils
import org.apache.spark.sql.{ SaveMode, SparkSession }
import org.scalatest.{ BeforeAndAfterAll, FlatSpec, Matchers }
import org.slf4j.LoggerFactory

import scala.util.{ Failure, Random, Try }

abstract class HDFSBase extends FlatSpec with Matchers with BeforeAndAfterAll {

  var miniCluster: Try[MiniDFSCluster] = Failure[MiniDFSCluster](new Exception)

  var fileSystem: Try[FileSystem] = Failure[FileSystem](new Exception)

  val sparkSession: SparkSession = SparkSession.builder().master("local").getOrCreate()

  val alogger = LoggerFactory.getLogger(this.getClass)

  val (testDataPath, confPath) = {
    val testDataPath = s"${PathUtils.getTestDir(this.getClass).getCanonicalPath}/MiniCluster"
    val confPath = s"$testDataPath/conf"
    (
      testDataPath.toFile.createIfNotExists(asDirectory = true, createParents = false),
      confPath.toFile.createIfNotExists(asDirectory = true, createParents = false)
    )
  }

  def pathAvro = "opendata/test.avro"
  def pathParquet = "opendata/test.parquet"
  def pathCsv = "opendata/test.csv"

  def getSparkSession = sparkSession

  override def beforeAll(): Unit = {

    val conf = new HdfsConfiguration()
    conf.setBoolean("dfs.permissions", true)
    System.clearProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA)

    conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, testDataPath.pathAsString)
    //FileUtil.fullyDelete(testDataPath.toJava)

    conf.set(s"hadoop.proxyuser.${System.getProperties.get("user.name")}.groups", "*")
    conf.set(s"hadoop.proxyuser.${System.getProperties.get("user.name")}.hosts", "*")

    val builder = new MiniDFSCluster.Builder(conf)
    miniCluster = Try(builder.build())
    fileSystem = miniCluster.map(_.getFileSystem)
    fileSystem.foreach(fs => {
      val confFile: File = confPath / "hdfs-site.xml"
      for { os <- confFile.newOutputStream.autoClosed } fs.getConf.writeXml(os)
    })

    writeDf()
  }

  override def afterAll(): Unit = {
    miniCluster.foreach(_.shutdown(true))
    val _ = testDataPath.parent.parent.delete(true)
    sparkSession.stop()
  }

  
  private def writeDf(): Unit = {
    import sparkSession.implicits._

    alogger.info(s"TestDataPath ${testDataPath.toJava.getAbsolutePath}")
    alogger.info(s"ConfPath ${confPath.toJava.getAbsolutePath}")
    val persons = (1 to 10).map(i => Person(s"Andy$i", Random.nextInt(85), Address("Via Ciccio Cappuccio")))
    val caseClassDS = persons.toDS()
    caseClassDS.write.format("parquet").mode(SaveMode.Overwrite).save(pathParquet)
    caseClassDS.write.format("com.databricks.spark.avro").mode(SaveMode.Overwrite).save(pathAvro)
    // writing the Person dataframe directly generates an exception
    caseClassDS.toDF.select("name", "age").write.format("csv").mode(SaveMode.Overwrite).option("header", "true").save(pathCsv)
  }
}

object DataFrameClasses {

  final case class Address(street: String)

  final case class Person(name: String, age: Int, address: Address)
} 
Example 19
Source File: Target.scala    From almaren-framework   with Apache License 2.0
package com.github.music.of.the.ainur.almaren.state.core

import com.github.music.of.the.ainur.almaren.State
import com.github.music.of.the.ainur.almaren.util.Constants
import org.apache.spark.sql.{DataFrame, SaveMode}

private[almaren] abstract class Target extends State {
  override def executor(df: DataFrame): DataFrame = target(df)

  def target(df: DataFrame): DataFrame
}

case class TargetSql(sql: String) extends Target {
  override def target(df: DataFrame): DataFrame = {
    logger.info(s"sql:{$sql}")
    df.createOrReplaceTempView(Constants.TempTableName)
    val sqlDf = df.sqlContext.sql(sql)
    df
  }
}

case class TargetJdbc(url: String, driver: String, dbtable: String, user: Option[String], password: Option[String], saveMode: SaveMode, params: Map[String, String]) extends Target {
  override def target(df: DataFrame): DataFrame = {
    logger.info(s"url:{$url}, driver:{$driver}, dbtable:{$dbtable}, user:{$user}, params:{$params}")

    val options = (user, password) match {
      case (Some(user), None) => params + ("user" -> user)
      case (Some(user), Some(password)) => params + ("user" -> user, "password" -> password)
      case (_, _) => params
    }

    df.write.format("jdbc")
      .option("url", url)
      .option("driver", driver)
      .option("dbtable", dbtable)
      .options(options)
      .mode(saveMode)
      .save()
    df
  }
}

case class TargetKafka(servers: String, options: Map[String, String]) extends Target {
  override def target(df: DataFrame): DataFrame = {
    logger.info(s"options: $options")
    df.write
      .format("kafka")
      .option("kafka.bootstrap.servers", servers)
      .options(options)
      .save()
    df
  }
}

case class TargetFile(
                       format: String,
                       path: String,
                       params: Map[String, String],
                       saveMode: SaveMode) extends Target {
  override def target(df: DataFrame): DataFrame = {
    logger.info(s"format:{$format}, path:{$path}, params:{$params}")
    df.write
      .format(format)
      .options(params)
      .mode(saveMode)
      .save(path)
    df
  }
} 
Example 20
Source File: DefaultSource.scala    From Linkis   with Apache License 2.0
package com.webank.wedatasphere.spark.excel

import org.apache.hadoop.fs.Path
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider {

  
  override def createRelation(
    sqlContext: SQLContext,
    parameters: Map[String, String],
    schema: StructType
  ): ExcelRelation = {
    ExcelRelation(
      location = checkParameter(parameters, "path"),
      sheetName = parameters.get("sheetName"),
      useHeader = checkParameter(parameters, "useHeader").toBoolean,
      treatEmptyValuesAsNulls = parameters.get("treatEmptyValuesAsNulls").fold(true)(_.toBoolean),
      userSchema = Option(schema),
      inferSheetSchema = parameters.get("inferSchema").fold(false)(_.toBoolean),
      addColorColumns = parameters.get("addColorColumns").fold(false)(_.toBoolean),
      startColumn = parameters.get("startColumn").fold(0)(_.toInt),
      endColumn = parameters.get("endColumn").fold(Int.MaxValue)(_.toInt),
      timestampFormat = parameters.get("timestampFormat"),
      maxRowsInMemory = parameters.get("maxRowsInMemory").map(_.toInt),
      excerptSize = parameters.get("excerptSize").fold(10)(_.toInt),
      parameters = parameters,
      dateFormat = parameters.get("dateFormats").getOrElse("yyyy-MM-dd").split(";").toList
    )(sqlContext)
  }

  override def createRelation(
    sqlContext: SQLContext,
    mode: SaveMode,
    parameters: Map[String, String],
    data: DataFrame
  ): BaseRelation = {
    val path = checkParameter(parameters, "path")
    val sheetName = parameters.getOrElse("sheetName", "Sheet1")
    val useHeader = checkParameter(parameters, "useHeader").toBoolean
    val dateFormat = parameters.getOrElse("dateFormat", ExcelFileSaver.DEFAULT_DATE_FORMAT)
    val timestampFormat = parameters.getOrElse("timestampFormat", ExcelFileSaver.DEFAULT_TIMESTAMP_FORMAT)
    val filesystemPath = new Path(path)
    val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
    fs.setWriteChecksum(false)
    val doSave = if (fs.exists(filesystemPath)) {
      mode match {
        case SaveMode.Append =>
          sys.error(s"Append mode is not supported by ${this.getClass.getCanonicalName}")
        case SaveMode.Overwrite =>
          fs.delete(filesystemPath, true)
          true
        case SaveMode.ErrorIfExists =>
          sys.error(s"path $path already exists.")
        case SaveMode.Ignore => false
      }
    } else {
      true
    }
    if (doSave) {
      // Only save data when the save mode is not ignore.
      (new ExcelFileSaver(fs)).save(
        filesystemPath,
        data,
        sheetName = sheetName,
        useHeader = useHeader,
        dateFormat = dateFormat,
        timestampFormat = timestampFormat
      )
    }

    createRelation(sqlContext, parameters, data.schema)
  }

  // Forces a Parameter to exist, otherwise an exception is thrown.
  private def checkParameter(map: Map[String, String], param: String): String = {
    if (!map.contains(param)) {
      throw new IllegalArgumentException(s"Parameter ${'"'}$param${'"'} is missing in options.")
    } else {
      map.apply(param)
    }
  }

  // Gets the Parameter if it exists, otherwise returns the default argument
  private def parameterOrDefault(map: Map[String, String], param: String, default: String) =
    map.getOrElse(param, default)
} 
Example 21
Source File: DefaultSource.scala    From spark-redis   with BSD 3-Clause "New" or "Revised" License
package org.apache.spark.sql.redis

import org.apache.spark.sql.SaveMode.{Append, ErrorIfExists, Ignore, Overwrite}
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

class DefaultSource extends RelationProvider with SchemaRelationProvider
  with CreatableRelationProvider {

  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String]): BaseRelation = {
    new RedisSourceRelation(sqlContext, parameters, userSpecifiedSchema = None)
  }

  
  override def createRelation(sqlContext: SQLContext, mode: SaveMode,
                              parameters: Map[String, String], data: DataFrame): BaseRelation = {
    val relation = new RedisSourceRelation(sqlContext, parameters, userSpecifiedSchema = None)
    mode match {
      case Append => relation.insert(data, overwrite = false)
      case Overwrite => relation.insert(data, overwrite = true)
      case ErrorIfExists =>
        if (relation.nonEmpty) {
          throw new IllegalStateException("SaveMode is set to ErrorIfExists and dataframe " +
            "already exists in Redis and contains data.")
        }
        relation.insert(data, overwrite = false)
      case Ignore =>
        if (relation.isEmpty) {
          relation.insert(data, overwrite = false)
        }
    }

    relation
  }

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String],
                              schema: StructType): BaseRelation =
    new RedisSourceRelation(sqlContext, parameters, userSpecifiedSchema = Some(schema))
} 
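A hedged usage sketch for Example 21: each .mode(...) below selects one branch of the match in createRelation above. The format name, the "table" option, and the Redis connection settings are assumptions based on the spark-redis documentation, not values from this file.

import org.apache.spark.sql.{SaveMode, SparkSession}

object RedisWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("RedisWriteSketch")
      .config("spark.redis.host", "localhost")
      .config("spark.redis.port", "6379")
      .getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

    // Overwrite -> relation.insert(data, overwrite = true)
    df.write.format("org.apache.spark.sql.redis").option("table", "demo").mode(SaveMode.Overwrite).save()

    // Append -> relation.insert(data, overwrite = false)
    df.write.format("org.apache.spark.sql.redis").option("table", "demo").mode(SaveMode.Append).save()

    // Ignore -> the write happens only if the relation is still empty
    df.write.format("org.apache.spark.sql.redis").option("table", "demo").mode(SaveMode.Ignore).save()

    spark.stop()
  }
}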
Example 22
Source File: DefaultSource.scala    From spark-xml   with Apache License 2.0
package com.databricks.spark.xml

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
import com.databricks.spark.xml.util.XmlFile


  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: StructType): XmlRelation = {
    val path = checkPath(parameters)
    // We need the `charset` and `rowTag` before creating the relation.
    val (charset, rowTag) = {
      val options = XmlOptions(parameters)
      (options.charset, options.rowTag)
    }

    XmlRelation(
      () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag),
      Some(path),
      parameters,
      schema)(sqlContext)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {
    val path = checkPath(parameters)
    val filesystemPath = new Path(path)
    val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
    val doSave = if (fs.exists(filesystemPath)) {
      mode match {
        case SaveMode.Append =>
          throw new IllegalArgumentException(
            s"Append mode is not supported by ${this.getClass.getCanonicalName}")
        case SaveMode.Overwrite =>
          fs.delete(filesystemPath, true)
          true
        case SaveMode.ErrorIfExists =>
          throw new IllegalArgumentException(s"path $path already exists.")
        case SaveMode.Ignore => false
      }
    } else {
      true
    }
    if (doSave) {
      // Only save data when the save mode is not ignore.
      XmlFile.saveAsXmlFile(data, filesystemPath.toString, parameters)
    }
    createRelation(sqlContext, parameters, data.schema)
  }
} 
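A hedged usage sketch for Example 22: the mode chosen by the caller drives the match in createRelation above (Overwrite deletes the existing path and rewrites it, Append is rejected, ErrorIfExists fails, Ignore skips the write). The output path and the rootTag/rowTag option names are illustrative assumptions.

import org.apache.spark.sql.{SaveMode, SparkSession}

object XmlWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("XmlWriteSketch").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

    df.write
      .format("com.databricks.spark.xml")
      .option("rootTag", "rows")
      .option("rowTag", "row")
      .mode(SaveMode.Overwrite) // existing output is deleted, then rewritten
      .save("/tmp/xml-demo")

    spark.stop()
  }
}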
Example 23
Source File: DataGenerator.scala    From iterative-broadcast-join   with Apache License 2.0
package com.godatadriven.generator

import com.godatadriven.common.Config
import com.godatadriven.generator.UniformDataGenerator.KeyLabel
import org.apache.spark.sql.{SaveMode, SparkSession}

import scala.util.Random

trait DataGenerator {


  def numberOfRows(numberOfKeys: Int = Config.numberOfKeys,
                   keysMultiplier: Int = Config.keysMultiplier): Long =
    generateSkewedSequence(numberOfKeys).map(_._2).sum * keysMultiplier.toLong

  
  def generateSkewedSequence(numberOfKeys: Int): List[(Int, Int)] =
    (0 to numberOfKeys).par.map(i =>
      (i, Math.ceil(
        (numberOfKeys.toDouble - i.toDouble) / (i.toDouble + 1.0)
      ).toInt)
    ).toList

  def createMediumTable(spark: SparkSession, tableName: String, numberOfPartitions: Int): Unit = {

    import spark.implicits._

    val df = spark
      .read
      .parquet("table_large.parquet")
      .as[Int]
      .distinct()
      .mapPartitions(rows => {
        val r = new Random()
        rows.map(key =>
          KeyLabel(
            key,
            s"Description for entry $key, that can be anything",
            // Pre-assign which pass of the broadcast iteration this key belongs to
            Math.floor(r.nextDouble() * Config.numberOfBroadcastPasses).toInt
          )
        )
      })
      .repartition(numberOfPartitions)

    assert(df.count() == Config.numberOfKeys)

    df
      .write
      .mode(SaveMode.Overwrite)
      .parquet(tableName)
  }

  def buildTestset(spark: SparkSession,
                   numberOfKeys: Int = Config.numberOfKeys,
                   keysMultiplier: Int = Config.keysMultiplier,
                   numberOfPartitions: Int = Config.numberOfPartitions): Unit

  def getName: String

  def getMediumTableName: String

  def getLargeTableName: String

} 
Example 24
Source File: SparkUtil.scala    From iterative-broadcast-join   with Apache License 2.0
package com.godatadriven

import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

object SparkUtil {

  def dfWrite(df: DataFrame, name: String): Unit =
    df
      .write
      .mode(SaveMode.Overwrite)
      .parquet(name)

  def dfRead(spark: SparkSession, name: String): DataFrame =
    spark
      .read
      .load(name)

} 
Example 25
Source File: CreateHiveTableAsSelectCommand.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.hive.execution

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand



case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    outputColumnNames: Seq[String],
    mode: SaveMode)
  extends DataWritingCommand {

  private val tableIdentifier = tableDesc.identifier

  override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog
    if (catalog.tableExists(tableIdentifier)) {
      assert(mode != SaveMode.Overwrite,
        s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")

      if (mode == SaveMode.ErrorIfExists) {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
      if (mode == SaveMode.Ignore) {
        // Since the table already exists and the save mode is Ignore, we will just return.
        return Seq.empty
      }

      // For CTAS, there is no static partition values to insert.
      val partition = tableDesc.partitionColumnNames.map(_ -> None).toMap
      InsertIntoHiveTable(
        tableDesc,
        partition,
        query,
        overwrite = false,
        ifPartitionNotExists = false,
        outputColumnNames = outputColumnNames).run(sparkSession, child)
    } else {
      // TODO ideally, we should get the output data ready first and then
      // add the relation into catalog, just in case of failure occurs while data
      // processing.
      assert(tableDesc.schema.isEmpty)
      catalog.createTable(
        tableDesc.copy(schema = outputColumns.toStructType), ignoreIfExists = false)

      try {
        // Read back the metadata of the table which was created just now.
        val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier)
        // For CTAS, there is no static partition values to insert.
        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
        InsertIntoHiveTable(
          createdTableMeta,
          partition,
          query,
          overwrite = true,
          ifPartitionNotExists = false,
          outputColumnNames = outputColumnNames).run(sparkSession, child)
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}, " +
    s"TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
} 
Example 26
Source File: XSQLCreateHiveTableAsSelectCommand.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.xsql.execution.command

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand
import org.apache.spark.sql.xsql.XSQLSessionCatalog


case class XSQLCreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    outputColumnNames: Seq[String],
    mode: SaveMode)
  extends DataWritingCommand {

  override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    val tableIdentifier = catalog.getUsedTableIdentifier(tableDesc.identifier)
    val newTableDesc = tableDesc.copy(identifier = tableIdentifier)
    if (catalog.tableExists(tableIdentifier)) {
      assert(
        mode != SaveMode.Overwrite,
        s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")

      if (mode == SaveMode.ErrorIfExists) {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
      if (mode == SaveMode.Ignore) {
        // Since the table already exists and the save mode is Ignore, we will just return.
        return Seq.empty
      }

      XSQLInsertIntoHiveTable(
        newTableDesc,
        Map.empty,
        query,
        overwrite = false,
        ifPartitionNotExists = false,
        outputColumnNames = outputColumnNames).run(sparkSession, child)
    } else {
      // TODO: ideally, we should get the output data ready first and then
      // add the relation to the catalog, in case a failure occurs while
      // processing the data.
      assert(newTableDesc.schema.isEmpty)
      catalog.createTable(newTableDesc.copy(schema = query.schema), ignoreIfExists = false)

      try {
        // Read back the metadata of the table which was created just now.
        val createdTableMeta = catalog.getTableMetadata(newTableDesc.identifier)
        // For CTAS, there is no static partition values to insert.
        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
        XSQLInsertIntoHiveTable(
          createdTableMeta,
          partition,
          query,
          overwrite = true,
          ifPartitionNotExists = false,
          outputColumnNames = outputColumnNames).run(sparkSession, child)
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[TableName: ${tableDesc.identifier.table}, " +
      s"InsertIntoHiveTable]"
  }
} 
Example 27
Source File: JdbcRelationProvider.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val resolver = sqlContext.conf.resolver
    val timeZoneId = sqlContext.conf.sessionLocalTimeZone
    val schema = JDBCRelation.getSchema(resolver, jdbcOptions)
    val parts = JDBCRelation.columnPartition(schema, resolver, timeZoneId, jdbcOptions)
    JDBCRelation(schema, parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val options = new JdbcOptionsInWrite(parameters)
    val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis

    val conn = JdbcUtils.createConnectionFactory(options)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, options)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (options.isTruncate && isCascadingTruncateTable(options.url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, options)
              val tableSchema = JdbcUtils.getSchemaOption(conn, options)
              saveTable(df, tableSchema, isCaseSensitive, options)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, options.table, options)
              createTable(conn, df, options)
              saveTable(df, Some(df.schema), isCaseSensitive, options)
            }

          case SaveMode.Append =>
            val tableSchema = JdbcUtils.getSchemaOption(conn, options)
            saveTable(df, tableSchema, isCaseSensitive, options)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '${options.table}' already exists. " +
                s"SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(conn, df, options)
        saveTable(df, Some(df.schema), isCaseSensitive, options)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
} 
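A short usage sketch for this provider; the JDBC URL, table name and credentials below are placeholders.

import org.apache.spark.sql.{SaveMode, SparkSession}

object JdbcSaveModeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("JdbcSaveModeSketch").getOrCreate()
    val df = spark.range(0, 100).toDF("id")

    // Append: rows are inserted using the existing table's schema (getSchemaOption above).
    df.write
      .format("jdbc")
      .option("url", "jdbc:postgresql://localhost:5432/testdb") // placeholder
      .option("dbtable", "public.ids")                          // placeholder
      .option("user", "spark")
      .option("password", "secret")
      .mode(SaveMode.Append)
      .save()

    // Overwrite with truncate=true truncates and reloads instead of dropping and
    // recreating the table, matching the isTruncate branch above.
    df.write
      .format("jdbc")
      .option("url", "jdbc:postgresql://localhost:5432/testdb")
      .option("dbtable", "public.ids")
      .option("user", "spark")
      .option("password", "secret")
      .option("truncate", "true")
      .mode(SaveMode.Overwrite)
      .save()

    spark.stop()
  }
}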
Example 28
Source File: SaveIntoDataSourceCommand.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.CreatableRelationProvider


case class SaveIntoDataSourceCommand(
    query: LogicalPlan,
    dataSource: CreatableRelationProvider,
    options: Map[String, String],
    mode: SaveMode) extends RunnableCommand {

  override protected def innerChildren: Seq[QueryPlan[_]] = Seq(query)

  override def run(sparkSession: SparkSession): Seq[Row] = {
    dataSource.createRelation(
      sparkSession.sqlContext, mode, options, Dataset.ofRows(sparkSession, query))

    Seq.empty[Row]
  }

  override def simpleString: String = {
    val redacted = SQLConf.get.redactOptions(options)
    s"SaveIntoDataSourceCommand ${dataSource}, ${redacted}, ${mode}"
  }
} 
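SaveIntoDataSourceCommand simply forwards the mode and options to whichever CreatableRelationProvider the format resolves to. A minimal, hypothetical provider (package and class names invented for illustration) that such a command could dispatch to:

package example.sink

import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider}
import org.apache.spark.sql.types.StructType

// Reached via df.write.format("example.sink").mode(...).save(); Spark resolves the
// name given to format() by appending ".DefaultSource".
class DefaultSource extends CreatableRelationProvider {
  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {
    // A real sink would branch on `mode` (Append / Overwrite / ErrorIfExists / Ignore).
    println(s"save requested: mode=$mode, options=$parameters, schema=${data.schema.simpleString}")
    new BaseRelation {
      override def sqlContext: SQLContext = data.sqlContext
      override def schema: StructType = data.schema
    }
  }
}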
Example 29
Source File: SaveIntoDataSourceCommandSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources

import org.apache.spark.SparkConf
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.test.SharedSQLContext

class SaveIntoDataSourceCommandSuite extends SharedSQLContext {

  test("simpleString is redacted") {
    val URL = "connection.url"
    val PASS = "mypassword"
    val DRIVER = "mydriver"

    val dataSource = DataSource(
      sparkSession = spark,
      className = "jdbc",
      partitionColumns = Nil,
      options = Map("password" -> PASS, "url" -> URL, "driver" -> DRIVER))

    val logicalPlanString = dataSource
      .planForWriting(SaveMode.ErrorIfExists, spark.range(1).logicalPlan)
      .treeString(true)

    assert(!logicalPlanString.contains(URL))
    assert(!logicalPlanString.contains(PASS))
    assert(logicalPlanString.contains(DRIVER))
  }
} 
Example 30
Source File: DefaultSource.scala    From spark-power-bi   with Apache License 2.0 5 votes vote down vote up
package com.granturing.spark.powerbi

import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider}
import scala.concurrent._
import scala.concurrent.ExecutionContext.Implicits._
import scala.concurrent.duration.Duration

class DefaultSource extends CreatableRelationProvider with PowerBISink {

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {

    val conf = ClientConf.fromSparkConf(sqlContext.sparkContext.getConf)
    implicit val client = new Client(conf)

    val dataset = parameters.getOrElse("dataset", sys.error("'dataset' must be specified"))
    val table = parameters.getOrElse("table", sys.error("'table' must be specified"))
    val batchSize = parameters.getOrElse("batchSize", conf.batchSize.toString).toInt
    val group = parameters.get("group")

    val step = for {
      groupId <- getGroupId(group)
      ds <- getOrCreateDataset(mode, groupId, dataset, table, data.schema)
    } yield (groupId, ds)

    val result = step map { case (groupId, ds) =>
      val fields = data.schema.fieldNames.zipWithIndex
      val _conf = conf
      val _token = Some(client.currentToken)
      val _table = table
      val _batchSize = batchSize

      val coalesced = data.rdd.partitions.size > _conf.maxPartitions match {
        case true => data.coalesce(_conf.maxPartitions)
        case false => data
      }

      coalesced foreachPartition { p =>
        val rows = p map { r =>
          fields map { case(name, index) => (name -> r(index)) } toMap
        } toSeq

        val _client = new Client(_conf, _token)

        val submit = rows.
          sliding(_batchSize, _batchSize).
          foldLeft(future()) { (fAccum, batch) =>
          fAccum flatMap { _ => _client.addRows(ds.id, _table, batch, groupId) } }

        submit.onComplete { _ => _client.shutdown() }

        Await.result(submit, _conf.timeout)
      }
    }

    result.onComplete { _ => client.shutdown() }

    Await.result(result, Duration.Inf)

    new BaseRelation {
      val sqlContext = data.sqlContext

      val schema = data.schema
    }
  }

} 
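Presumably this sink is driven through the DataFrame writer with the options it checks above; the dataset, table and batch size values below are placeholders.

import org.apache.spark.sql.{SaveMode, SparkSession}

object PowerBISinkSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("PowerBISinkSketch").getOrCreate()
    val df = spark.range(0, 10).toDF("value")

    df.write
      .format("com.granturing.spark.powerbi") // package of the DefaultSource above
      .option("dataset", "MyDataset")         // required by the sink
      .option("table", "MyTable")             // required by the sink
      .option("batchSize", "1000")            // optional, defaults to the client config
      .mode(SaveMode.Append)                  // passed through to getOrCreateDataset
      .save()

    spark.stop()
  }
}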
Example 31
Source File: WriteRead.scala    From SparkBuildExamples   with Apache License 2.0 5 votes vote down vote up
package com.datastax.spark.example

import com.datastax.spark.connector._
import com.datastax.spark.connector.cql.CassandraConnector
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.cassandra._

// For DSE it is not necessary to set connection parameters for spark.master (since it will be done
// automatically)
object WriteRead extends App {

  val spark = SparkSession.builder
    .appName("Datastax Scala example")
    .enableHiveSupport()
    .getOrCreate()

  import spark.implicits._

  // Create keyspace and table
  CassandraConnector(spark.sparkContext).withSessionDo { session =>
    session.execute(
      """CREATE KEYSPACE IF NOT EXISTS ks WITH
        | replication = {'class': 'SimpleStrategy', 'replication_factor': 1 }""".stripMargin)
    session.execute("""CREATE TABLE IF NOT EXISTS ks.kv (k int, v int, PRIMARY KEY (k))""")
  }

  // Write some data
  spark.range(1, 10)
    .map(x => (x, x))
    .rdd
    .saveToCassandra("ks", "kv")

  // Read data as RDD
  val rdd = spark.sparkContext
    .cassandraTable(keyspace = "ks", table = "kv")

  // Read data as DataSet (DataFrame)
  val dataset = spark.read
    .cassandraFormat(keyspace = "ks", table = "kv")
    .load()

  println("Data read as RDD")
  rdd.collect()
    .foreach(println)

  println("Data read as DataSet (DataFrame)")
  dataset.collect()
    .foreach(println)

  spark.stop()
  sys.exit(0)
} 
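The example above writes through the RDD API, so the imported SaveMode is never used. A hedged DataFrame variant against the same ks.kv table, assuming the connector's org.apache.spark.sql.cassandra format:

import org.apache.spark.sql.{SaveMode, SparkSession}

object WriteReadDataFrameSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("WriteReadDataFrameSketch")
      .getOrCreate()
    import spark.implicits._

    val df = spark.range(1, 10).map(x => (x.toInt, x.toInt)).toDF("k", "v")

    // Append into the ks.kv table created by the example above.
    df.write
      .format("org.apache.spark.sql.cassandra")
      .options(Map("keyspace" -> "ks", "table" -> "kv"))
      .mode(SaveMode.Append)
      .save()

    spark.stop()
  }
}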
Example 37
Source File: T8-5-L8-30-34DataFrameExamplesActions.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apress.prospark.CdrDataframeExamplesActionsApp.Cdr
import org.json4s.DefaultFormats

object CdrDataframeExamplesActionsApp {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int,
    smsInActivity: Float, smsOutActivity: Float, callInActivity: Float,
    callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: CdrDataframeExamplesActionsApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val cl = Thread.currentThread().getContextClassLoader()
    val hiveC = new HiveContext(ssc.sparkContext)
    Thread.currentThread().setContextClassLoader(cl)
    import hiveC.implicits._
    implicit val formats = DefaultFormats

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD(rdd => {
        val cdrs = seqToCdr(rdd).toDF()

        val counts = cdrs.groupBy("countryCode").count().orderBy(desc("count"))
        counts.show(5)
        counts.show()
        println("head(5): " + counts.head(5))
        println("take(5): " + counts.take(5))
        println("head(): " + counts.head())
        println("first(5): " + counts.first())
        println("count(): " + counts.count())
        println("collect(): " + counts.collect())
        println("collectAsList(): " + counts.collectAsList())
        println("describe(): " + cdrs.describe("smsInActivity", "smsOutActivity", "callInActivity", "callOutActivity", "internetTrafficActivity").show())
        counts.write.format("parquet").save("/tmp/parquent" + rdd.id)
        counts.write.format("json").save("/tmp/json" + rdd.id)
        counts.write.parquet("/tmp/parquent2" + rdd.id)
        counts.write.json("/tmp/json2" + rdd.id)
        counts.write.saveAsTable("count_table")
        cdrs.groupBy("countryCode").count().orderBy(desc("count")).write.mode(SaveMode.Append).save("/tmp/counts")
        val prop: java.util.Properties = new java.util.Properties()
        counts.write.jdbc("jdbc:mysql://hostname:port/cdrsdb", "count_table", prop)
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
} 
Example 38
Source File: DefaultSource.scala    From spark-excel   with Apache License 2.0 5 votes vote down vote up
package com.crealytics.spark.excel

import com.crealytics.spark.excel.Utils._
import org.apache.hadoop.fs.Path
import org.apache.poi.ss.util.{CellRangeAddress, CellReference}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

import scala.util.Try

class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider {

  
  override def createRelation(
    sqlContext: SQLContext,
    parameters: Map[String, String],
    schema: StructType
  ): ExcelRelation = {
    val wbReader = WorkbookReader(parameters, sqlContext.sparkContext.hadoopConfiguration)
    val dataLocator = DataLocator(parameters)
    ExcelRelation(
      header = checkParameter(parameters, "header").toBoolean,
      treatEmptyValuesAsNulls = parameters.get("treatEmptyValuesAsNulls").fold(false)(_.toBoolean),
      userSchema = Option(schema),
      inferSheetSchema = parameters.get("inferSchema").fold(false)(_.toBoolean),
      addColorColumns = parameters.get("addColorColumns").fold(false)(_.toBoolean),
      timestampFormat = parameters.get("timestampFormat"),
      excerptSize = parameters.get("excerptSize").fold(10)(_.toInt),
      dataLocator = dataLocator,
      workbookReader = wbReader
    )(sqlContext)
  }

  override def createRelation(
    sqlContext: SQLContext,
    mode: SaveMode,
    parameters: Map[String, String],
    data: DataFrame
  ): BaseRelation = {
    val path = checkParameter(parameters, "path")
    val header = checkParameter(parameters, "header").toBoolean
    val filesystemPath = new Path(path)
    val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
    new ExcelFileSaver(
      fs,
      filesystemPath,
      data,
      saveMode = mode,
      header = header,
      dataLocator = DataLocator(parameters)
    ).save()

    createRelation(sqlContext, parameters, data.schema)
  }

  // Forces a Parameter to exist, otherwise an exception is thrown.
  private def checkParameter(map: Map[String, String], param: String): String = {
    if (!map.contains(param)) {
      throw new IllegalArgumentException(s"Parameter ${'"'}$param${'"'} is missing in options.")
    } else {
      map.apply(param)
    }
  }
} 
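A brief usage sketch for this relation provider; the output path is a placeholder and the format name is assumed to be the package shown above.

import org.apache.spark.sql.{SaveMode, SparkSession}

object ExcelWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("ExcelWriteSketch").getOrCreate()
    val df = spark.range(0, 5).toDF("id")

    df.write
      .format("com.crealytics.spark.excel")
      .option("path", "/tmp/report.xlsx") // required, validated by checkParameter
      .option("header", "true")           // required, validated by checkParameter
      .mode(SaveMode.Overwrite)           // handled by ExcelFileSaver (next example)
      .save()

    spark.stop()
  }
}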
Example 39
Source File: ExcelFileSaver.scala    From spark-excel   with Apache License 2.0 5 votes vote down vote up
package com.crealytics.spark.excel

import java.io.BufferedOutputStream

import com.crealytics.spark.excel.ExcelFileSaver.{DEFAULT_DATE_FORMAT, DEFAULT_SHEET_NAME, DEFAULT_TIMESTAMP_FORMAT}
import com.norbitltd.spoiwo.model._
import com.norbitltd.spoiwo.natures.xlsx.Model2XlsxConversions._
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.poi.ss.util.CellRangeAddress
import org.apache.poi.xssf.usermodel.XSSFWorkbook
import org.apache.spark.sql.{DataFrame, SaveMode}

import scala.collection.JavaConverters._

object ExcelFileSaver {
  final val DEFAULT_SHEET_NAME = "Sheet1"
  final val DEFAULT_DATE_FORMAT = "yy-m-d h:mm"
  final val DEFAULT_TIMESTAMP_FORMAT = "yyyy-mm-dd hh:mm:ss.000"
}

class ExcelFileSaver(
  fs: FileSystem,
  location: Path,
  dataFrame: DataFrame,
  saveMode: SaveMode,
  dataLocator: DataLocator,
  header: Boolean = true
) {
  def save(): Unit = {
    def sheet(workbook: XSSFWorkbook) = {
      val headerRow = if (header) Some(dataFrame.schema.fields.map(_.name).toSeq) else None
      val dataRows = dataFrame
        .toLocalIterator()
        .asScala
        .map(_.toSeq)
      dataLocator.toSheet(headerRow, dataRows, workbook)
    }
    val fileAlreadyExists = fs.exists(location)
    def writeToWorkbook(workbook: XSSFWorkbook): Unit = {
      Workbook(sheet(workbook)).writeToExisting(workbook)
      autoClose(new BufferedOutputStream(fs.create(location)))(workbook.write)
    }
    (fileAlreadyExists, saveMode) match {
      case (false, _) | (_, SaveMode.Overwrite) =>
        if (fileAlreadyExists) {
          fs.delete(location, true)
        }
        writeToWorkbook(new XSSFWorkbook())
      case (true, SaveMode.ErrorIfExists) =>
        sys.error(s"path $location already exists.")
      case (true, SaveMode.Ignore) => ()
      case (true, SaveMode.Append) =>
        val inputStream: FSDataInputStream = fs.open(location)
        val workbook = new XSSFWorkbook(inputStream)
        inputStream.close()
        writeToWorkbook(workbook)
    }
  }

  def autoClose[A <: AutoCloseable, B](closeable: A)(fun: (A) => B): B = {
    try {
      fun(closeable)
    } finally {
      closeable.close()
    }
  }
} 
Example 40
Source File: Output.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta.sdk.pipeline.output

import java.io.{Serializable => JSerializable}

import akka.event.slf4j.SLF4JLogging
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import com.stratio.sparta.sdk.properties.{CustomProperties, Parameterizable}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, DataFrameWriter, Row, SaveMode}

abstract class Output(val name: String, properties: Map[String, JSerializable])
  extends Parameterizable(properties) with SLF4JLogging with CustomProperties {

  val customKey = "saveOptions"
  val customPropertyKey = "saveOptionsKey"
  val customPropertyValue = "saveOptionsValue"
  val propertiesWithCustom = properties ++ getCustomProperties

  def setUp(options: Map[String, String] = Map.empty[String, String]): Unit = {}

  def cleanUp(options: Map[String, String] = Map.empty[String, String]): Unit = {}

  def save(dataFrame: DataFrame, saveMode: SaveModeEnum.Value, options: Map[String, String]): Unit

  def supportedSaveModes: Seq[SaveModeEnum.Value] = SaveModeEnum.allSaveModes

  def validateSaveMode(saveMode: SaveModeEnum.Value): Unit = {
    if (!supportedSaveModes.contains(saveMode))
      log.info(s"Save mode $saveMode selected not supported by the output $name." +
        s" Using the default mode ${SaveModeEnum.Append}"
      )
  }
}

object Output extends SLF4JLogging {

  final val ClassSuffix = "Output"
  final val SparkConfigurationMethod = "getSparkConfiguration"
  final val Separator = "_"
  final val FieldsSeparator = ","
  final val PrimaryKey = "primaryKey"
  final val TableNameKey = "tableName"
  final val PartitionByKey = "partitionBy"
  final val TimeDimensionKey = "timeDimension"
  final val MeasureMetadataKey = "measure"
  final val PrimaryKeyMetadataKey = "pk"

  def getSparkSaveMode(saveModeEnum: SaveModeEnum.Value): SaveMode =
    saveModeEnum match {
      case SaveModeEnum.Append => SaveMode.Append
      case SaveModeEnum.ErrorIfExists => SaveMode.ErrorIfExists
      case SaveModeEnum.Overwrite => SaveMode.Overwrite
      case SaveModeEnum.Ignore => SaveMode.Ignore
      case SaveModeEnum.Upsert => SaveMode.Append
      case _ =>
        log.warn(s"Save Mode $saveModeEnum not supported, using default save mode ${SaveModeEnum.Append}")
        SaveMode.Append
    }

  def getTimeFromOptions(options: Map[String, String]): Option[String] = options.get(TimeDimensionKey).notBlank

  def getPrimaryKeyOptions(options: Map[String, String]): Option[String] = options.get(PrimaryKey).notBlank

  def getTableNameFromOptions(options: Map[String, String]): String =
    options.getOrElse(TableNameKey, {
      log.error("Table name not defined")
      throw new NoSuchElementException("tableName not found in options")
    })

  def applyPartitionBy(options: Map[String, String],
                       dataFrame: DataFrameWriter[Row],
                       schemaFields: Array[StructField]): DataFrameWriter[Row] = {

    options.get(PartitionByKey).notBlank.fold(dataFrame)(partitions => {
      val fieldsInDataFrame = schemaFields.map(field => field.name)
      val partitionFields = partitions.split(",")
      if (partitionFields.forall(field => fieldsInDataFrame.contains(field)))
        dataFrame.partitionBy(partitionFields: _*)
      else {
        log.warn(s"Impossible to execute partition by fields: $partitionFields because the dataFrame not contain all" +
          s" fields. The dataFrame only contains: ${fieldsInDataFrame.mkString(",")}")
        dataFrame
      }
    })
  }

  def defaultTimeStampField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField =
    StructField(fieldName, TimestampType, nullable, metadata)

  def defaultDateField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField =
    StructField(fieldName, DateType, nullable, metadata)

  def defaultStringField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField =
    StructField(fieldName, StringType, nullable, metadata)

  def defaultGeoField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField =
    StructField(fieldName, ArrayType(DoubleType), nullable, metadata)

  def defaultLongField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField =
    StructField(fieldName, LongType, nullable, metadata)
} 
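A minimal, hypothetical Output subclass showing how validateSaveMode and getSparkSaveMode above are typically combined when delegating to the DataFrame writer; it is assumed to live in the same package as the classes above, and the destination path is illustrative.

package com.stratio.sparta.sdk.pipeline.output

import java.io.{Serializable => JSerializable}

import org.apache.spark.sql.DataFrame

// Hypothetical output that writes parquet files using the helpers above.
class ParquetFileOutput(outputName: String, properties: Map[String, JSerializable])
  extends Output(outputName, properties) {

  override def save(dataFrame: DataFrame,
                    saveMode: SaveModeEnum.Value,
                    options: Map[String, String]): Unit = {
    validateSaveMode(saveMode)
    val tableName = Output.getTableNameFromOptions(options)
    dataFrame.write
      .mode(Output.getSparkSaveMode(saveMode))
      .save(s"/tmp/$name/$tableName") // illustrative destination
  }
}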
Example 41
Source File: CreateDataSourceTableAsSelectHarvesterSuite.scala    From spark-atlas-connector   with Apache License 2.0 5 votes vote down vote up
package com.hortonworks.spark.atlas.sql

import com.hortonworks.spark.atlas.types.metadata

import scala.util.Random
import com.hortonworks.spark.atlas.{SACAtlasEntityWithDependencies, WithHiveSupport}
import com.hortonworks.spark.atlas.utils.SparkUtils
import org.apache.atlas.model.instance.AtlasEntity
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType}
import org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.types.StructType
import org.scalatest.{FunSuite, Matchers}

// This is not leveraging BaseHarvesterSuite, as it doesn't need to be tested with
// both non-remote HMS and remote HMS cases.
class CreateDataSourceTableAsSelectHarvesterSuite
    extends FunSuite with Matchers with WithHiveSupport {

  private val sourceTblName = "source_" + Random.nextInt(100000)

  override protected def beforeAll(): Unit = {
    super.beforeAll()

    sparkSession.sql(s"CREATE TABLE $sourceTblName (name string, age int)")
  }

  test("saveAsTable should have output entity having table details - parquet") {
    testWithProvider("parquet")
  }

  test("saveAsTable should have output entity having table details - hive") {
    val entity = testWithProvider("hive")
    assert(entity.getAttribute("partitionProvider") == "Catalog")
  }

  def testWithProvider(provider: String): AtlasEntity = {
    val destTblName = "dest1_" + Random.nextInt(100000)
    val df = sparkSession.sql(s"SELECT * FROM $sourceTblName")

    // The code below mirrors the DataFrameWriter.saveAsTable code path as of Spark 2.4.
    // It uses internal APIs for this test. If the compatibility is broken, we should
    // just remove this test.
    val tableIdent = df.sparkSession.sessionState.sqlParser.parseTableIdentifier(destTblName)
    val storage = DataSource.buildStorageFormatFromOptions(Map("path" -> "/tmp/foo"))
    val tableDesc = CatalogTable(
      identifier = tableIdent,
      tableType = CatalogTableType.EXTERNAL,
      storage = storage,
      schema = new StructType,
      provider = Some(provider),
      partitionColumnNames = Nil,
      bucketSpec = None)
    val cmd = CreateDataSourceTableAsSelectCommand(
      tableDesc,
      SaveMode.ErrorIfExists,
      df.queryExecution.logical,
      Seq("name", "age"))
    val newTable = tableDesc.copy(
      storage = tableDesc.storage.copy(),
      schema = df.schema)
    sparkSession.sessionState.catalog.createTable(
      newTable, ignoreIfExists = false, validateLocation = false)

    val qd = QueryDetail(df.queryExecution, 0L)
    val entities = CommandsHarvester.CreateDataSourceTableAsSelectHarvester.harvest(cmd, qd)
    val processDeps = entities.head.asInstanceOf[SACAtlasEntityWithDependencies].dependencies
    val maybeEntity = processDeps.find(_.typeName == metadata.TABLE_TYPE_STRING)
      .map(_.asInstanceOf[SACAtlasEntityWithDependencies].entity)

    assert(maybeEntity.isDefined, s"Output entity for table [$destTblName] was not found.")
    assert(maybeEntity.get.getAttribute("name") == destTblName)
    assert(maybeEntity.get.getAttribute("owner") == SparkUtils.currUser())
    assert(maybeEntity.get.getAttribute("schemaDesc") == "struct<name:string,age:int>")
    assert(maybeEntity.get.getAttribute("provider") == provider)
    maybeEntity.get
  }
} 
Example 42
Source File: LibSVMRelationSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.source.libsvm

import java.io.File
import java.nio.charset.StandardCharsets

import com.google.common.io.Files

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.util.Utils


class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  // Path for dataset
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data")
    val file = new File(dir, "part-00000")
    Files.write(lines, file, StandardCharsets.UTF_8)
    path = dir.toURI.toString
  }

  override def afterAll(): Unit = {
    try {
      Utils.deleteRecursively(new File(path))
    } finally {
      super.afterAll()
    }
  }

  test("select as sparse vector") {
    val df = spark.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = spark.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data and read it again") {
    val df = spark.read.format("libsvm").load(path)
    val tempDir2 = new File(tempDir, "read_write_test")
    val writepath = tempDir2.toURI.toString
    // TODO: Remove requirement to coalesce by supporting multiple reads.
    df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath)

    val df2 = spark.read.format("libsvm").load(writepath)
    val row1 = df2.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data failed due to invalid schema") {
    val df = spark.read.format("text").load(path)
    intercept[SparkException] {
      df.write.format("libsvm").save(path + "_2")
    }
  }

  test("select features from libsvm relation") {
    val df = spark.read.format("libsvm").load(path)
    df.select("features").rdd.map { case Row(d: Vector) => d }.first
    df.select("features").collect
  }
} 
Example 43
Source File: JdbcRelationProvider.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val url = jdbcOptions.url
    val table = jdbcOptions.table
    val createTableOptions = jdbcOptions.createTableOptions
    val isTruncate = jdbcOptions.isTruncate

    val conn = JdbcUtils.createConnectionFactory(jdbcOptions)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, url, table)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (isTruncate && isCascadingTruncateTable(url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, table)
              saveTable(df, url, table, jdbcOptions)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, table)
              createTable(df.schema, url, table, createTableOptions, conn)
              saveTable(df, url, table, jdbcOptions)
            }

          case SaveMode.Append =>
            saveTable(df, url, table, jdbcOptions)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(df.schema, url, table, createTableOptions, conn)
        saveTable(df, url, table, jdbcOptions)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
} 
Example 44
Source File: DoubleDataTypeTestCase.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.integration.spark.testsuite.primitiveTypes

import java.util.Random

import org.apache.spark.sql.test.util.QueryTest
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SaveMode}
import org.scalatest.BeforeAndAfterAll


class DoubleDataTypeTestCase extends QueryTest with BeforeAndAfterAll {

  lazy val df: DataFrame = generateDataFrame

  private def generateDataFrame(): DataFrame = {
    val r = new Random()
    val rdd = sqlContext.sparkContext
      .parallelize(1 to 10, 2)
      .map { x =>
        Row(x, "London" + (x % 2), x.toDouble / 13, x.toDouble / 11)
      }

    val schema = StructType(
      Seq(
        StructField("id", IntegerType, nullable = false),
        StructField("city", StringType, nullable = false),
        StructField("m1", DoubleType, nullable = false),
        StructField("m2", DoubleType, nullable = false)
      )
    )

    sqlContext.createDataFrame(rdd, schema)
  }

  override def beforeAll {
    sql("drop table if exists uniq_carbon")
    sql("drop table if exists uniq_hive")
    sql("drop table if exists doubleTypeCarbonTable")
    sql("drop table if exists doubleTypeHiveTable")

    df.write
      .format("carbondata")
      .option("tableName", "doubleTypeCarbonTable")
      .option("tempCSV", "false")
      .option("table_blocksize", "32")
      .mode(SaveMode.Overwrite)
      .save()

    df.write
      .mode(SaveMode.Overwrite)
      .saveAsTable("doubleTypeHiveTable")

  }

  test("detail query") {
    checkAnswer(sql("select * from doubleTypeCarbonTable order by id"),
      sql("select * from doubleTypeHiveTable order by id"))

  }

  test("duplicate values") {
    sql("create table uniq_carbon(name string, double_column double) STORED AS carbondata ")
    sql(s"load data inpath '$resourcesPath/uniq.csv' into table uniq_carbon")
    sql("create table uniq_hive(name string, double_column double) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','")
    sql(s"load data local inpath '$resourcesPath/uniqwithoutheader.csv' into table uniq_hive")
    checkAnswer(sql("select * from uniq_carbon where double_column>=11"),
      sql("select * from uniq_hive where double_column>=11"))
  }

//  test("agg query") {
//    checkAnswer(sql("select city, sum(m1), avg(m1), count(m1), max(m1), min(m1) from doubleTypeCarbonTable group by city"),
//      sql("select city, sum(m1), avg(m1), count(m1), max(m1), min(m1) from doubleTypeHiveTable group by city"))
//
//    checkAnswer(sql("select city, sum(m2), avg(m2), count(m2), max(m2), min(m2) from doubleTypeCarbonTable group by city"),
//      sql("select city, sum(m2), avg(m2), count(m2), max(m2), min(m2) from doubleTypeHiveTable group by city"))
//  }

  override def afterAll {
    sql("drop table if exists uniq_carbon")
    sql("drop table if exists uniq_hive")
    sql("drop table if exists doubleTypeCarbonTable")
    sql("drop table if exists doubleTypeHiveTable")
  }
} 
Example 45
Source File: UpdateCarbonTableTestCaseWithBadRecord.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.spark.testsuite.iud

import org.apache.spark.sql.{Row, SaveMode}
import org.scalatest.BeforeAndAfterAll
import org.apache.carbondata.common.constants.LoggerAction
import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.spark.sql.test.util.QueryTest

class UpdateCarbonTableTestCaseWithBadRecord extends QueryTest with BeforeAndAfterAll {
  override def beforeAll {

    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION , LoggerAction.FORCE.name())
  }


  test("test update operation with Badrecords action as force.") {
    sql("""drop table if exists badtable""").show
    sql("""create table badtable (c1 string,c2 int,c3 string,c5 string) STORED AS carbondata""")
    sql(s"""LOAD DATA LOCAL INPATH '$resourcesPath/IUD/badrecord.csv' INTO table badtable""")
    sql("""update badtable d  set (d.c2) = (d.c2 / 1)""").show()
    checkAnswer(
      sql("""select c1,c2,c3,c5 from badtable"""),
      Seq(Row("ravi",null,"kiran","huawei"),Row("manohar",null,"vanam","huawei"))
    )
    sql("""drop table badtable""").show


  }
  test("test update operation with Badrecords action as FAIL.") {
    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION , LoggerAction.FAIL.name())
    sql("""drop table if exists badtable""").show
    sql("""create table badtable (c1 string,c2 int,c3 string,c5 string) STORED AS carbondata""")
    sql(s"""LOAD DATA LOCAL INPATH '$resourcesPath/IUD/badrecord.csv' INTO table badtable""")
    val exec = intercept[Exception] {
      sql("""update badtable d  set (d.c2) = (d.c2 / 1)""").show()
    }
    checkAnswer(
      sql("""select c1,c2,c3,c5 from badtable"""),
      Seq(Row("ravi",2,"kiran","huawei"),Row("manohar",4,"vanam","huawei"))
    )
    sql("""drop table badtable""").show

  }

  override def afterAll {
    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION , LoggerAction.FORCE.name())
  }
} 
Example 46
Source File: TestUpdateAndDeleteWithLargeData.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.spark.testsuite.iud

import java.text.SimpleDateFormat

import org.apache.spark.sql.test.util.QueryTest
import org.apache.spark.sql.{DataFrame, Row, SaveMode}
import org.scalatest.BeforeAndAfterAll

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.util.CarbonProperties

class TestUpdateAndDeleteWithLargeData extends QueryTest with BeforeAndAfterAll {
  var df: DataFrame = _

  override def beforeAll {
    dropTable()
    buildTestData()
  }

  private def buildTestData(): Unit = {

    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy-MM-dd")

    // Simulate data and write to table orders
    import sqlContext.implicits._

    val sdf = new SimpleDateFormat("yyyy-MM-dd")
    df = sqlContext.sparkSession.sparkContext.parallelize(1 to 1500000)
      .map(value => (value, new java.sql.Date(sdf.parse("2015-07-" + (value % 10 + 10)).getTime),
        "china", "aaa" + value, "phone" + 555 * value, "ASD" + (60000 + value), 14999 + value,
        "ordersTable" + value))
      .toDF("o_id", "o_date", "o_country", "o_name",
        "o_phonetype", "o_serialname", "o_salary", "o_comment")
    createTable()

  }

  private def createTable(): Unit = {
    df.write
      .format("carbondata")
      .option("tableName", "orders")
      .option("tempCSV", "true")
      .option("compress", "true")
      .mode(SaveMode.Overwrite)
      .save()
  }

  private def dropTable() = {
    sql("DROP TABLE IF EXISTS orders")

  }

  test("test the update and delete delete functionality for large data") {

    sql(
      """
            update ORDERS set (o_comment) = ('yyy')""").show()
    checkAnswer(sql(
      """select o_comment from orders limit 2 """), Seq(Row("yyy"), Row("yyy")))

    sql("delete from orders where exists (select 1 from orders)")

    checkAnswer(sql(
      """
           SELECT count(*) FROM orders
           """), Row(0))
  }

} 
Example 47
Source File: CaseClassDataFrameAPIExample.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.examples

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

import org.apache.carbondata.examples.util.ExampleUtils

case class People(name: String, occupation: String, id: Int)

object CaseClassDataFrameAPIExample {

  def main(args: Array[String]) {
    val spark = ExampleUtils.createSparkSession("CaseClassDataFrameAPIExample")
    exampleBody(spark)
    spark.close()
  }

  def exampleBody(spark : SparkSession): Unit = {
    val people = List(People("sangeeta", "engineer", 1), People("pallavi", "consultant", 2))
    val peopleRDD: RDD[People] = spark.sparkContext.parallelize(people)
    import spark.implicits._
    val peopleDF: DataFrame = peopleRDD.toDF("name", "occupation", "id")

    // writing data to carbon table
    peopleDF.write
      .format("carbondata")
      .option("tableName", "caseclass_table")
      .option("compress", "true")
      .mode(SaveMode.Overwrite)
      .save()

    spark.sql("SELECT * FROM caseclass_table").show()

    spark.sql("DROP TABLE IF EXISTS caseclass_table")
  }
} 
Example 48
Source File: ExternalTableExample.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.examples

import java.io.File

import org.apache.spark.sql.{CarbonEnv, SaveMode, SparkSession}

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.metadata.CarbonTableIdentifier
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.examples.util.ExampleUtils



object ExternalTableExample {

  def main(args: Array[String]) {
    val spark = ExampleUtils.createSparkSession("ExternalTableExample")
    exampleBody(spark)
    spark.close()
  }

  def exampleBody(spark : SparkSession): Unit = {

    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy/MM/dd")

    // Create origin_table
    spark.sql("DROP TABLE IF EXISTS origin_table")
    spark.sql(
      s"""
         | CREATE TABLE origin_table(
         |   shortField SHORT,
         |   intField INT,
         |   bigintField LONG,
         |   doubleField DOUBLE,
         |   stringField STRING,
         |   timestampField TIMESTAMP,
         |   decimalField DECIMAL(18,2),
         |   dateField DATE,
         |   charField CHAR(5),
         |   floatField FLOAT
         | )
         | STORED AS carbondata
       """.stripMargin)

    val rootPath = new File(this.getClass.getResource("/").getPath
                            + "../../../..").getCanonicalPath
    val path = s"$rootPath/examples/spark/src/main/resources/data.csv"

    // load 4 times, each load has 10 rows data
    // scalastyle:off
    (1 to 4).foreach(_ => spark.sql(
      s"""
         | LOAD DATA LOCAL INPATH '$path'
         | INTO TABLE origin_table
         | OPTIONS('HEADER'='true', 'COMPLEX_DELIMITER_LEVEL_1'='#')
       """.stripMargin))
    // scalastyle:on

    // 40 rows
    spark.sql("SELECT count(*) FROM origin_table").show()

    val origin_table_path = CarbonEnv.getTablePath(Some("default"), "origin_table")(spark)

    // Create external_table
    spark.sql("DROP TABLE IF EXISTS external_table")
    spark.sql("CREATE EXTERNAL TABLE external_table STORED AS carbondata" +
              s" LOCATION '$origin_table_path'")
    spark.sql("SELECT count(*) FROM external_table").show()

    // Load 2 times again
    (1 to 2).foreach(_ => spark.sql(
      s"""
         | LOAD DATA LOCAL INPATH '$path'
         | INTO TABLE origin_table
         | OPTIONS('HEADER'='true', 'COMPLEX_DELIMITER_LEVEL_1'='#')
       """.stripMargin))

    spark.sql("SELECT count(*) FROM external_table").show()

    // Drop tables
    spark.sql("DROP TABLE IF EXISTS origin_table")
    spark.sql("DROP TABLE IF EXISTS external_table")
  }
} 
Example 49
Source File: HadoopFileExample.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.examples

import java.io.File

import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.{SaveMode, SparkSession}

import org.apache.carbondata.examples.util.ExampleUtils
import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat}
import org.apache.carbondata.hadoop.CarbonProjection

// scalastyle:off println
object HadoopFileExample {

  def main(args: Array[String]): Unit = {
    val spark = ExampleUtils.createSparkSession("HadoopFileExample")
    val rootPath = new File(this.getClass.getResource("/").getPath
                            + "../../../..").getCanonicalPath
    val storeLocation: String = rootPath + "/examples/spark/target/store/default"
    exampleBody(spark, storeLocation)
    spark.close()
  }

  def exampleBody(spark : SparkSession, storeLocation : String): Unit = {

    import spark.implicits._
    val df = spark.sparkContext.parallelize(1 to 1000)
      .map(x => ("a", "b", x))
      .toDF("c1", "c2", "c3")

    df.write.format("carbondata")
      .option("tableName", "Hadoopfile_table")
      .option("compress", "true")
      .mode(SaveMode.Overwrite).save()

    // read two columns
    val projection = new CarbonProjection
    projection.addColumn("c1")  // column c1
    projection.addColumn("c3")  // column c3
    val conf = new Configuration()

    CarbonInputFormat.setColumnProjection(conf, projection)
    CarbonInputFormat.setDatabaseName(conf, "default")
    CarbonInputFormat.setTableName(conf, "Hadoopfile_table")


    val input = spark.sparkContext.newAPIHadoopFile(s"${storeLocation}/Hadoopfile_table",
      classOf[CarbonTableInputFormat[Array[Object]]],
      classOf[Void],
      classOf[Array[Object]],
      conf)
    val result = input.map(x => x._2.toList).collect
    result.foreach(x => println(x.mkString(", ")))

    // delete carbondata file
    ExampleUtils.cleanSampleCarbonFile(spark, "Hadoopfile_table")
  }
}
// scalastyle:on println 
Example 50
Source File: LuceneIndexExample.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.examples

import org.apache.spark.sql.{SaveMode, SparkSession}

import org.apache.carbondata.examples.util.ExampleUtils




object LuceneIndexExample {

  def main(args: Array[String]) {
    val spark = ExampleUtils.createSparkSession("LuceneIndexExample")
    exampleBody(spark)
    spark.close()
  }

  def exampleBody(spark : SparkSession): Unit = {

    // Build the test data; increase the volume for a more obvious comparison.
    // If the data set is larger than 100 MB, it will take 10+ minutes.
    import scala.util.Random

    import spark.implicits._
    val r = new Random()
    val df = spark.sparkContext.parallelize(1 to 10 * 10 * 1000)
      .map(x => ("which test" + r.nextInt(10000) + " good" + r.nextInt(10),
      "who and name" + x % 8, "city" + x % 50, x % 60))
      .toDF("id", "name", "city", "age")

    spark.sql("DROP TABLE IF EXISTS personTable")
    df.write.format("carbondata")
      .option("tableName", "personTable")
      .option("compress", "true")
      .mode(SaveMode.Overwrite).save()

    // create lucene index on personTable
    spark.sql(
      s"""
         | CREATE INDEX IF NOT EXISTS dm ON TABLE personTable (id, name)
         | AS 'lucene'
      """.stripMargin)

    // 1. Compare the performance:

    def time(code: => Unit): Double = {
      val start = System.currentTimeMillis()
      code
      // return time in second
      (System.currentTimeMillis() - start).toDouble / 1000
    }

    val timeWithoutLuceneIndex = time {

      spark.sql(
        s"""
           | SELECT count(*)
           | FROM personTable where id like '% test1 %'
      """.stripMargin).show()

    }

    val timeWithLuceneIndex = time {

      spark.sql(
        s"""
           | SELECT count(*)
           | FROM personTable where TEXT_MATCH('id:test1')
      """.stripMargin).show()

    }

    // scalastyle:off
    println("time for query on table with lucene index table:" + timeWithLuceneIndex.toString)
    println("time for query on table without lucene index table:" + timeWithoutLuceneIndex.toString)
    // scalastyle:on

    // 2. Search for the word "test1" but not "good1" in the id field
    spark.sql(
      s"""
         | SELECT id,name
         | FROM personTable where TEXT_MATCH('id:test1 -id:good1')
      """.stripMargin).show(100)

    // 3. TEXT_MATCH_WITH_LIMIT usage:
    spark.sql(
      s"""
         | SELECT id,name
         | FROM personTable where TEXT_MATCH_WITH_LIMIT('id:test1',10)
      """.stripMargin).show()

    spark.sql("DROP TABLE IF EXISTS personTable")
  }
} 
Example 51
Source File: CarbonDataFrameExample.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.examples

import org.apache.spark.sql.{SaveMode, SparkSession}

import org.apache.carbondata.examples.util.ExampleUtils

object CarbonDataFrameExample {

  def main(args: Array[String]) {
    val spark = ExampleUtils.createSparkSession("CarbonDataFrameExample")
    exampleBody(spark)
    spark.close()
  }

  def exampleBody(spark : SparkSession): Unit = {
    // Writes Dataframe to CarbonData file:
    import spark.implicits._
    val df = spark.sparkContext.parallelize(1 to 100)
      .map(x => ("a" + x % 10, "b", x))
      .toDF("c1", "c2", "number")

    // Saves dataframe to carbondata file
    df.write
      .format("carbondata")
      .option("tableName", "carbon_df_table")
      .option("partitionColumns", "c1")  // a list of column names
      .mode(SaveMode.Overwrite)
      .save()

    spark.sql(""" SELECT * FROM carbon_df_table """).show()

    spark.sql("SHOW PARTITIONS carbon_df_table").show()

    // Specify schema
    import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
    val customSchema = StructType(Array(
      StructField("c1", StringType),
      StructField("c2", StringType),
      StructField("number", IntegerType)))

    // Reads carbondata to dataframe
    val carbondf = spark.read
      .format("carbondata")
      .schema(customSchema)
      // .option("dbname", "db_name") the system will use "default" as dbname if not set this option
      .option("tableName", "carbon_df_table")
      .load()

    // Dataframe operations
    carbondf.printSchema()
    carbondf.select($"c1", $"number" + 10).show()
    carbondf.filter($"number" > 31).show()

    spark.sql("DROP TABLE IF EXISTS carbon_df_table")
  }
} 
Example 52
Source File: SortMergeJoin_7_6.scala    From LearningSparkV2   with Apache License 2.0 5 votes vote down vote up
package main.scala.chapter7

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SaveMode
import scala.util.Random

object SortMergeJoin_7_6 {

  // curried function to benchmark any code or function
  def benchmark(name: String)(f: => Unit) {
    val startTime = System.nanoTime
    f
    val endTime = System.nanoTime
    println(s"Time taken in $name: " + (endTime - startTime).toDouble / 1000000000 + " seconds")
  }

  // main class setting the configs
  def main (args: Array[String] ) {

    val spark = SparkSession.builder
        .appName("SortMergeJoin")
        .config("spark.sql.codegen.wholeStage", true)
        .config("spark.sql.join.preferSortMergeJoin", true)
        .config("spark.sql.autoBroadcastJoinThreshold", -1)
        .config("spark.sql.defaultSizeInBytes", 100000)
        .config("spark.sql.shuffle.partitions", 16)
        .getOrCreate ()

    import spark.implicits._

    var states = scala.collection.mutable.Map[Int, String]()
    var items = scala.collection.mutable.Map[Int, String]()
    val rnd = new scala.util.Random(42)

    // initialize states and items purchased
    states += (0 -> "AZ", 1 -> "CO", 2-> "CA", 3-> "TX", 4 -> "NY", 5-> "MI")
    items += (0 -> "SKU-0", 1 -> "SKU-1", 2-> "SKU-2", 3-> "SKU-3", 4 -> "SKU-4", 5-> "SKU-5")
    // create dataframes
    val usersDF = (0 to 100000).map(id => (id, s"user_${id}", s"user_${id}@databricks.com", states(rnd.nextInt(5))))
          .toDF("uid", "login", "email", "user_state")
    val ordersDF = (0 to 100000).map(r => (r, r, rnd.nextInt(100000), 10 * r* 0.2d, states(rnd.nextInt(5)), items(rnd.nextInt(5))))
          .toDF("transaction_id", "quantity", "users_id", "amount", "state", "items")

    usersDF.show(10)
    ordersDF.show(10)

    // do a Join
    val usersOrdersDF = ordersDF.join(usersDF, $"users_id" === $"uid")
    usersOrdersDF.show(10, false)
    usersOrdersDF.cache()
    usersOrdersDF.explain()
    // usersOrdersDF.explain("formatted")
    // uncomment to view the Spark UI; otherwise the program terminates and shuts down the UI
    // Thread.sleep(200000000)
  }
} 
Example 53
Source File: DefaultSource.scala    From memsql-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark

import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.metrics.source.MetricsHandler
import org.apache.spark.sql.sources.{
  BaseRelation,
  CreatableRelationProvider,
  DataSourceRegister,
  RelationProvider
}
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

object DefaultSource {
  val MEMSQL_SOURCE_NAME          = "com.memsql.spark"
  val MEMSQL_SOURCE_NAME_SHORT    = "memsql"
  val MEMSQL_GLOBAL_OPTION_PREFIX = "spark.datasource.memsql."
}

class DefaultSource
    extends RelationProvider
    with DataSourceRegister
    with CreatableRelationProvider
    with LazyLogging {

  override def shortName: String = DefaultSource.MEMSQL_SOURCE_NAME_SHORT

  private def includeGlobalParams(sqlContext: SQLContext,
                                  params: Map[String, String]): Map[String, String] =
    sqlContext.getAllConfs.foldLeft(params)({
      case (params, (k, v)) if k.startsWith(DefaultSource.MEMSQL_GLOBAL_OPTION_PREFIX) =>
        params + (k.stripPrefix(DefaultSource.MEMSQL_GLOBAL_OPTION_PREFIX) -> v)
      case (params, _) => params
    })

  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String]): BaseRelation = {
    val params  = CaseInsensitiveMap(includeGlobalParams(sqlContext, parameters))
    val options = MemsqlOptions(params)
    if (options.disablePushdown) {
      SQLPushdownRule.ensureRemoved(sqlContext.sparkSession)
      MemsqlReaderNoPushdown(MemsqlOptions.getQuery(params), options, sqlContext)
    } else {
      SQLPushdownRule.ensureInjected(sqlContext.sparkSession)
      MemsqlReader(MemsqlOptions.getQuery(params), Nil, options, sqlContext)
    }
  }

  override def createRelation(sqlContext: SQLContext,
                              mode: SaveMode,
                              parameters: Map[String, String],
                              data: DataFrame): BaseRelation = {
    val opts = CaseInsensitiveMap(includeGlobalParams(sqlContext, parameters))
    val conf = MemsqlOptions(opts)

    val table = MemsqlOptions
      .getTable(opts)
      .getOrElse(
        throw new IllegalArgumentException(
          s"To write a dataframe to MemSQL you must specify a table name via the '${MemsqlOptions.TABLE_NAME}' parameter"
        )
      )
    JdbcHelpers.prepareTableForWrite(conf, table, mode, data.schema)
    val isReferenceTable = JdbcHelpers.isReferenceTable(conf, table)
    val partitionWriterFactory =
      if (conf.onDuplicateKeySQL.isEmpty) {
        new LoadDataWriterFactory(table, conf)
      } else {
        new BatchInsertWriterFactory(table, conf)
      }

    val schema        = data.schema
    var totalRowCount = 0L
    data.foreachPartition(partition => {
      val writer = partitionWriterFactory.createDataWriter(schema,
                                                           TaskContext.getPartitionId(),
                                                           0,
                                                           isReferenceTable,
                                                           mode)
      try {
        partition.foreach(record => {
          writer.write(record)
          totalRowCount += 1
        })
        writer.commit()
        MetricsHandler.setRecordsWritten(totalRowCount)
      } catch {
        case e: Exception => {
          writer.abort()
          throw e
        }
      }
    })

    createRelation(sqlContext, parameters)
  }
} 
Example 54
Source File: BinaryTypeBenchmark.scala    From memsql-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark

import java.sql.{Connection, DriverManager}
import java.util.Properties

import com.github.mrpowers.spark.daria.sql.SparkSessionExt._
import com.memsql.spark.BatchInsertBenchmark.{df, executeQuery}
import org.apache.spark.sql.types.{BinaryType, IntegerType}
import org.apache.spark.sql.{SaveMode, SparkSession}

import scala.util.Random

// BinaryTypeBenchmark is written to profile writing of BinaryType data with a CPU profiler.
// This feature is available in the Ultimate edition of IntelliJ IDEA;
// see https://www.jetbrains.com/help/idea/async-profiler.html#profile for more details
object BinaryTypeBenchmark extends App {
  final val masterHost: String = sys.props.getOrElse("memsql.host", "localhost")
  final val masterPort: String = sys.props.getOrElse("memsql.port", "5506")

  val spark: SparkSession = SparkSession
    .builder()
    .master("local")
    .config("spark.sql.shuffle.partitions", "1")
    .config("spark.driver.bindAddress", "localhost")
    .config("spark.datasource.memsql.ddlEndpoint", s"${masterHost}:${masterPort}")
    .config("spark.datasource.memsql.database", "testdb")
    .getOrCreate()

  def jdbcConnection: Loan[Connection] = {
    val connProperties = new Properties()
    connProperties.put("user", "root")

    Loan(
      DriverManager.getConnection(
        s"jdbc:mysql://$masterHost:$masterPort",
        connProperties
      ))
  }

  def executeQuery(sql: String): Unit = {
    jdbcConnection.to(conn => Loan(conn.createStatement).to(_.execute(sql)))
  }

  executeQuery("set global default_partitions_per_leaf = 2")
  executeQuery("drop database if exists testdb")
  executeQuery("create database testdb")

  def genRandomByte(): Byte = (Random.nextInt(256) - 128).toByte
  def genRandomRow(): Array[Byte] =
    Array.fill(1000)(genRandomByte())

  val df = spark.createDF(
    List.fill(100000)(genRandomRow()).zipWithIndex,
    List(("data", BinaryType, true), ("id", IntegerType, true))
  )

  val start1 = System.nanoTime()
  df.write
    .format("memsql")
    .mode(SaveMode.Overwrite)
    .save("testdb.LoadData")

  println("Elapsed time: " + (System.nanoTime() - start1) + "ns [LoadData CSV]")

  val start2 = System.nanoTime()
  df.write
    .format("memsql")
    .option("tableKey.primary", "id")
    .option("onDuplicateKeySQL", "id = id")
    .mode(SaveMode.Overwrite)
    .save("testdb.BatchInsert")

  println("Elapsed time: " + (System.nanoTime() - start2) + "ns [BatchInsert]")

  val avroStart = System.nanoTime()
  df.write
    .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT)
    .mode(SaveMode.Overwrite)
    .option(MemsqlOptions.LOAD_DATA_FORMAT, "Avro")
    .save("testdb.AvroSerialization")
  println("Elapsed time: " + (System.nanoTime() - avroStart) + "ns [LoadData Avro] ")
} 
Example 55
Source File: LoadDataBenchmark.scala    From memsql-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark

import java.sql.{Connection, Date, DriverManager}
import java.time.{Instant, LocalDate}
import java.util.Properties

import org.apache.spark.sql.types._
import com.github.mrpowers.spark.daria.sql.SparkSessionExt._
import org.apache.spark.sql.{SaveMode, SparkSession}

import scala.util.Random

// LoadDataBenchmark is written to profile LOAD DATA ingestion with a CPU profiler.
// This feature is available in the Ultimate edition of IntelliJ IDEA;
// see https://www.jetbrains.com/help/idea/async-profiler.html#profile for more details
object LoadDataBenchmark extends App {
  final val masterHost: String = sys.props.getOrElse("memsql.host", "localhost")
  final val masterPort: String = sys.props.getOrElse("memsql.port", "5506")

  val spark: SparkSession = SparkSession
    .builder()
    .master("local")
    .config("spark.sql.shuffle.partitions", "1")
    .config("spark.driver.bindAddress", "localhost")
    .config("spark.datasource.memsql.ddlEndpoint", s"${masterHost}:${masterPort}")
    .config("spark.datasource.memsql.database", "testdb")
    .getOrCreate()

  def jdbcConnection: Loan[Connection] = {
    val connProperties = new Properties()
    connProperties.put("user", "root")

    Loan(
      DriverManager.getConnection(
        s"jdbc:mysql://$masterHost:$masterPort",
        connProperties
      ))
  }

  def executeQuery(sql: String): Unit = {
    jdbcConnection.to(conn => Loan(conn.createStatement).to(_.execute(sql)))
  }

  executeQuery("set global default_partitions_per_leaf = 2")
  executeQuery("drop database if exists testdb")
  executeQuery("create database testdb")

  def genRow(): (Long, Int, Double, String) =
    (Random.nextLong(), Random.nextInt(), Random.nextDouble(), Random.nextString(20))
  val df =
    spark.createDF(
      List.fill(1000000)(genRow()),
      List(("LongType", LongType, true),
           ("IntType", IntegerType, true),
           ("DoubleType", DoubleType, true),
           ("StringType", StringType, true))
    )

  val start = System.nanoTime()
  df.write
    .format("memsql")
    .mode(SaveMode.Append)
    .save("testdb.batchinsert")

  val diff = System.nanoTime() - start
  println("Elapsed time: " + diff + "ns [CSV serialization] ")

  executeQuery("truncate testdb.batchinsert")

  val avroStart = System.nanoTime()
  df.write
    .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT)
    .mode(SaveMode.Append)
    .option(MemsqlOptions.LOAD_DATA_FORMAT, "Avro")
    .save("testdb.batchinsert")
  val avroDiff = System.nanoTime() - avroStart
  println("Elapsed time: " + avroDiff + "ns [Avro serialization] ")
} 
Example 56
Source File: ReferenceTableTest.scala    From memsql-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark

import com.github.mrpowers.spark.daria.sql.SparkSessionExt._
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.{DataFrame, SaveMode}

import scala.util.Try

class ReferenceTableTest extends IntegrationSuiteBase {

  val childAggregatorHost = "localhost"
  val childAggregatorPort = "5508"

  val dbName                  = "testdb"
  val commonCollectionName    = "test_table"
  val referenceCollectionName = "reference_table"

  override def beforeEach(): Unit = {
    super.beforeEach()

    // Set child aggregator as a dmlEndpoint
    spark.conf
      .set("spark.datasource.memsql.dmlEndpoints", s"${childAggregatorHost}:${childAggregatorPort}")
  }

  def writeToTable(tableName: String): Unit = {
    val df = spark.createDF(
      List(4, 5, 6),
      List(("id", IntegerType, true))
    )
    df.write
      .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT)
      .mode(SaveMode.Append)
      .save(s"${dbName}.${tableName}")
  }

  def readFromTable(tableName: String): DataFrame = {
    spark.read
      .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT)
      .load(s"${dbName}.${tableName}")
  }

  def writeAndReadFromTable(tableName: String): Unit = {
    writeToTable(tableName)
    val dataFrame = readFromTable(tableName)
    val sqlRows   = dataFrame.collect();
    assert(sqlRows.length == 3)
  }

  def dropTable(tableName: String): Unit = executeQuery(s"drop table if exists $dbName.$tableName")

  describe("Success during write operations") {

    it("to common table") {
      dropTable(commonCollectionName)
      executeQuery(
        s"create table if not exists $dbName.$commonCollectionName (id INT NOT NULL, PRIMARY KEY (id))")
      writeAndReadFromTable(commonCollectionName)
    }

    it("to reference table") {
      dropTable(referenceCollectionName)
      executeQuery(
        s"create reference table if not exists $dbName.$referenceCollectionName (id INT NOT NULL, PRIMARY KEY (id))")
      writeAndReadFromTable(referenceCollectionName)
    }
  }

  describe("Success during creating") {

    it("common table") {
      dropTable(commonCollectionName)
      writeAndReadFromTable(commonCollectionName)
    }
  }

  describe("Failure because of") {

    it("database name not specified") {
      spark.conf.set("spark.datasource.memsql.database", "")
      val df = spark.createDF(
        List(4, 5, 6),
        List(("id", IntegerType, true))
      )
      val result = Try {
        df.write
          .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT)
          .mode(SaveMode.Append)
          .save(s"${commonCollectionName}")
      }
      
      assert(SQLHelper.isSQLExceptionWithCode(result.failed.get, List(1046)))
    }
  }
} 
Example 57
Source File: BatchInsertBenchmark.scala    From memsql-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark

import java.sql.{Connection, Date, DriverManager}
import java.time.LocalDate
import java.util.Properties

import org.apache.spark.sql.types._
import com.github.mrpowers.spark.daria.sql.SparkSessionExt._
import org.apache.spark.sql.{SaveMode, SparkSession}

import scala.util.Random

// BatchInsertBenchmark is written to profile batch inserts with a CPU profiler.
// This feature is available in the Ultimate edition of IntelliJ IDEA;
// see https://www.jetbrains.com/help/idea/async-profiler.html#profile for more details
object BatchInsertBenchmark extends App {
  final val masterHost: String = sys.props.getOrElse("memsql.host", "localhost")
  final val masterPort: String = sys.props.getOrElse("memsql.port", "5506")

  val spark: SparkSession = SparkSession
    .builder()
    .master("local")
    .config("spark.sql.shuffle.partitions", "1")
    .config("spark.driver.bindAddress", "localhost")
    .config("spark.datasource.memsql.ddlEndpoint", s"${masterHost}:${masterPort}")
    .config("spark.datasource.memsql.database", "testdb")
    .getOrCreate()

  def jdbcConnection: Loan[Connection] = {
    val connProperties = new Properties()
    connProperties.put("user", "root")

    Loan(
      DriverManager.getConnection(
        s"jdbc:mysql://$masterHost:$masterPort",
        connProperties
      ))
  }

  def executeQuery(sql: String): Unit = {
    jdbcConnection.to(conn => Loan(conn.createStatement).to(_.execute(sql)))
  }

  executeQuery("set global default_partitions_per_leaf = 2")
  executeQuery("drop database if exists testdb")
  executeQuery("create database testdb")

  def genDate() =
    Date.valueOf(LocalDate.ofEpochDay(LocalDate.of(2001, 4, 11).toEpochDay + Random.nextInt(10000)))
  def genRow(): (Long, Int, Double, String, Date) =
    (Random.nextLong(), Random.nextInt(), Random.nextDouble(), Random.nextString(20), genDate())
  val df =
    spark.createDF(
      List.fill(1000000)(genRow()),
      List(("LongType", LongType, true),
           ("IntType", IntegerType, true),
           ("DoubleType", DoubleType, true),
           ("StringType", StringType, true),
           ("DateType", DateType, true))
    )

  val start = System.nanoTime()
  df.write
    .format("memsql")
    .option("tableKey.primary", "IntType")
    .option("onDuplicateKeySQL", "IntType = IntType")
    .mode(SaveMode.Append)
    .save("testdb.batchinsert")

  val diff = System.nanoTime() - start
  println("Elapsed time: " + diff + "ns")
} 
Example 58
Source File: IssuesTest.scala    From memsql-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.memsql.spark

import com.github.mrpowers.spark.daria.sql.SparkSessionExt._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

class IssuesTest extends IntegrationSuiteBase {
  it("https://github.com/memsql/memsql-spark-connector/issues/41") {
    executeQuery("""
        | create table if not exists testdb.issue41 (
        |   start_video_pos smallint(5) unsigned DEFAULT NULL
        | )
        |""".stripMargin)

    val df = spark.createDF(
      List(1.toShort, 2.toShort, 3.toShort, 4.toShort),
      List(("start_video_pos", ShortType, true))
    )
    df.write.format("memsql").mode(SaveMode.Append).save("issue41")

    val df2 = spark.read.format("memsql").load("issue41")
    assertSmallDataFrameEquality(df2,
                                 spark.createDF(
                                   List(1, 2, 3, 4),
                                   List(("start_video_pos", IntegerType, true))
                                 ))
  }

  it("https://memsql.zendesk.com/agent/tickets/10451") {
    // parallel read should support columnar scan with filter
    executeQuery("""
      | create table if not exists testdb.ticket10451 (
      |   t text,
      |   h bigint(20) DEFAULT NULL,
      |   KEY h (h) USING CLUSTERED COLUMNSTORE
      | )
      | """.stripMargin)

    val df = spark.createDF(
      List(("hi", 2L), ("hi", 3L), ("foo", 4L)),
      List(("t", StringType, true), ("h", LongType, true))
    )
    df.write.format("memsql").mode(SaveMode.Append).save("ticket10451")

    val df2 = spark.read
      .format("memsql")
      .load("ticket10451")
      .where(col("t") === "hi")
      .where(col("h") === 3L)

    assert(df2.rdd.getNumPartitions > 1)
    assertSmallDataFrameEquality(df2,
                                 spark.createDF(
                                   List(("hi", 3L)),
                                   List(("t", StringType, true), ("h", LongType, true))
                                 ))
  }

  it("supports reading count from query") {
    val df = spark.createDF(
      List((1, "Albert"), (5, "Ronny"), (7, "Ben"), (9, "David")),
      List(("id", IntegerType, true), ("name", StringType, true))
    )
    writeTable("testdb.testcount", df)
    val data = spark.read
      .format("memsql")
      .option("query", "select count(1) from testcount where id > 1 ")
      .option("database", "testdb")
      .load()
      .collect()
    val count = data.head.getLong(0)
    assert(count == 3)
  }

  it("handles exceptions raised by asCode") {
    // in certain cases asCode will raise NullPointerException due to this bug
    // https://issues.apache.org/jira/browse/SPARK-31403
    writeTable("testdb.nulltest",
               spark.createDF(
                 List(1, null),
                 List(("i", IntegerType, true))
               ))
    spark.sql(s"create table nulltest using memsql options ('dbtable'='testdb.nulltest')")

    val df2 = spark.sql("select if(isnull(i), null, 2) as x from nulltest order by i")

    assertSmallDataFrameEquality(df2,
                                 spark.createDF(
                                   List(null, 2),
                                   List(("x", IntegerType, true))
                                 ))
  }
} 
Example 59
Source File: LoadMode.scala    From m3d-engine   with Apache License 2.0 5 votes vote down vote up
package com.adidas.analytics.util

import org.apache.spark.sql.SaveMode

trait LoadMode {
  def sparkMode: SaveMode
}

object LoadMode {

  case object OverwriteTable extends LoadMode {
    override def sparkMode: SaveMode = SaveMode.Overwrite
  }

  case object OverwritePartitions extends LoadMode {
    override val sparkMode: SaveMode = SaveMode.Overwrite
  }

  case object OverwritePartitionsWithAddedColumns extends LoadMode {
    override val sparkMode: SaveMode = SaveMode.Overwrite
  }

  case object AppendJoinPartitions extends LoadMode {
    override def sparkMode: SaveMode = SaveMode.Append
  }

  case object AppendUnionPartitions extends LoadMode {
    override def sparkMode: SaveMode = SaveMode.Append
  }
} 
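A minimal usage sketch (not part of the m3d-engine source; the DataFrame and target path are hypothetical) showing how a LoadMode resolves to the Spark SaveMode used by a DataFrameWriter:

import org.apache.spark.sql.DataFrame

import com.adidas.analytics.util.LoadMode

// Hypothetical helper: pick the SaveMode from the given LoadMode and write Parquet.
def writeWithLoadMode(df: DataFrame, targetPath: String, loadMode: LoadMode): Unit = {
  df.write
    .mode(loadMode.sparkMode) // e.g. SaveMode.Overwrite for LoadMode.OverwriteTable
    .parquet(targetPath)
}

// writeWithLoadMode(someDf, "/tmp/target", LoadMode.OverwriteTable)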
Example 60
Source File: DefaultSource.scala    From spark-dynamodb   with Apache License 2.0 5 votes vote down vote up
package com.audienceproject.spark.dynamodb.datasource

import java.util.Optional

import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.sources.v2.reader.DataSourceReader
import org.apache.spark.sql.sources.v2.writer.DataSourceWriter
import org.apache.spark.sql.sources.v2.{DataSourceOptions, ReadSupport, WriteSupport}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.slf4j.LoggerFactory

import scala.collection.JavaConverters._

class DefaultSource extends ReadSupport with WriteSupport with DataSourceRegister {

    private val logger = LoggerFactory.getLogger(this.getClass)

    override def createReader(schema: StructType, options: DataSourceOptions): DataSourceReader = {
        val optionsMap = options.asMap().asScala
        val defaultParallelism = optionsMap.get("defaultparallelism").map(_.toInt).getOrElse(getDefaultParallelism)
        new DynamoDataSourceReader(defaultParallelism, Map(optionsMap.toSeq: _*), Some(schema))
    }

    override def createReader(options: DataSourceOptions): DataSourceReader = {
        val optionsMap = options.asMap().asScala
        val defaultParallelism = optionsMap.get("defaultparallelism").map(_.toInt).getOrElse(getDefaultParallelism)
        new DynamoDataSourceReader(defaultParallelism, Map(optionsMap.toSeq: _*))
    }

    override def createWriter(writeUUID: String, schema: StructType, mode: SaveMode, options: DataSourceOptions): Optional[DataSourceWriter] = {
        if (mode == SaveMode.Append || mode == SaveMode.Overwrite)
            throw new IllegalArgumentException(s"DynamoDB data source does not support save modes ($mode)." +
                " Please use option 'update' (true | false) to differentiate between append/overwrite and append/update behavior.")
        val optionsMap = options.asMap().asScala
        val defaultParallelism = optionsMap.get("defaultparallelism").map(_.toInt).getOrElse(getDefaultParallelism)
        val writer = new DynamoDataSourceWriter(defaultParallelism, Map(optionsMap.toSeq: _*), schema)
        Optional.of(writer)
    }

    override def shortName(): String = "dynamodb"

    private def getDefaultParallelism: Int =
        SparkSession.getActiveSession match {
            case Some(spark) => spark.sparkContext.defaultParallelism
            case None =>
                logger.warn("Unable to read defaultParallelism from SparkSession." +
                    " Parallelism will be 1 unless overwritten with option `defaultParallelism`")
                1
        }

} 
Example 61
Source File: BigQuerySinkLog.scala    From spark-bigquery   with Apache License 2.0 5 votes vote down vote up
package com.samelamin.spark.bigquery.streaming

import org.apache.hadoop.fs.Path
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.functions._


class BigQuerySinkLog(sparkSession: SparkSession, path: String) {

  def getLatest(): Option[Long] = {
    try {
      import sparkSession.implicits._
      val df = sparkSession.read.json(path).as[Long]
      df.show()
      val latest: Long = df
        .sort(desc("inserted_batches"))
        .first()
      return Some(latest)
    } catch {
      case e: Exception => None
    }
  }

  def writeBatch(batchId: Long):Unit = {
    import sparkSession.implicits._
    val df = Seq(batchId).toDF("inserted_batches").as[Long]
    df.write.mode(SaveMode.Overwrite).json(path)
  }
} 
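A short, hypothetical usage sketch of the sink log above, assuming an active SparkSession and a path Spark can write JSON to:

import org.apache.spark.sql.SparkSession

import com.samelamin.spark.bigquery.streaming.BigQuerySinkLog

val spark = SparkSession.builder().appName("sink-log-demo").master("local[*]").getOrCreate()

// The log path is a placeholder; any filesystem Spark can read and write will do.
val sinkLog = new BigQuerySinkLog(spark, "/tmp/bigquery-sink-log")

sinkLog.writeBatch(42L)          // overwrites the log with batch id 42
val latest = sinkLog.getLatest() // Some(42) if the log can be read back, otherwise None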
Example 62
Source File: MQTTStreamSink.scala    From bahir   with Apache License 2.0 5 votes vote down vote up
package org.apache.bahir.sql.streaming.mqtt

import scala.collection.JavaConverters._
import scala.collection.mutable

import org.eclipse.paho.client.mqttv3.MqttException

import org.apache.spark.SparkEnv
import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister}
import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport}
import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage}
import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType

import org.apache.bahir.utils.Logging
import org.apache.bahir.utils.Retry


class MQTTStreamWriter (schema: StructType, parameters: DataSourceOptions)
    extends StreamWriter with Logging {
  override def createWriterFactory(): DataWriterFactory[InternalRow] = {
    // Skipping the client identifier, as a single batch can be distributed to multiple
    // Spark worker processes. The MQTT server does not support two connections
    // declaring the same client ID at a given point in time.
    val params = parameters.asMap().asScala.filterNot(
      _._1.equalsIgnoreCase("clientId")
    )
    MQTTDataWriterFactory(params)
  }

  override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {}

  override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {}
}

case class MQTTDataWriterFactory(config: mutable.Map[String, String])
    extends DataWriterFactory[InternalRow] {
  override def createDataWriter(
    partitionId: Int, taskId: Long, epochId: Long
  ): DataWriter[InternalRow] = new MQTTDataWriter(config)
}

case object MQTTWriterCommitMessage extends WriterCommitMessage

class MQTTDataWriter(config: mutable.Map[String, String]) extends DataWriter[InternalRow] {
  private lazy val publishAttempts: Int =
    SparkEnv.get.conf.getInt("spark.mqtt.client.publish.attempts", -1)
  private lazy val publishBackoff: Long =
    SparkEnv.get.conf.getTimeAsMs("spark.mqtt.client.publish.backoff", "5s")

  private lazy val (_, _, topic, _, _, qos, _, _, _) = MQTTUtils.parseConfigParams(config.toMap)

  override def write(record: InternalRow): Unit = {
    val client = CachedMQTTClient.getOrCreate(config.toMap)
    val message = record.getBinary(0)
    Retry(publishAttempts, publishBackoff, classOf[MqttException]) {
      // In case of errors, retry sending the message.
      client.publish(topic, message, qos, false)
    }
  }

  override def commit(): WriterCommitMessage = MQTTWriterCommitMessage

  override def abort(): Unit = {}
}

case class MQTTRelation(override val sqlContext: SQLContext, data: DataFrame)
    extends BaseRelation {
  override def schema: StructType = data.schema
}

class MQTTStreamSinkProvider extends DataSourceV2 with StreamWriteSupport
    with DataSourceRegister with CreatableRelationProvider {
  override def createStreamWriter(queryId: String, schema: StructType,
      mode: OutputMode, options: DataSourceOptions): StreamWriter = {
    new MQTTStreamWriter(schema, options)
  }

  override def createRelation(sqlContext: SQLContext, mode: SaveMode,
      parameters: Map[String, String], data: DataFrame): BaseRelation = {
    MQTTRelation(sqlContext, data)
  }

  override def shortName(): String = "mqtt"
} 
Example 63
Source File: JdbcLoadJob.scala    From comet-data-pipeline   with Apache License 2.0 5 votes vote down vote up
package com.ebiznext.comet.job.index.jdbcload

import com.ebiznext.comet.config.Settings
import com.ebiznext.comet.utils.{SparkJob, SparkJobResult, Utils}
import com.google.cloud.bigquery.JobInfo.WriteDisposition
import org.apache.spark.sql.SaveMode

import scala.util.Try

class JdbcLoadJob(
  cliConfig: JdbcLoadConfig
)(implicit val settings: Settings)
    extends SparkJob {

  override def name: String = s"jdbcload-JDBC-${cliConfig.outputTable}"

  val conf = session.sparkContext.hadoopConfiguration
  logger.info(s"JDBC Config $cliConfig")
  val driver = cliConfig.driver
  val url = cliConfig.url
  val user = cliConfig.user
  val password = cliConfig.password
  Class.forName(driver)

  def runJDBC(): Try[SparkJobResult] = {
    val inputPath = cliConfig.sourceFile
    logger.info(s"Input path $inputPath")
    Try {
      val sourceDF =
        inputPath match {
          case Left(path) => session.read.parquet(path)
          case Right(df)  => df
        }
      sourceDF.write
        .format("jdbc")
        .option("numPartitions", cliConfig.partitions)
        .option("batchsize", cliConfig.batchSize)
        .option("truncate", cliConfig.writeDisposition == WriteDisposition.WRITE_TRUNCATE)
        .option("driver", driver)
        .option("url", url)
        .option("dbtable", cliConfig.outputTable)
        .option("user", user)
        .option("password", password)
        .mode(SaveMode.Append)
        .save()
      SparkJobResult(session)
    }
  }

  
  override def run(): Try[SparkJobResult] = {
    val res = runJDBC()
    Utils.logFailure(res, logger)
  }
} 
Example 64
Source File: WriteMode.scala    From comet-data-pipeline   with Apache License 2.0 5 votes vote down vote up
package com.ebiznext.comet.schema.model

import com.ebiznext.comet.schema.model.WriteMode.{APPEND, ERROR_IF_EXISTS, IGNORE, OVERWRITE}
import com.fasterxml.jackson.core.JsonParser
import com.fasterxml.jackson.databind.annotation.{JsonDeserialize, JsonSerialize}
import com.fasterxml.jackson.databind.ser.std.ToStringSerializer
import com.fasterxml.jackson.databind.{DeserializationContext, JsonDeserializer}
import org.apache.spark.sql.SaveMode


@JsonSerialize(using = classOf[ToStringSerializer])
@JsonDeserialize(using = classOf[WriteDeserializer])
sealed case class WriteMode(value: String) {
  override def toString: String = value

  def toSaveMode: SaveMode = {
    this match {
      case OVERWRITE       => SaveMode.Overwrite
      case APPEND          => SaveMode.Append
      case ERROR_IF_EXISTS => SaveMode.ErrorIfExists
      case IGNORE          => SaveMode.Ignore
      case _ =>
        throw new Exception("Should never happen")
    }
  }
}

object WriteMode {

  def fromString(value: String): WriteMode = {
    value.toUpperCase() match {
      case "OVERWRITE"       => WriteMode.OVERWRITE
      case "APPEND"          => WriteMode.APPEND
      case "ERROR_IF_EXISTS" => WriteMode.ERROR_IF_EXISTS
      case "IGNORE"          => WriteMode.IGNORE
      case _ =>
        throw new Exception(s"Invalid Write Mode try one of ${writes}")
    }
  }

  object OVERWRITE extends WriteMode("OVERWRITE")

  object APPEND extends WriteMode("APPEND")

  object ERROR_IF_EXISTS extends WriteMode("ERROR_IF_EXISTS")

  object IGNORE extends WriteMode("IGNORE")

  val writes: Set[WriteMode] = Set(OVERWRITE, APPEND, ERROR_IF_EXISTS, IGNORE)
}

class WriteDeserializer extends JsonDeserializer[WriteMode] {

  override def deserialize(jp: JsonParser, ctx: DeserializationContext): WriteMode = {
    val value = jp.readValueAs[String](classOf[String])
    WriteMode.fromString(value)
  }
} 
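A brief sketch (not from the comet-data-pipeline source; the configuration string is hypothetical) of how WriteMode round-trips from configuration text to a Spark SaveMode:

import org.apache.spark.sql.SaveMode

import com.ebiznext.comet.schema.model.WriteMode

// Hypothetical value, e.g. parsed from a job or schema definition.
val configured: String = "OVERWRITE"

val writeMode: WriteMode = WriteMode.fromString(configured) // WriteMode.OVERWRITE
val saveMode: SaveMode = writeMode.toSaveMode               // SaveMode.Overwrite

// saveMode can then be handed to a DataFrameWriter, e.g. df.write.mode(saveMode).parquet(path)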
Example 65
Source File: LibSVMRelationSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.source.libsvm

import java.io.File
import java.nio.charset.StandardCharsets

import com.google.common.io.Files

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.util.Utils


class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  // Path for dataset
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data")
    val file = new File(dir, "part-00000")
    Files.write(lines, file, StandardCharsets.UTF_8)
    path = dir.toURI.toString
  }

  override def afterAll(): Unit = {
    try {
      Utils.deleteRecursively(new File(path))
    } finally {
      super.afterAll()
    }
  }

  test("select as sparse vector") {
    val df = spark.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = spark.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data and read it again") {
    val df = spark.read.format("libsvm").load(path)
    val tempDir2 = new File(tempDir, "read_write_test")
    val writepath = tempDir2.toURI.toString
    // TODO: Remove requirement to coalesce by supporting multiple reads.
    df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath)

    val df2 = spark.read.format("libsvm").load(writepath)
    val row1 = df2.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data failed due to invalid schema") {
    val df = spark.read.format("text").load(path)
    intercept[SparkException] {
      df.write.format("libsvm").save(path + "_2")
    }
  }

  test("select features from libsvm relation") {
    val df = spark.read.format("libsvm").load(path)
    df.select("features").rdd.map { case Row(d: Vector) => d }.first
    df.select("features").collect
  }
} 
Example 66
Source File: JdbcRelationProvider.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val url = jdbcOptions.url
    val table = jdbcOptions.table
    val createTableOptions = jdbcOptions.createTableOptions
    val isTruncate = jdbcOptions.isTruncate

    val conn = JdbcUtils.createConnectionFactory(jdbcOptions)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, url, table)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (isTruncate && isCascadingTruncateTable(url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, table)
              saveTable(df, url, table, jdbcOptions)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, table)
              createTable(df.schema, url, table, createTableOptions, conn)
              saveTable(df, url, table, jdbcOptions)
            }

          case SaveMode.Append =>
            saveTable(df, url, table, jdbcOptions)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(df.schema, url, table, createTableOptions, conn)
        saveTable(df, url, table, jdbcOptions)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
} 
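For context, a hedged sketch of how the SaveMode branches above are reached from user code; the JDBC URL, table name and DataFrame are placeholders, not part of the Spark source:

import org.apache.spark.sql.{DataFrame, SaveMode}

// Hypothetical write through the "jdbc" source registered by JdbcRelationProvider.
def saveUsers(df: DataFrame): Unit = {
  df.write
    .format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/testdb") // placeholder URL
    .option("dbtable", "users")                               // placeholder table
    .option("truncate", "true")  // with Overwrite: truncate instead of drop/recreate, where supported
    .mode(SaveMode.Overwrite)    // exercises the SaveMode.Overwrite branch above
    .save()
}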
Example 67
Source File: ParquetTest.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.parquet

import java.io.File

import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.{DataFrame, SaveMode}


  protected def withParquetTable[T <: Product: ClassTag: TypeTag]
      (data: Seq[T], tableName: String)
      (f: => Unit): Unit = {
    withParquetDataFrame(data) { df =>
      sqlContext.registerDataFrameAsTable(df, tableName)
      withTempTable(tableName)(f)
    }
  }

  protected def makeParquetFile[T <: Product: ClassTag: TypeTag](
      data: Seq[T], path: File): Unit = {
    data.toDF().write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath)
  }

  protected def makeParquetFile[T <: Product: ClassTag: TypeTag](
      df: DataFrame, path: File): Unit = {
    df.write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath)
  }

  protected def makePartitionDir(
      basePath: File,
      defaultPartitionName: String,
      partitionCols: (String, Any)*): File = {
    val partNames = partitionCols.map { case (k, v) =>
      val valueString = if (v == null || v == "") defaultPartitionName else v.toString
      s"$k=$valueString"
    }

    val partDir = partNames.foldLeft(basePath) { (parent, child) =>
      new File(parent, child)
    }

    assert(partDir.mkdirs(), s"Couldn't create directory $partDir")
    partDir
  }
} 
Example 68
Source File: ModelOutput.scala    From aerosolve   with Apache License 2.0 5 votes vote down vote up
package com.airbnb.common.ml.strategy.data

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SaveMode}

import com.airbnb.common.ml.strategy.config.TrainingOptions
import com.airbnb.common.ml.strategy.eval.BinaryMetrics
import com.airbnb.common.ml.strategy.params.StrategyParams
import com.airbnb.common.ml.util.HiveManageTable



case class ModelOutput[T](
    id: String,
    params: StrategyParams[T],
    loss: Double,
    evalMetrics: BinaryMetrics,
    holdoutMetrics: BinaryMetrics,
    options: TrainingOptions
) extends HiveManageTable {
  override def toRow(partition: String): Row = {
    Row(
      id.toLong,
      holdoutMetrics.posCount,
      holdoutMetrics.negCount,
      holdoutMetrics.posSugHigher,
      holdoutMetrics.posSugLower,
      holdoutMetrics.negSugHigher,
      holdoutMetrics.negSugLower,
      holdoutMetrics.increasePrecision,
      holdoutMetrics.increaseRecall,
      holdoutMetrics.decreasePrecision,
      holdoutMetrics.decreaseRecall,
      holdoutMetrics.trueRegret,
      holdoutMetrics.trueRegretMedian,
      holdoutMetrics.trueRegret75Percentile,
      holdoutMetrics.falseRegret,
      holdoutMetrics.trueIncreaseMagnitude,
      holdoutMetrics.trueDecreaseMagnitude,
      holdoutMetrics.falseDecreaseMagnitude,
      holdoutMetrics.falseIncreaseMagnitude,
      params.params,
      loss,
      options.toPartialArray,
      partition
    )
  }
}

object ModelOutput {
  lazy val schema = StructType(
    Seq(
      StructField("id", LongType),
      StructField("posCount", IntegerType),
      StructField("negCount", IntegerType),
      StructField("posSugHigher", IntegerType),
      StructField("posSugLower", IntegerType),
      StructField("negSugHigher", IntegerType),
      StructField("negSugLower", IntegerType),

      StructField("increasePrecision", DoubleType),
      StructField("increaseRecall", DoubleType),
      StructField("decreasePrecision", DoubleType),
      StructField("decreaseRecall", DoubleType),

      StructField("trueRegret", DoubleType),
      StructField("trueRegretMedian", DoubleType),
      StructField("trueRegret75Percentile", DoubleType),

      StructField("falseRegret", DoubleType),
      StructField("trueIncreaseMagnitude", DoubleType),
      StructField("trueDecreaseMagnitude", DoubleType),
      StructField("falseDecreaseMagnitude", DoubleType),
      StructField("falseIncreaseMagnitude", DoubleType),

      StructField("params", ArrayType(DoubleType)),
      StructField("loss", DoubleType),
      StructField("options", ArrayType(DoubleType)),

      StructField("model", StringType)
    )
  )

  def save[T](
      hiveContext: HiveContext,
      data: RDD[ModelOutput[T]],
      table: String,
      partition: String
  ): Unit = {
    HiveManageTable.saveRDDToHive(
      hiveContext,
      data,
      table,
      ModelOutput.schema,
      SaveMode.Overwrite,
      "model",
      partition)
  }
} 
Example 69
Source File: HiveManageTable.scala    From aerosolve   with Apache License 2.0 5 votes vote down vote up
package com.airbnb.common.ml.util

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.types.StructType

trait HiveManageTable {
  def toRow(partition: String): Row
}

object HiveManageTable {
  def saveRDDToHive[T <: HiveManageTable](hiveContext: HiveContext,
                                          data: RDD[T],
                                          table: String,
                                          schema: StructType,
                                          mode: SaveMode,
                                          partition: String,
                                          partitionValue: String,
                                          hiveConfig: Map[String, String] = dynamicPartitions):Unit = {
    hiveConfig.foreach {
      case (key, value) =>
        hiveContext.setConf(key, value)
    }

    hiveContext.createDataFrame(data.map(_.toRow(partitionValue)), schema)
      .write
      .mode(mode)
      .partitionBy(partition)
      .insertInto(table)
  }

  lazy val dynamicPartitions = Map(
    "hive.exec.dynamic.partition" -> "true",
    "hive.exec.dynamic.partition.mode" -> "nonstrict"
  )
} 
Example 70
Source File: DataFrameToFileWriter.scala    From seahorse   with Apache License 2.0 5 votes vote down vote up
package ai.deepsense.deeplang.doperations.readwritedataframe.filestorage

import org.apache.spark.SparkException
import ai.deepsense.commons.utils.LoggerForCallerClass
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.doperations.exceptions.WriteFileException
import ai.deepsense.deeplang.doperations.inout.OutputFileFormatChoice.Csv
import ai.deepsense.deeplang.doperations.inout.OutputStorageTypeChoice
import ai.deepsense.deeplang.doperations.readwritedataframe.{FilePath, FilePathFromLibraryPath, FileScheme}
import ai.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv.CsvSchemaStringifierBeforeCsvWriting
import ai.deepsense.deeplang.exceptions.DeepLangException
import ai.deepsense.deeplang.{ExecutionContext, FileSystemClient}
import org.apache.spark.sql.SaveMode

object DataFrameToFileWriter {

  val logger = LoggerForCallerClass()

  def writeToFile(
      fileChoice: OutputStorageTypeChoice.File,
      context: ExecutionContext,
      dataFrame: DataFrame): Unit = {
    implicit val ctx = context

    val path = FileSystemClient.replaceLeadingTildeWithHomeDirectory(fileChoice.getOutputFile())
    val filePath = FilePath(path)
    val saveMode = if (fileChoice.getShouldOverwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists

    try {
      val preprocessed = fileChoice.getFileFormat() match {
        case csv: Csv => CsvSchemaStringifierBeforeCsvWriting.preprocess(dataFrame)
        case other => dataFrame
      }
      writeUsingProvidedFileScheme(fileChoice, preprocessed, filePath, saveMode)
    } catch {
      case e: SparkException =>
        logger.error(s"WriteDataFrame error: Spark problem. Unable to write file to $path", e)
        throw WriteFileException(path, e)
    }
  }

  private def writeUsingProvidedFileScheme(
      fileChoice: OutputStorageTypeChoice.File, dataFrame: DataFrame, path: FilePath, saveMode: SaveMode
    )(implicit context: ExecutionContext): Unit = {
    import FileScheme._
    path.fileScheme match {
      case Library =>
        val filePath = FilePathFromLibraryPath(path)
        val FilePath(_, libraryPath) = filePath
        new java.io.File(libraryPath).getParentFile.mkdirs()
        writeUsingProvidedFileScheme(fileChoice, dataFrame, filePath, saveMode)
      case FileScheme.File => DriverFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode)
      case HDFS => ClusterFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode)
      case HTTP | HTTPS | FTP => throw NotSupportedScheme(path.fileScheme)
    }
  }

  case class NotSupportedScheme(fileScheme: FileScheme)
    extends DeepLangException(s"Not supported file scheme ${fileScheme.pathPrefix}")

} 
Example 71
Source File: WriteDataFrame.scala    From seahorse   with Apache License 2.0 5 votes vote down vote up
package ai.deepsense.deeplang.doperations

import java.io.IOException
import java.util.Properties

import scala.reflect.runtime.{universe => ru}
import ai.deepsense.commons.utils.Version
import ai.deepsense.deeplang.DOperation.Id
import ai.deepsense.deeplang._
import ai.deepsense.deeplang.documentation.OperationDocumentation
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.doperations.exceptions.DeepSenseIOException
import ai.deepsense.deeplang.doperations.inout._
import ai.deepsense.deeplang.doperations.readwritedataframe.filestorage.DataFrameToFileWriter
import ai.deepsense.deeplang.doperations.readwritedataframe.googlestorage.DataFrameToGoogleSheetWriter
import ai.deepsense.deeplang.doperations.readwritedataframe.validators.{FilePathHasValidFileScheme, ParquetSupportedOnClusterOnly}
import ai.deepsense.deeplang.inference.{InferContext, InferenceWarnings}
import ai.deepsense.deeplang.params.choice.ChoiceParam
import ai.deepsense.deeplang.params.{Param, Params}
import org.apache.spark.sql.SaveMode

class WriteDataFrame()
  extends DOperation1To0[DataFrame]
  with Params
  with OperationDocumentation {

  override val id: Id = "9e460036-95cc-42c5-ba64-5bc767a40e4e"
  override val name: String = "Write DataFrame"
  override val description: String = "Writes a DataFrame to a file or database"

  override val since: Version = Version(0, 4, 0)

  @transient
  override lazy val tTagTI_0: ru.TypeTag[DataFrame] = ru.typeTag[DataFrame]

  val storageType = ChoiceParam[OutputStorageTypeChoice](
    name = "data storage type",
    description = Some("Storage type."))

  def getStorageType(): OutputStorageTypeChoice = $(storageType)
  def setStorageType(value: OutputStorageTypeChoice): this.type = set(storageType, value)

  val specificParams: Array[Param[_]] = Array(storageType)
  setDefault(storageType, new OutputStorageTypeChoice.File())

  override def execute(dataFrame: DataFrame)(context: ExecutionContext): Unit = {
    import OutputStorageTypeChoice._
    try {
      getStorageType() match {
        case jdbcChoice: Jdbc => writeToJdbc(jdbcChoice, context, dataFrame)
        case googleSheetChoice: GoogleSheet => DataFrameToGoogleSheetWriter.writeToGoogleSheet(
          googleSheetChoice, context, dataFrame
        )
        case fileChoice: File => DataFrameToFileWriter.writeToFile(fileChoice, context, dataFrame)
      }
    } catch {
      case e: IOException =>
        logger.error(s"WriteDataFrame error. Could not write file to designated storage", e)
        throw DeepSenseIOException(e)
    }
  }

  private def writeToJdbc(
      jdbcChoice: OutputStorageTypeChoice.Jdbc,
      context: ExecutionContext,
      dataFrame: DataFrame): Unit = {
    val properties = new Properties()
    properties.setProperty("driver", jdbcChoice.getJdbcDriverClassName)

    val jdbcUrl = jdbcChoice.getJdbcUrl
    val jdbcTableName = jdbcChoice.getJdbcTableName
    val saveMode = if (jdbcChoice.getShouldOverwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists

    dataFrame.sparkDataFrame.write.mode(saveMode).jdbc(jdbcUrl, jdbcTableName, properties)
  }

  override def inferKnowledge(k0: DKnowledge[DataFrame])(context: InferContext): (Unit, InferenceWarnings) = {
    FilePathHasValidFileScheme.validate(this)
    ParquetSupportedOnClusterOnly.validate(this)
    super.inferKnowledge(k0)(context)
  }
} 
Example 72
Source File: Main.scala    From example-spark-scala-read-and-write-from-hdfs   with Apache License 2.0 5 votes vote down vote up
package io.saagie.example.hdfs

import org.apache.log4j.LogManager
import org.apache.spark.sql.{SaveMode, SparkSession}

object Main{

  case class HelloWorld(message: String)

  def main(args: Array[String]): Unit = {

    val log = LogManager.getRootLogger
    // Creation of Spark Session
    val sparkSession = SparkSession.builder().appName("example-spark-scala-read-and-write-from-hdfs").getOrCreate()
    import sparkSession.implicits._

    val hdfs_master = args(0)
    // ====== Creating a dataframe with 1 partition
    val df = Seq(HelloWorld("helloworld")).toDF().coalesce(1)

    // ======= Writing files
    // Writing file as parquet
    df.write.mode(SaveMode.Overwrite).parquet(hdfs_master + "user/hdfs/wiki/testwiki")
    //  Writing file as csv
    df.write.mode(SaveMode.Overwrite).csv(hdfs_master + "user/hdfs/wiki/testwiki.csv")

    // ======= Reading files
    // Reading parquet files
    val df_parquet = sparkSession.read.parquet(hdfs_master + "user/hdfs/wiki/testwiki")
    log.info(df_parquet.show())
    //  Reading csv files
    val df_csv = sparkSession.read.option("inferSchema", "true").csv(hdfs_master + "user/hdfs/wiki/testwiki.csv")
    log.info(df_csv.show())
  }
} 
Example 73
Source File: JDBCRelation.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.jdbc

import java.util.Properties

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Partition
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}


  def columnPartition(partitioning: JDBCPartitioningInfo): Array[Partition] = {
    if (partitioning == null) return Array[Partition](JDBCPartition(null, 0))

    val numPartitions = partitioning.numPartitions
    val column = partitioning.column
    if (numPartitions == 1) return Array[Partition](JDBCPartition(null, 0))
    // Overflow and silliness can happen if you subtract then divide.
    // Here we get a little roundoff, but that's (hopefully) OK.
    val stride: Long = (partitioning.upperBound / numPartitions
                      - partitioning.lowerBound / numPartitions)
    var i: Int = 0
    var currentValue: Long = partitioning.lowerBound
    var ans = new ArrayBuffer[Partition]()
    while (i < numPartitions) {
      val lowerBound = if (i != 0) s"$column >= $currentValue" else null
      currentValue += stride
      val upperBound = if (i != numPartitions - 1) s"$column < $currentValue" else null
      val whereClause =
        if (upperBound == null) {
          lowerBound
        } else if (lowerBound == null) {
          upperBound
        } else {
          s"$lowerBound AND $upperBound"
        }
      ans += JDBCPartition(whereClause, i)
      i = i + 1
    }
    ans.toArray
  }
}

private[sql] case class JDBCRelation(
    url: String,
    table: String,
    parts: Array[Partition],
    properties: Properties = new Properties())(@transient val sqlContext: SQLContext)
  extends BaseRelation
  with PrunedFilteredScan
  with InsertableRelation {

  override val needConversion: Boolean = false

  override val schema: StructType = JDBCRDD.resolveTable(url, table, properties)

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    val driver: String = DriverRegistry.getDriverClassName(url)
    // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row]
    JDBCRDD.scanTable(
      sqlContext.sparkContext,
      schema,
      driver,
      url,
      properties,
      table,
      requiredColumns,
      filters,
      parts).asInstanceOf[RDD[Row]]
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    data.write
      .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append)
      .jdbc(url, table, properties)
  }
} 
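columnPartition above needs nothing but the bounds, a partition count and a column name to derive the per-partition WHERE clauses. The following standalone sketch replays the same stride arithmetic for lowerBound = 0, upperBound = 100 and 4 partitions, printing the clauses instead of wrapping them in JDBCPartition objects:

object ColumnPartitionSketch extends App {
  val (lowerBound, upperBound, numPartitions, column) = (0L, 100L, 4, "id")
  // Divide before subtracting, exactly as above, to avoid overflow on wide bounds.
  val stride = upperBound / numPartitions - lowerBound / numPartitions // 25
  var currentValue = lowerBound
  (0 until numPartitions).foreach { i =>
    val lower = if (i != 0) s"$column >= $currentValue" else null
    currentValue += stride
    val upper = if (i != numPartitions - 1) s"$column < $currentValue" else null
    val whereClause = Seq(Option(lower), Option(upper)).flatten.mkString(" AND ")
    println(s"partition $i: $whereClause")
  }
  // prints: "id < 25", "id >= 25 AND id < 50", "id >= 50 AND id < 75", "id >= 75"
}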
Example 74
Source File: ParquetTest.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.parquet

import java.io.File

import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext}


private[sql] trait ParquetTest extends SQLTestUtils {
  // NOTE: the trait declaration above is reconstructed; the extractor also stripped the
  // `withParquetFile` / `withParquetDataFrame` helpers that `withParquetTable` builds on.
  protected def withParquetTable[T <: Product: ClassTag: TypeTag]
      (data: Seq[T], tableName: String)
      (f: => Unit): Unit = {
    withParquetDataFrame(data) { df =>
      // Register the dataset (backed by the Seq) under the given temporary table name
      _sqlContext.registerDataFrameAsTable(df, tableName)
      withTempTable(tableName)(f)
    }
  }
  // Produce a Parquet file from a Seq of case-class rows
  protected def makeParquetFile[T <: Product: ClassTag: TypeTag](
      data: Seq[T], path: File): Unit = {
    // Overwrite the output if the location already exists
    _sqlContext.createDataFrame(data).write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath)
  }
  // Produce a Parquet file from an existing DataFrame
  protected def makeParquetFile[T <: Product: ClassTag: TypeTag](
      df: DataFrame, path: File): Unit = {
    // Overwrite the output if the location already exists
    df.write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath)
  }
  // Produce a partition directory
  protected def makePartitionDir(
      basePath: File,
      defaultPartitionName: String,
      partitionCols: (String, Any)*): File = {
    val partNames = partitionCols.map { case (k, v) =>
      val valueString = if (v == null || v == "") defaultPartitionName else v.toString
      s"$k=$valueString"
    }

    val partDir = partNames.foldLeft(basePath) { (parent, child) =>
      new File(parent, child)
    }

    assert(partDir.mkdirs(), s"Couldn't create directory $partDir")
    partDir
  }
} 
Example 75
Source File: IOUtilities.scala    From spark-practice   with MIT License 5 votes vote down vote up
package probelms.customerInsights

import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import utilities.SparkUtilities


object IOUtilities {

  def readProductsDF(spark:SparkSession,path:String):DataFrame={
    import spark.implicits._
    spark.read.textFile(path)
      .map( line => line.split(CIConstants.DELIMITER))
      .map(fields => new Product(fields(0).toInt, fields(1), fields(2), fields(3),SparkUtilities.convertCurrencyToDouble(fields(4))))
      .toDF()
  }

  def readSalesDF(spark:SparkSession,path:String):DataFrame= {
    import spark.implicits._
    spark.read.textFile(CIConstants.SALES_PATH)
      .map(line => line.split(CIConstants.DELIMITER))
      .map(fields => new Sales(fields(0).toInt,fields(1).toInt,fields(2).toInt, SparkUtilities.getDate(fields(3)), SparkUtilities.convertCurrencyToDouble(fields(4)),fields(5).toInt))
      .toDF()
  }

  def readRefundDF(spark:SparkSession,path:String)={
    import spark.implicits._
    spark.read.textFile(path)
      .map(line => line.split(CIConstants.DELIMITER))
      .map(fields => new Refund(fields(0).toInt,fields(1).toInt,fields(2).toInt,fields(3).toInt, SparkUtilities.getDate(fields(4)), SparkUtilities.convertCurrencyToDouble(fields(5)),fields(6).toInt))
      .toDF()
  }
  def readCustomerDF(spark:SparkSession,path:String)= {
    import spark.implicits._
    spark.read.textFile(path)
      .map( line => line.split(CIConstants.DELIMITER))
      .map(fields => new Customer(fields(0).toInt, fields(1), fields(2), fields(3).toLong))
      .toDF()
  }


  def writeDF(df:DataFrame,path:String):Unit={
    df.repartition(1)
      .write
      .format("csv")
      .option("header","true")
      .mode(SaveMode.Overwrite)
      .save(path)

  }


} 
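A brief usage sketch for the helpers above; the input and output paths are placeholders, and the text files are assumed to follow the delimiter and column layout expected by the Product case class from the original project.

object CustomerInsightsJob {
  def main(args: Array[String]): Unit = {
    val spark = org.apache.spark.sql.SparkSession.builder()
      .appName("customer-insights").master("local[*]").getOrCreate()

    val products = IOUtilities.readProductsDF(spark, "data/products.txt") // placeholder path

    // writeDF coalesces to a single CSV file with a header and overwrites any previous run.
    IOUtilities.writeDF(products, "output/products_csv")

    spark.stop()
  }
}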
Example 76
Source File: ContextsSpec.scala    From mist   with Apache License 2.0 5 votes vote down vote up
package mist.api

import org.apache.spark.SparkContext
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.scalatest.{FunSpec, Matchers}

import scala.util._

class ContextsSpec extends FunSpec with Matchers with TestSparkContext {

  import mist.api.ArgsInstances._
  import mist.api.encoding.defaults._
  import mist.api.MistFnSyntax._
  import mist.api.data._
  import mist.api.encoding.JsSyntax._

  it("for spark context") {
   val spJob = arg[Seq[Int]]("nums").onSparkContext(
     (nums: Seq[Int], sp: SparkContext) => {
       sp.parallelize(nums).map(_ * 2).collect()
       "2"
    })
    val res = spJob.invoke(testCtx(JsMap("nums" -> (1 to 10).to[Seq].js)))
    res shouldBe Success("2")
  }

  it("for only sc") {
    val spJob = onSparkContext((sc: SparkContext) => {
      5
    })
    val res = spJob.invoke(testCtx(JsMap.empty))
    res shouldBe Success(5)
  }

  def pathToResource(path: String): String = {
    this.getClass.getClassLoader.getResource(path).getPath
  }

  it("session with hive") {
    System.setSecurityManager(null)
    
    val spJob = onSparkSessionWithHive((spark: SparkSession) => {
      val df = spark.read.json(pathToResource("hive_job_data.json"))
      df.createOrReplaceTempView("temp")
      df.cache()
      spark.sql("DROP TABLE IF EXISTS temp_hive")
      spark.table("temp").write.mode(SaveMode.Overwrite).saveAsTable("temp_hive")

      spark.sql("SELECT MAX(age) AS avg_age FROM temp_hive")
        .take(1)(0).getLong(0)
    })
    spJob.invoke(testCtx(JsMap.empty))
    val res = spJob.invoke(testCtx(JsMap.empty))
    res shouldBe Success(30)
  }

  def testCtx(params: JsMap): FnContext = {
    FnContext(spark, params)
  }

} 
Example 77
Source File: CreateHiveTableAsSelectCommand.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.execution

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand



case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    outputColumns: Seq[Attribute],
    mode: SaveMode)
  extends DataWritingCommand {

  private val tableIdentifier = tableDesc.identifier

  override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog
    if (catalog.tableExists(tableIdentifier)) {
      assert(mode != SaveMode.Overwrite,
        s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")

      if (mode == SaveMode.ErrorIfExists) {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
      if (mode == SaveMode.Ignore) {
        // Since the table already exists and the save mode is Ignore, we will just return.
        return Seq.empty
      }

      InsertIntoHiveTable(
        tableDesc,
        Map.empty,
        query,
        overwrite = false,
        ifPartitionNotExists = false,
        outputColumns = outputColumns).run(sparkSession, child)
    } else {
      // TODO ideally, we should get the output data ready first and then
      // add the relation into catalog, just in case of failure occurs while data
      // processing.
      assert(tableDesc.schema.isEmpty)
      catalog.createTable(tableDesc.copy(schema = query.schema), ignoreIfExists = false)

      try {
        // Read back the metadata of the table which was created just now.
        val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier)
        // For CTAS, there is no static partition values to insert.
        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
        InsertIntoHiveTable(
          createdTableMeta,
          partition,
          query,
          overwrite = true,
          ifPartitionNotExists = false,
          outputColumns = outputColumns).run(sparkSession, child)
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}}, " +
    s"TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
} 
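The mode checks above are what callers observe through DataFrameWriter.saveAsTable when the planner routes the write to this command. A minimal sketch of the four modes against one managed table (the table name is made up; whether the Hive-serde or the data-source CTAS command ends up being planned depends on the table format, but the SaveMode semantics are the same):

import org.apache.spark.sql.{SaveMode, SparkSession}

object SaveAsTableModes {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("ctas-modes").master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    val df = spark.range(10).toDF("id")

    df.write.mode(SaveMode.ErrorIfExists).saveAsTable("demo_ctas") // first run: the table gets created
    df.write.mode(SaveMode.Ignore).saveAsTable("demo_ctas")        // table exists: returns without writing
    df.write.mode(SaveMode.Append).saveAsTable("demo_ctas")        // rows are appended to the existing table
    // A second ErrorIfExists write would now fail because demo_ctas already exists.

    spark.stop()
  }
}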
Example 78
Source File: JdbcRelationProvider.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    import JDBCOptions._

    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn.isEmpty) {
      assert(lowerBound.isEmpty && upperBound.isEmpty, "When 'partitionColumn' is not specified, " +
        s"'$JDBC_LOWER_BOUND' and '$JDBC_UPPER_BOUND' are expected to be empty")
      null
    } else {
      assert(lowerBound.nonEmpty && upperBound.nonEmpty && numPartitions.nonEmpty,
        s"When 'partitionColumn' is specified, '$JDBC_LOWER_BOUND', '$JDBC_UPPER_BOUND', and " +
          s"'$JDBC_NUM_PARTITIONS' are also required")
      JDBCPartitioningInfo(
        partitionColumn.get, lowerBound.get, upperBound.get, numPartitions.get)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val options = new JDBCOptions(parameters)
    val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis

    val conn = JdbcUtils.createConnectionFactory(options)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, options)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (options.isTruncate && isCascadingTruncateTable(options.url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, options)
              val tableSchema = JdbcUtils.getSchemaOption(conn, options)
              saveTable(df, tableSchema, isCaseSensitive, options)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, options.table)
              createTable(conn, df, options)
              saveTable(df, Some(df.schema), isCaseSensitive, options)
            }

          case SaveMode.Append =>
            val tableSchema = JdbcUtils.getSchemaOption(conn, options)
            saveTable(df, tableSchema, isCaseSensitive, options)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '${options.table}' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(conn, df, options)
        saveTable(df, Some(df.schema), isCaseSensitive, options)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
} 
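The provider above is reached through the ordinary JDBC writer API. A sketch of the Overwrite branch with and without the truncate option (connection string, table name and credentials are placeholders):

import java.util.Properties
import org.apache.spark.sql.{SaveMode, SparkSession}

object JdbcWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("jdbc-write").master("local[*]").getOrCreate()
    val df = spark.range(100).toDF("id")

    val url = "jdbc:postgresql://localhost:5432/demo" // placeholder connection string
    val props = new Properties()
    props.setProperty("user", "demo")
    props.setProperty("password", "demo")

    // Overwrite without truncate: the table is dropped and recreated from df's schema.
    df.write.mode(SaveMode.Overwrite).jdbc(url, "numbers", props)

    // Overwrite with truncate: rows are replaced but the existing table definition is kept,
    // provided the dialect passes the isCascadingTruncateTable check seen above.
    df.write.mode(SaveMode.Overwrite).option("truncate", "true").jdbc(url, "numbers", props)

    spark.stop()
  }
}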
Example 79
Source File: SaveIntoDataSourceCommand.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.CreatableRelationProvider


case class SaveIntoDataSourceCommand(
    query: LogicalPlan,
    dataSource: CreatableRelationProvider,
    options: Map[String, String],
    mode: SaveMode) extends RunnableCommand {

  override protected def innerChildren: Seq[QueryPlan[_]] = Seq(query)

  override def run(sparkSession: SparkSession): Seq[Row] = {
    dataSource.createRelation(
      sparkSession.sqlContext, mode, options, Dataset.ofRows(sparkSession, query))

    Seq.empty[Row]
  }

  override def simpleString: String = {
    val redacted = SQLConf.get.redactOptions(options)
    s"SaveIntoDataSourceCommand ${dataSource}, ${redacted}, ${mode}"
  }
} 
Example 80
Source File: SaveIntoDataSourceCommandSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources

import org.apache.spark.SparkConf
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.test.SharedSQLContext

class SaveIntoDataSourceCommandSuite extends SharedSQLContext {

  test("simpleString is redacted") {
    val URL = "connection.url"
    val PASS = "123"
    val DRIVER = "mydriver"

    val dataSource = DataSource(
      sparkSession = spark,
      className = "jdbc",
      partitionColumns = Nil,
      options = Map("password" -> PASS, "url" -> URL, "driver" -> DRIVER))

    val logicalPlanString = dataSource
      .planForWriting(SaveMode.ErrorIfExists, spark.range(1).logicalPlan)
      .treeString(true)

    assert(!logicalPlanString.contains(URL))
    assert(!logicalPlanString.contains(PASS))
    assert(logicalPlanString.contains(DRIVER))
  }
} 
Example 81
Source File: DefaultSource.scala    From mimir   with Apache License 2.0 5 votes vote down vote up
package mimir.exec.spark.datasource.google.spreadsheet

import java.io.File

import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider {
  final val DEFAULT_CREDENTIAL_PATH = "/etc/gdata/credential.p12"

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) = {
    createRelation(sqlContext, parameters, null)
  }

  private[spreadsheet] def pathToSheetNames(parameters: Map[String, String]): (String, String) = {
    val path = parameters.getOrElse("path", sys.error("'path' must be specified for spreadsheets."))
    val elems = path.split('/')
    if (elems.length < 2)
      throw new Exception("'path' must be formed like '<spreadsheet>/<worksheet>'")

    (elems(0), elems(1))
  }

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType) = {
    val (spreadsheetName, worksheetName) = pathToSheetNames(parameters)
    val context = createSpreadsheetContext(parameters)
    createRelation(sqlContext, context, spreadsheetName, worksheetName, schema)
  }


  override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = {
    val (spreadsheetName, worksheetName) = pathToSheetNames(parameters)
    implicit val context = createSpreadsheetContext(parameters)
    val spreadsheet = SparkSpreadsheetService.findSpreadsheet(spreadsheetName)
    if(!spreadsheet.isDefined)
      throw new RuntimeException(s"no such a spreadsheet: $spreadsheetName")

    spreadsheet.get.addWorksheet(worksheetName, data.schema, data.collect().toList, Util.toRowData)
    createRelation(sqlContext, context, spreadsheetName, worksheetName, data.schema)
  }

  private[spreadsheet] def createSpreadsheetContext(parameters: Map[String, String]) = {
    val serviceAccountIdOption = parameters.get("serviceAccountId")
    val credentialPath = parameters.getOrElse("credentialPath", DEFAULT_CREDENTIAL_PATH)
    SparkSpreadsheetService(serviceAccountIdOption, new File(credentialPath))
  }

  private[spreadsheet] def createRelation(sqlContext: SQLContext,
                                           context: SparkSpreadsheetService.SparkSpreadsheetContext,
                                           spreadsheetName: String,
                                           worksheetName: String,
                                           schema: StructType): SpreadsheetRelation =
    if (schema == null) {
      createRelation(sqlContext, context, spreadsheetName, worksheetName, None)
    }
    else {
      createRelation(sqlContext, context, spreadsheetName, worksheetName, Some(schema))
    }

  private[spreadsheet] def createRelation(sqlContext: SQLContext,
                                           context: SparkSpreadsheetService.SparkSpreadsheetContext,
                                           spreadsheetName: String,
                                           worksheetName: String,
                                           schema: Option[StructType]): SpreadsheetRelation =
    SpreadsheetRelation(context, spreadsheetName, worksheetName, schema)(sqlContext)
} 
Example 82
Source File: SparkSchemaProvider.scala    From mimir   with Apache License 2.0 5 votes vote down vote up
package mimir.data

import com.typesafe.scalalogging.LazyLogging
import org.apache.spark.sql.{ DataFrame, SaveMode } 
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.analysis.{ UnresolvedRelation, NoSuchDatabaseException }
import org.apache.spark.sql.execution.command.{ DropTableCommand, CreateDatabaseCommand }

import mimir.Database
import mimir.algebra._
import mimir.exec.spark.{MimirSpark, RAToSpark, RowIndexPlan}

class SparkSchemaProvider(db: Database)
  extends LogicalPlanSchemaProvider
  with MaterializedTableProvider
  with LazyLogging
{

  def listTables(): Seq[ID] =
  {
    try {
      MimirSpark.get.sparkSession
                .catalog
                .listTables()
                .collect()
                .map { table => ID(table.name) }
                .toSeq
    } catch {
      case _:NoSuchDatabaseException => {
        logger.warn("Couldn't find database!!!")
        Seq.empty
      }
    }
  }

  // The example extractor collapsed `listTables` and the per-table schema lookup into one
  // fragment; `tableSchema` below is a reconstruction (not the original source) of the
  // lookup that `logicalplan` relies on.
  def tableSchema(table: ID): Option[Seq[(ID, Type)]] =
  {
    try {
      val catalog = MimirSpark.get.sparkSession.catalog
      if (catalog.tableExists(table.id)) {
        Some(
          catalog.listColumns(table.id)
                 .collect()
                 .map { col => (
                     ID(col.name),
                     RAToSpark.getMimirType(
                       RAToSpark.dataTypeFromHiveDataTypeString(col.dataType))
                   ) }
                 .toSeq
        )
      } else {
        logger.trace(s"$table doesn't exist")
        None
      }
    } catch {
      case _:NoSuchDatabaseException => {
        logger.warn("Couldn't find database!!!")
        None
      }
    }
  }

  def logicalplan(table: ID): LogicalPlan =
  {
    RowIndexPlan(
      UnresolvedRelation(TableIdentifier(table.id)), 
      tableSchema(table).get
    ).getPlan(db)
  }

  def createStoredTableAs(data: DataFrame, name: ID)
  {
    data.persist()
        .createOrReplaceTempView(name.id)
    data.write
        .mode(SaveMode.Overwrite)
        .saveAsTable(name.id)
  }

  def dropStoredTable(name: ID)
  {
    DropTableCommand(
      TableIdentifier(name.id, None),//Option(sparkDBName)), 
      true, false, true
    ).run(MimirSpark.get.sparkSession)
  }
} 
Example 83
Source File: LogisticRegressionDemo.scala    From s4ds   with Apache License 2.0 5 votes vote down vote up
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer}
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.SaveMode

case class LabelledDocument(fileName:String, text:String, category:String)

object LogisticRegressionDemo extends App {

  val conf = new SparkConf().setAppName("LrTest")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)
  import sqlContext._
  import sqlContext.implicits._

  val spamText = sc.wholeTextFiles("spam/*")
  val hamText = sc.wholeTextFiles("ham/*")

  val spamDocuments = spamText.map { 
    case (fileName, text) => LabelledDocument(fileName, text, "spam")
  }
  val hamDocuments = hamText.map {
    case (fileName, text) => LabelledDocument(fileName, text, "ham")
  }

  val documentsDF = spamDocuments.union(hamDocuments).toDF
  documentsDF.persist

  val Array(trainDF, testDF) = documentsDF.randomSplit(Array(0.7, 0.3))

  val indexer = new StringIndexer().setInputCol("category").setOutputCol("label")
  val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
  val hasher = new HashingTF().setInputCol("words").setOutputCol("features")
  val lr = new LogisticRegression().setMaxIter(50).setRegParam(0.0)

  val pipeline = new Pipeline().setStages(Array(indexer, tokenizer, hasher, lr))
  val model = pipeline.fit(trainDF)

  val transformedTrain = model.transform(trainDF)
  transformedTrain.persist
  
  val transformedTest = model.transform(testDF)
  transformedTest.persist

  println("in sample misclassified:", transformedTrain.filter($"prediction" !== $"label").count,
    " / ",transformedTrain.count)
  println("out sample misclassified:", transformedTest.filter($"prediction" !== $"label").count,
    " / ",transformedTest.count)

  transformedTrain.select("fileName", "label", "prediction", "probability")
    .write.mode(SaveMode.Overwrite).parquet("transformedTrain.parquet")
  transformedTest.select("fileName", "label", "prediction", "probability")
    .write.mode(SaveMode.Overwrite).parquet("transformedTest.parquet")
} 
Example 84
Source File: JDBCRelation.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.jdbc

import java.util.Properties

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Partition
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}


private[sql] object JDBCRelation {
  def columnPartition(partitioning: JDBCPartitioningInfo): Array[Partition] = {
    if (partitioning == null) return Array[Partition](JDBCPartition(null, 0))

    val numPartitions = partitioning.numPartitions
    val column = partitioning.column
    if (numPartitions == 1) return Array[Partition](JDBCPartition(null, 0))
    // Overflow and silliness can happen if you subtract then divide.
    // Here we get a little roundoff, but that's (hopefully) OK.
    val stride: Long = (partitioning.upperBound / numPartitions
                      - partitioning.lowerBound / numPartitions)
    var i: Int = 0
    var currentValue: Long = partitioning.lowerBound
    var ans = new ArrayBuffer[Partition]()
    while (i < numPartitions) {
      val lowerBound = if (i != 0) s"$column >= $currentValue" else null
      currentValue += stride
      val upperBound = if (i != numPartitions - 1) s"$column < $currentValue" else null
      val whereClause =
        if (upperBound == null) {
          lowerBound
        } else if (lowerBound == null) {
          upperBound
        } else {
          s"$lowerBound AND $upperBound"
        }
      ans += JDBCPartition(whereClause, i)
      i = i + 1
    }
    ans.toArray
  }
}

private[sql] case class JDBCRelation(
    url: String,
    table: String,
    parts: Array[Partition],
    properties: Properties = new Properties())(@transient val sqlContext: SQLContext)
  extends BaseRelation
  with PrunedFilteredScan
  with InsertableRelation {

  override val needConversion: Boolean = false

  override val schema: StructType = JDBCRDD.resolveTable(url, table, properties)

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row]
    JDBCRDD.scanTable(
      sqlContext.sparkContext,
      schema,
      url,
      properties,
      table,
      requiredColumns,
      filters,
      parts).asInstanceOf[RDD[Row]]
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    data.write
      .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append)
      .jdbc(url, table, properties)
  }
} 
Example 85
Source File: DefaultSource.scala    From Spark-MongoDB   with Apache License 2.0 5 votes vote down vote up
package com.stratio.datasource.mongodb

import com.stratio.datasource.mongodb.config.MongodbConfigBuilder
import com.stratio.datasource.mongodb.config.MongodbConfig._
import org.apache.spark.sql.SaveMode._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}


class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider{

  override def createRelation(
                               sqlContext: SQLContext,
                               parameters: Map[String, String]): BaseRelation = {

    new MongodbRelation(MongodbConfigBuilder(parseParameters(parameters)).build())(sqlContext)

  }

  override def createRelation(
                               sqlContext: SQLContext,
                               parameters: Map[String, String],
                               schema: StructType): BaseRelation = {

    new MongodbRelation(MongodbConfigBuilder(parseParameters(parameters)).build(), Some(schema))(sqlContext)

  }

  override def createRelation(
                               sqlContext: SQLContext,
                               mode: SaveMode,
                               parameters: Map[String, String],
                               data: DataFrame): BaseRelation = {

    val mongodbRelation = new MongodbRelation(
      MongodbConfigBuilder(parseParameters(parameters)).build(), Some(data.schema))(sqlContext)

    mode match{
      case Append         => mongodbRelation.insert(data, overwrite = false)
      case Overwrite      => mongodbRelation.insert(data, overwrite = true)
      case ErrorIfExists  => if(mongodbRelation.isEmptyCollection) mongodbRelation.insert(data, overwrite = false)
      else throw new UnsupportedOperationException("Writing in a non-empty collection.")
      case Ignore         => if(mongodbRelation.isEmptyCollection) mongodbRelation.insert(data, overwrite = false)
    }

    mongodbRelation
  }

} 
Example 86
Source File: PileupTestBase.scala    From bdg-sequila   with Apache License 2.0 5 votes vote down vote up
package org.biodatageeks.sequila.tests.pileup

import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext}
import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
import org.apache.spark.sql.types.{IntegerType, ShortType, StringType, StructField, StructType}
import org.scalatest.{BeforeAndAfter, FunSuite}

class PileupTestBase extends FunSuite
  with DataFrameSuiteBase
  with BeforeAndAfter
  with SharedSparkContext{

  val sampleId = "NA12878.multichrom.md"
  val samResPath: String = getClass.getResource("/multichrom/mdbam/samtools.pileup").getPath
  val referencePath: String = getClass.getResource("/reference/Homo_sapiens_assembly18_chr1_chrM.small.fasta").getPath
  val bamPath: String = getClass.getResource(s"/multichrom/mdbam/${sampleId}.bam").getPath
  val cramPath : String = getClass.getResource(s"/multichrom/mdcram/${sampleId}.cram").getPath
  val tableName = "reads_bam"
  val tableNameCRAM = "reads_cram"

  val schema: StructType = StructType(
    List(
      StructField("contig", StringType, nullable = true),
      StructField("position", IntegerType, nullable = true),
      StructField("reference", StringType, nullable = true),
      StructField("coverage", ShortType, nullable = true),
      StructField("pileup", StringType, nullable = true),
      StructField("quality", StringType, nullable = true)
    )
  )
  before {
    System.setProperty("spark.kryo.registrator", "org.biodatageeks.sequila.pileup.serializers.CustomKryoRegistrator")
    spark
      .conf.set("spark.sql.shuffle.partitions",1) //FIXME: In order to get orderBy in Samtools tests working - related to exchange partitions stage
    spark.sql(s"DROP TABLE IF EXISTS $tableName")
    spark.sql(
      s"""
         |CREATE TABLE $tableName
         |USING org.biodatageeks.sequila.datasources.BAM.BAMDataSource
         |OPTIONS(path "$bamPath")
         |
      """.stripMargin)

    spark.sql(s"DROP TABLE IF EXISTS $tableNameCRAM")
    spark.sql(
      s"""
         |CREATE TABLE $tableNameCRAM
         |USING org.biodatageeks.sequila.datasources.BAM.CRAMDataSource
         |OPTIONS(path "$cramPath", refPath "$referencePath" )
         |
      """.stripMargin)

    val mapToString = (map: Map[Byte, Short]) => {
      if (map == null)
        "null"
      else
        map.map({
          case (k, v) => k.toChar -> v}).mkString.replace(" -> ", ":")
    }

    val byteToString = ((byte: Byte) => byte.toString)

    spark.udf.register("mapToString", mapToString)
    spark.udf.register("byteToString", byteToString)
  }

} 
Example 87
Source File: Writer.scala    From bdg-sequila   with Apache License 2.0 5 votes vote down vote up
package org.biodatageeks.sequila.tests.pileup

import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}

object Writer {

  val mapToString = (map: Map[Byte, Short]) => {
    if (map == null)
      "null"
    else
      map.map({
        case (k, v) => k.toChar -> v
      }).toSeq.sortBy(_._1).mkString.replace(" -> ", ":")
  }

  def saveToFile(spark: SparkSession, res: Dataset[Row], path: String) = {
    spark.udf.register("mapToString", mapToString)
    res
      .selectExpr("contig", "pos_start", "pos_end", "ref", "cast(coverage as int)", "mapToString(alts)")
      .coalesce(1)
      .write
      .mode(SaveMode.Overwrite)
      .csv(path)
  }
} 
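A hedged usage sketch for the helper above. The column names come straight from the selectExpr call in saveToFile, so the input Dataset is faked here with a single hand-built row rather than a real pileup result.

import org.apache.spark.sql.SparkSession

object WriterSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("pileup-writer").master("local[*]").getOrCreate()
    import spark.implicits._

    // Minimal stand-in carrying the columns saveToFile expects.
    val res = Seq(
      ("chr1", 10L, 12L, "A", 3.toShort, Map('C'.toByte -> 2.toShort))
    ).toDF("contig", "pos_start", "pos_end", "ref", "coverage", "alts")

    Writer.saveToFile(spark, res, "/tmp/pileup_out") // single CSV file, overwritten on every run
    spark.stop()
  }
}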
Example 88
Source File: AirplaneDatalake.scala    From lighthouse   with Apache License 2.0 5 votes vote down vote up
package be.dataminded.lighthouse.demo

import be.dataminded.lighthouse.datalake._
import be.dataminded.lighthouse.spark.Csv
import better.files._
import org.apache.spark.sql.SaveMode

object AirplaneDatalake extends Datalake {

  environment("test") { refs =>
    refs += DataUID("raw", "airplane") -> new FileSystemDataLink(
      resource("data/airplane"),
      Csv,
      SaveMode.ErrorIfExists,
      options = Map("header" -> "true", "inferSchema" -> "true")
    )

    refs += DataUID("raw.weather", "daily") -> new FileSystemDataLink(
      resource("data/weather/daily"),
      Csv,
      SaveMode.ErrorIfExists,
      options = Map("header" -> "true", "inferSchema" -> "true")
    )

    refs += DataUID("raw.weather", "station") -> new FileSystemDataLink(
      resource("data/weather/station"),
      Csv,
      SaveMode.ErrorIfExists,
      options = Map("header" -> "true", "inferSchema" -> "true", "delimiter" -> "|")
    )

    refs += DataUID("clean", "airplane") -> new FileSystemDataLink(file"target/clean/airplane".pathAsString)
    refs += DataUID("clean", "weather")  -> new FileSystemDataLink(file"target/clean/weather/daily".pathAsString)
    refs += DataUID("clean", "stations") -> new FileSystemDataLink(file"target/clean/weather/stations".pathAsString)

    refs += DataUID("master", "view") -> new HiveDataLink(
      file"target/master/airplane/view".pathAsString,
      "default",
      "airplane_view"
    )
  }

  private def resource(path: String): String = Resource.getUrl(path).getPath()
} 
Example 89
Source File: FileSystemDataLink.scala    From lighthouse   with Apache License 2.0 5 votes vote down vote up
package be.dataminded.lighthouse.datalake

import be.dataminded.lighthouse.spark.{Orc, SparkFileFormat}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SaveMode}


class FileSystemDataLink(
    val path: LazyConfig[String],
    format: SparkFileFormat = Orc,
    saveMode: SaveMode = SaveMode.Overwrite,
    partitionedBy: List[String] = List.empty,
    options: Map[String, String] = Map.empty,
    schema: Option[StructType] = None
) extends PathBasedDataLink {

  override def doRead(path: String): DataFrame = {
    schema match {
      case Some(s) => spark.read.format(format.toString).options(options).schema(s).load(path)
      case None    => spark.read.format(format.toString).options(options).load(path)
    }
  }

  override def doWrite[T](dataset: Dataset[T], path: String): Unit = {
    dataset.write
      .format(format.toString)
      .partitionBy(partitionedBy: _*)
      .options(options)
      .mode(saveMode)
      .save(path)
  }
} 
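A hedged usage sketch for the link above. Passing a plain String for the path relies on the implicit conversion to LazyConfig seen in Example 88, and the write(dataset)/read() entry points are assumed to exist on the base class, as the JDBC test in Example 91 suggests; they are not shown in this excerpt.

import be.dataminded.lighthouse.datalake.FileSystemDataLink
import be.dataminded.lighthouse.spark.Csv
import org.apache.spark.sql.{DataFrame, SaveMode}

object DataLinkSketch {
  // Placeholder path; SaveMode.Overwrite means every run replaces the previous snapshot.
  val cleanAirplane = new FileSystemDataLink(
    "target/clean/airplane",
    Csv,
    SaveMode.Overwrite,
    options = Map("header" -> "true")
  )

  def run(df: DataFrame): Unit = {
    cleanAirplane.write(df)             // assumed write(dataset) entry point
    val readBack = cleanAirplane.read() // assumed read() counterpart of doRead
    readBack.show()
  }
}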
Example 90
Source File: AvroDataLink.scala    From lighthouse   with Apache License 2.0 5 votes vote down vote up
package be.dataminded.lighthouse.datalake

import org.apache.spark.sql.{DataFrame, Dataset, SaveMode}

class AvroDataLink(
    val path: LazyConfig[String],
    saveMode: SaveMode = SaveMode.Overwrite,
    partitionedBy: List[String] = List.empty,
    options: Map[String, String] = Map.empty
) extends PathBasedDataLink {

  override def doRead(path: String): DataFrame = {
    spark.read
      .format("com.databricks.spark.avro")
      .options(options)
      .load(path)
  }

  override def doWrite[T](dataset: Dataset[T], path: String): Unit = {
    dataset.write
      .format("com.databricks.spark.avro")
      .partitionBy(partitionedBy: _*)
      .options(options)
      .mode(saveMode)
      .save(path)
  }
} 
Example 91
Source File: JdbcDataLinkTest.scala    From lighthouse   with Apache License 2.0 5 votes vote down vote up
package be.dataminded.lighthouse.datalake

import be.dataminded.lighthouse.common.Database
import be.dataminded.lighthouse.testing.SparkFunSuite
import org.apache.spark.sql.SaveMode
import org.scalatest.BeforeAndAfterAll
import org.scalatest.matchers.should.Matchers

case class test_table(ID: java.lang.Integer, STR: String)

class JdbcDataLinkTest extends SparkFunSuite with Matchers with BeforeAndAfterAll {
  import spark.implicits._

  private val extraOptions = Map("MODE" -> "MYSQL")

  override protected def beforeAll(): Unit = {
    Database.inMemory("test", Map("MODE" -> "MYSQL", "user" -> "sa", "DB_CLOSE_DELAY" -> "-1")).withConnection { con =>
      // Create user and password
      con.createStatement().execute("CREATE USER IF NOT EXISTS TEST PASSWORD 'testpw'")
      con.createStatement().execute("ALTER USER TEST ADMIN TRUE")
      // Create table
      con.createStatement().execute("CREATE TABLE IF NOT EXISTS TEST_TABLE(ID INTEGER PRIMARY KEY, STR VARCHAR(50))")
      // Add initial data to table

      val statement = con.createStatement()
      (0 until 100).foreach(x => statement.addBatch(s"INSERT INTO TEST_TABLE(ID, STR) VALUES($x, '$x')"))
      statement.executeBatch()
    }
  }

  test("Reading JDBC datalink") {
    val jdbcDataLink = new JdbcDataLink(
      url = "jdbc:h2:mem:test",
      username = "TEST",
      password = "testpw",
      driver = "org.h2.Driver",
      extraProperties = extraOptions,
      table = "TEST_TABLE",
      partitionColumn = "ID"
    )
    val data = jdbcDataLink.readAs[test_table]()
    data.collect() should contain theSameElementsAs (0 until 100).map(x => test_table(x, s"$x"))
  }

  test("Append JDBC datalink") {
    val jdbcDataLink = new JdbcDataLink(
      url = "jdbc:h2:mem:test",
      username = "TEST",
      password = "testpw",
      driver = "org.h2.Driver",
      extraProperties = extraOptions,
      table = "TEST_TABLE",
      partitionColumn = "ID",
      saveMode = SaveMode.Append
    )
    val saveData = (100 until 200).map(x => test_table(x, s"$x")).toDS()
    jdbcDataLink.write(saveData)

    val data = jdbcDataLink.readAs[test_table]()
    data.collect() should contain theSameElementsAs (0 until 100).map(x => test_table(x, s"$x")) ++
      (100 until 200).map(x => test_table(x, s"$x"))
  }

  test("Overwrite JDBC datalink") {
    import spark.implicits._

    val jdbcDataLink = new JdbcDataLink(
      url = "jdbc:h2:mem:test",
      username = "TEST",
      password = "testpw",
      driver = "org.h2.Driver",
      extraProperties = extraOptions,
      table = "TEST_TABLE",
      partitionColumn = "ID",
      saveMode = SaveMode.Overwrite
    )
    val saveData = (100 until 200).map(x => test_table(x, s"$x")).toDS()
    jdbcDataLink.write(saveData)

    val data = jdbcDataLink.readAs[test_table]()
    data.collect() should contain theSameElementsAs (100 until 200).map(x => test_table(x, s"$x"))
  }
} 
Example 92
Source File: S3Writer.scala    From gimel   with Apache License 2.0 5 votes vote down vote up
package com.paypal.gimel.s3.writer

import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

import com.paypal.gimel.logger.Logger
import com.paypal.gimel.s3.conf.{S3ClientConfiguration, S3Configs, S3Constants}

object S3Writer {
  val logger: Logger = Logger()

  
  def write(sparkSession: SparkSession, dataFrame: DataFrame, conf: S3ClientConfiguration): org.apache.spark.sql.DataFrame = {
    def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName()
    logger.info(" @Begin --> " + MethodName)

    System.setProperty(S3Configs.awsServicesEnableV4, "true")

    sparkSession.conf.set(S3Configs.accessId, conf.accessId)
    sparkSession.conf.set(S3Configs.secretKey, conf.secretKey)
    sparkSession.conf.set(S3Configs.s3aClientImpl, conf.s3aImpl)
    sparkSession.conf.set(S3Configs.sslEnabled, conf.sslEnabled)
    sparkSession.conf.set(S3Configs.endPoint, conf.endPoint)
    sparkSession.conf.set(S3Configs.pathStyleAccess, conf.pathStyleAccess)

    val saveMode = conf.saveMode.toLowerCase() match {
      case S3Constants.appendSaveMode =>
        SaveMode.Append
      case S3Constants.overwriteSaveMode =>
        SaveMode.Overwrite
      case S3Constants.ignoreSaveMode =>
        SaveMode.Ignore
      case _ =>
        SaveMode.ErrorIfExists
    }

    conf.objectFormat match {
      case S3Constants.csvFileFormat =>
        dataFrame.write.mode(saveMode)
          .option(S3Constants.delimiter, conf.delimiter)
          .option(S3Constants.inferschema, conf.inferSchema)
          .option(S3Constants.header, conf.header)
          .csv(conf.objectPath)
      case S3Constants.jsonFileformat =>
        dataFrame.write.mode(saveMode).json(conf.objectPath)
      case S3Constants.parquetFileFormat =>
        dataFrame.write.mode(saveMode).parquet((conf.objectPath))
      case _ =>
        dataFrame.write.mode(saveMode).text(conf.objectPath)
    }

    dataFrame
  }
} 
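The saveMode handling above is a plain string-to-enum mapping with ErrorIfExists as the fallback. A standalone sketch of the same pattern, with literal strings in place of the S3Constants values:

import org.apache.spark.sql.SaveMode

object SaveModeFromConfig {
  // Mirrors the match in S3Writer.write: unknown values fall back to the safest mode.
  def parse(mode: String): SaveMode = mode.toLowerCase match {
    case "append"    => SaveMode.Append
    case "overwrite" => SaveMode.Overwrite
    case "ignore"    => SaveMode.Ignore
    case _           => SaveMode.ErrorIfExists
  }

  def main(args: Array[String]): Unit = {
    assert(parse("Overwrite") == SaveMode.Overwrite)
    assert(parse("bogus") == SaveMode.ErrorIfExists)
  }
}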
Example 93
Source File: Main.scala    From spark-gdb   with Apache License 2.0 5 votes vote down vote up
package com.esri.app

import com.esri.core.geometry.Polyline
import com.esri.udt.{PointType, PolylineType}
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.{Logging, SparkConf, SparkContext}



object Main extends App with Logging {
  // NOTE: the original object declaration, argument handling ("path", "name") and
  // SparkContext setup were stripped by the example extractor; the next three lines
  // are a reconstruction so the fragment reads as a whole, not the original code.
  val Array(path, name) = args
  val sc = new SparkContext(new SparkConf().setAppName("spark-gdb-example"))
  try {
    val sqlContext = new SQLContext(sc)
    val df = sqlContext.read.format("com.esri.gdb")
      .option("path", path)
      .option("name", name)
      .option("numPartitions", "1")
      .load()
    df.printSchema()
    df.registerTempTable(name)
    sqlContext.udf.register("getX", (point: PointType) => point.x)
    sqlContext.udf.register("getY", (point: PointType) => point.y)
    sqlContext.udf.register("line", (point: PointType) => PolylineType({
      val polyline = new Polyline()
      polyline.startPath(point.x - 2, point.y - 2)
      polyline.lineTo(point.x + 2, point.y + 2)
      polyline
    }
    ))
    sqlContext.sql(s"select line(Shape),getX(Shape)-2 as x from $name")
      .write
      .mode(SaveMode.Overwrite)
      .format("json")
      .save(s"/tmp/$name.json")
  } finally {
    sc.stop()
  }

} 
Example 94
Source File: SparkBatch.scala    From smart-meter   with MIT License 5 votes vote down vote up
package com.logimethods.nats.connector.spark.app

import java.util.Properties;
import java.io.File
import java.io.Serializable

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import org.apache.log4j.{Level, LogManager, PropertyConfigurator}
import org.apache.log4j.Logger

import org.apache.spark.sql.SparkSession

//import com.datastax.spark.connector._
//import com.datastax.spark.connector.cql.CassandraConnector

// @see http://stackoverflow.com/questions/39423131/how-to-use-cassandra-context-in-spark-2-0
// @see https://databricks.com/blog/2016/08/15/how-to-use-sparksession-in-apache-spark-2-0.html
// @see https://dzone.com/articles/cassandra-with-spark-20-building-rest-api
object SparkBatch extends App {
  val logLevel = System.getenv("APP_BATCH_LOG_LEVEL")
  println("APP_BATCH_LOG_LEVEL = " + logLevel)
  if ("DEBUG" != logLevel) {
  	Logger.getLogger("org").setLevel(Level.OFF)
  }
  
  val cassandraUrl = System.getenv("CASSANDRA_URL")
  println("CASSANDRA_URL = " + cassandraUrl)
  
  val sparkMasterUrl = System.getenv("SPARK_MASTER_URL")
  println("SPARK_MASTER_URL = " + sparkMasterUrl)
  
  val spark = SparkSession
    .builder()
    .master(sparkMasterUrl)
    .appName("Smartmeter Batch")
    .config("spark.cassandra.connection.host", cassandraUrl)
    //   .config("spark.sql.warehouse.dir", warehouseLocation)
    //.enableHiveSupport()
    .getOrCreate()
  
  spark
    .read
    .format("org.apache.spark.sql.cassandra")
    .options(Map("keyspace" -> "smartmeter", "table" -> "raw_data"))
    .load
    .createOrReplaceTempView("raw_data")
  
  val rawVoltageData = spark.sql("select * from raw_data")
  rawVoltageData.show(10)
  
  
  // @see http://stackoverflow.com/questions/40324153/what-is-the-best-way-to-insert-update-rows-in-cassandra-table-via-java-spark
  //Save data to Cassandra
  import org.apache.spark.sql.SaveMode
  // NOTE: `avgByTransformer` is an aggregated DataFrame derived from `raw_data`; its computation
  // was stripped by the example extractor, so this write will not compile on its own.
  avgByTransformer.write.format("org.apache.spark.sql.cassandra").options(Map("keyspace" -> "smartmeter", "table" -> "avg_voltage_by_transformer")).mode(SaveMode.Overwrite).save();
} 
Example 95
Source File: Sink.scala    From modelmatrix   with Apache License 2.0 5 votes vote down vote up
package com.collective.modelmatrix.cli

import org.apache.spark.sql.{SaveMode, DataFrame, SQLContext}

import scala.util.{Failure, Success, Try}

sealed trait Sink {
  def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit
}

object Sink {
  private val hive = "hive://(.*)".r
  private val parquet = "parquet://(.*)".r

  def validate(sink: String): Either[String, Unit] = {
    Try(apply(sink)) match {
      case Success(s) => Right(())
      case Failure(err) => Left(s"Unsupported sink type: $sink")
    }
  }

  def apply(sink: String): Sink = sink match {
    case hive(table) => HiveSink(table)
    case parquet(path) => ParquetSink(path)
  }
}

object NoSink extends Sink {
  def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit = {
    sys.error(s"Sink is not defined")
  }

  override def toString: String = "Sink is not defined"
}

case class HiveSink(
  tableName: String
) extends Sink {

  def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit = {
    df.saveAsTable(tableName, SaveMode.Overwrite)
  }

  override def toString: String =
    s"Hive table: $tableName"
}

case class ParquetSink(
  path: String
) extends Sink {

  def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit = {
    df.saveAsParquetFile(path)
  }

  override def toString: String =
    s"Parquet: $path"
} 
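A short usage sketch for the Sink factory above; the URIs follow the hive:// and parquet:// prefixes it parses, the path and table name are placeholders, and the implicit SQLContext and DataFrame are assumed to come from the surrounding CLI.

import org.apache.spark.sql.{DataFrame, SQLContext}

object SinkSketch {
  def save(df: DataFrame)(implicit sqlContext: SQLContext): Unit = {
    // Validate the argument first, as a CLI parser would.
    Sink.validate("parquet:///tmp/model-matrix") match {
      case Right(_)  => Sink("parquet:///tmp/model-matrix").saveDataFrame(df)
      case Left(err) => sys.error(err)
    }

    // Hive flavour: everything after "hive://" is treated as the table name.
    Sink("hive://model_matrix_features").saveDataFrame(df)
  }
}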
Example 96
Source File: CouchbaseSink.scala    From couchbase-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.couchbase.spark.sql.streaming

import com.couchbase.spark.Logging
import org.apache.spark.sql.{DataFrame, SaveMode}
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.sql.types.StringType
import com.couchbase.spark.sql._
import com.couchbase.spark._
import com.couchbase.client.core.CouchbaseException
import com.couchbase.client.java.document.JsonDocument
import com.couchbase.client.java.document.json.JsonObject
import scala.concurrent.duration._



class CouchbaseSink(options: Map[String, String]) extends Sink with Logging {

  override def addBatch(batchId: Long, data: DataFrame): Unit = {
    val bucketName = options.get("bucket").orNull
    val idFieldName = options.getOrElse("idField", DefaultSource.DEFAULT_DOCUMENT_ID_FIELD)
    val removeIdField = options.getOrElse("removeIdField", "true").toBoolean
    val timeout = options.get("timeout").map(v => Duration(v.toLong, MILLISECONDS))

    val createDocument = options.get("expiry").map(_.toInt)
      .map(expiry => (id: String, content: JsonObject) => JsonDocument.create(id, expiry, content))
      .getOrElse((id: String, content: JsonObject) => JsonDocument.create(id, content))

    data.toJSON
      .queryExecution
      .toRdd
      .map(_.get(0, StringType).asInstanceOf[UTF8String].toString())
      .map { rawJson =>
          val encoded = JsonObject.fromJson(rawJson)
          val id = encoded.get(idFieldName)

          if (id == null) {
              throw new Exception(s"Could not find ID field $idFieldName in $encoded")
          }

          if (removeIdField) {
              encoded.removeKey(idFieldName)
          }

          createDocument(id.toString, encoded)
      }
      .saveToCouchbase(bucketName, StoreMode.UPSERT, timeout)
  }

} 
Example 97
Source File: N1qlSpec.scala    From couchbase-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.couchbase.spark.n1ql

import com.couchbase.client.core.CouchbaseException
import com.couchbase.client.java.error.QueryExecutionException
import com.couchbase.client.java.query.N1qlQuery
import org.apache.spark.{SparkConf, SparkContext, SparkException}
import org.apache.spark.sql.sources.EqualTo
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.scalatest._
import com.couchbase.spark._
import com.couchbase.spark.connection.CouchbaseConnection
import com.couchbase.spark.sql.N1QLRelation
import org.apache.spark.sql.types.{StringType, StructField, StructType}

import scala.util.control.NonFatal

class N1qlSpec extends FunSuite with Matchers with BeforeAndAfterAll {

  private val master = "local[2]"
  private val appName = "cb-int-specs1"

  private var spark: SparkSession = _


  override def beforeAll(): Unit = {
    spark = SparkSession
      .builder()
      .master(master)
      .appName(appName)
      .config("spark.couchbase.username", "Administrator")
      .config("spark.couchbase.password", "password")
      // Open 2 buckets as tests below rely on it
      .config("com.couchbase.bucket.default", "")
      .config("com.couchbase.bucket.travel-sample", "")
      .getOrCreate()
  }

  override def afterAll(): Unit = {
    CouchbaseConnection().stop()
    spark.stop()
  }

  test("Creating N1QLRelation with default bucket, when two buckets exist, should fail") {
    assertThrows[IllegalStateException] {
      spark.read
        .format("com.couchbase.spark.sql.DefaultSource")
        .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline")))
        .option("schemaFilter", "`type` = 'airline'")
        .schema(StructType(StructField("name", StringType) :: Nil))
        .load()
    }
  }

  test("Creating N1QLRelation with non-default bucket, when two buckets exist, should succeed") {
    spark.read
      .format("com.couchbase.spark.sql.DefaultSource")
      .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline")))
      .option("schemaFilter", "`type` = 'airline'")
      .option("bucket", "travel-sample")
      .schema(StructType(StructField("name", StringType) :: Nil))
      .load()
  }

  test("N1QL failures should fail the Observable") {
    try {
      spark.sparkContext
        .couchbaseQuery(N1qlQuery.simple("BAD QUERY"), bucketName = "default")
        .collect()
        .foreach(println)
      fail()
    }
    catch {
      case e: SparkException =>
        assert (e.getCause.isInstanceOf[QueryExecutionException])
        val err = e.getCause.asInstanceOf[QueryExecutionException]
        assert (err.getMessage == "syntax error - at QUERY")
      case NonFatal(e) =>
        println(e)
        fail()
    }
  }
} 
Example 98
Source File: CouchbaseDataFrameSpec.scala    From couchbase-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.couchbase.spark.sql

import com.couchbase.spark.connection.CouchbaseConnection
import org.apache.avro.generic.GenericData.StringType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession}
import org.apache.spark.sql.sources.EqualTo
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest._
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class CouchbaseDataFrameSpec extends FlatSpec with Matchers with BeforeAndAfterAll {

  private val master = "local[2]"
  private val appName = "cb-int-specs1"

  private var spark: SparkSession = null


  override def beforeAll(): Unit = {
    val conf = new SparkConf()
      .setMaster(master)
      .setAppName(appName)
      .set("spark.couchbase.nodes", "127.0.0.1")
      .set("com.couchbase.username", "Administrator")
      .set("com.couchbase.password", "password")
      .set("com.couchbase.bucket.default", "")
      .set("com.couchbase.bucket.travel-sample", "")
    spark = SparkSession.builder().config(conf).getOrCreate()

    loadData()
  }

  override def afterAll(): Unit = {
    CouchbaseConnection().stop()
    spark.stop()
  }

  def loadData(): Unit = {

  }

  "If two buckets are used and the bucket is specified the API" should
    "not fail" in {
    val ssc = spark.sqlContext
    ssc.read.couchbase(EqualTo("type", "airline"), Map("bucket" -> "travel-sample"))
  }

  "The DataFrame API" should "infer the schemas" in {
    val ssc = spark.sqlContext
    import com.couchbase.spark.sql._

    val airline = ssc.read.couchbase(EqualTo("type", "airline"), Map("bucket" -> "travel-sample"))
    val airport = ssc.read.couchbase(EqualTo("type", "airport"), Map("bucket" -> "travel-sample"))
    val route = ssc.read.couchbase(EqualTo("type", "route"), Map("bucket" -> "travel-sample"))
    val landmark = ssc.read.couchbase(EqualTo("type", "landmark"), Map("bucket" -> "travel-sample"))


    airline
      .limit(10)
      .write
      .mode(SaveMode.Overwrite)
      .couchbase(Map("bucket" -> "default"))

    // TODO: validate schemas which are inferred on a field and type basis

  }

  it should "write and ignore" in {
    val ssc = spark.sqlContext
    import com.couchbase.spark.sql._

    // create df, write it twice
    val data = ("Michael", 28, true)
    val df = ssc.createDataFrame(spark.sparkContext.parallelize(Seq(data)))

    df.write
      .mode(SaveMode.Ignore)
      .couchbase(options = Map("idField" -> "_1", "bucket" -> "default"))

    df.write
      .mode(SaveMode.Ignore)
      .couchbase(options = Map("idField" -> "_1", "bucket" -> "default"))
  }

  it should "filter based on a function" in {
    val ssc = spark.sqlContext
    import com.couchbase.spark.sql._

    val airlineBySubstrCountry: DataFrame = ssc.read.couchbase(
      EqualTo("'substr(country, 0, 6)'", "United"), Map("bucket" -> "travel-sample"))

    airlineBySubstrCountry.count() should equal(6797)
  }

} 
Example 99
Source File: DefaultSource.scala    From spark-google-spreadsheets   with Apache License 2.0 5 votes vote down vote up
package com.github.potix2.spark.google.spreadsheets

import java.io.File

import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider {
  final val DEFAULT_CREDENTIAL_PATH = "/etc/gdata/credential.p12"

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) = {
    createRelation(sqlContext, parameters, null)
  }

  private[spreadsheets] def pathToSheetNames(parameters: Map[String, String]): (String, String) = {
    val path = parameters.getOrElse("path", sys.error("'path' must be specified for spreadsheets."))
    val elems = path.split('/')
    if (elems.length < 2)
      throw new Exception("'path' must be formed like '<spreadsheet>/<worksheet>'")

    (elems(0), elems(1))
  }

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType) = {
    val (spreadsheetName, worksheetName) = pathToSheetNames(parameters)
    val context = createSpreadsheetContext(parameters)
    createRelation(sqlContext, context, spreadsheetName, worksheetName, schema)
  }


  override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = {
    val (spreadsheetName, worksheetName) = pathToSheetNames(parameters)
    implicit val context = createSpreadsheetContext(parameters)
    val spreadsheet = SparkSpreadsheetService.findSpreadsheet(spreadsheetName)
    if(!spreadsheet.isDefined)
      throw new RuntimeException(s"no such a spreadsheet: $spreadsheetName")

    spreadsheet.get.addWorksheet(worksheetName, data.schema, data.collect().toList, Util.toRowData)
    createRelation(sqlContext, context, spreadsheetName, worksheetName, data.schema)
  }

  private[spreadsheets] def createSpreadsheetContext(parameters: Map[String, String]) = {
    val serviceAccountIdOption = parameters.get("serviceAccountId")
    val credentialPath = parameters.getOrElse("credentialPath", DEFAULT_CREDENTIAL_PATH)
    SparkSpreadsheetService(serviceAccountIdOption, new File(credentialPath))
  }

  private[spreadsheets] def createRelation(sqlContext: SQLContext,
                                           context: SparkSpreadsheetService.SparkSpreadsheetContext,
                                           spreadsheetName: String,
                                           worksheetName: String,
                                           schema: StructType): SpreadsheetRelation =
    if (schema == null) {
      createRelation(sqlContext, context, spreadsheetName, worksheetName, None)
    }
    else {
      createRelation(sqlContext, context, spreadsheetName, worksheetName, Some(schema))
    }

  private[spreadsheets] def createRelation(sqlContext: SQLContext,
                                           context: SparkSpreadsheetService.SparkSpreadsheetContext,
                                           spreadsheetName: String,
                                           worksheetName: String,
                                           schema: Option[StructType]): SpreadsheetRelation =
    SpreadsheetRelation(context, spreadsheetName, worksheetName, schema)(sqlContext)
} 
Example 100
Source File: GraphIO.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.graph.utils
import org.apache.hadoop.fs.Path
import org.apache.spark.SparkContext
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

object GraphIO {

  private val DELIMITER = "delimiter"
  private val HEADER = "header"

  private val int2Long = udf[Long, Int](_.toLong)
  private val string2Long = udf[Long, String](_.toLong)
  private val int2Float = udf[Float, Int](_.toFloat)
  private val long2Float = udf[Float, Long](_.toFloat)
  private val double2Float = udf[Float, Double](_.toFloat)
  private val string2Float = udf[Float, String](_.toFloat)

  def convert2Float(df: DataFrame, structField: StructField, tmpSuffix: String): DataFrame = {
    val tmpName = structField.name + tmpSuffix
    structField.dataType match {
      case _: LongType =>
        df.withColumn(tmpName, long2Float(df(structField.name)))
          .drop(structField.name)
          .withColumnRenamed(tmpName, structField.name)
      case _: IntegerType =>
        df.withColumn(tmpName, int2Float(df(structField.name)))
          .drop(structField.name)
          .withColumnRenamed(tmpName, structField.name)
      case _: DoubleType =>
        df.withColumn(tmpName, double2Float(df(structField.name)))
          .drop(structField.name)
          .withColumnRenamed(tmpName, structField.name)
      case _: StringType =>
        df.withColumn(tmpName, string2Float(df(structField.name)))
          .drop(structField.name)
          .withColumnRenamed(tmpName, structField.name)
      case _: FloatType => df
      case t => throw new Exception(s"$t can't convert to Float")
    }
  }

  def convert2Long(df: DataFrame, structField: StructField, tmpSuffix: String): DataFrame = {
    val tmpName = structField.name + tmpSuffix
    structField.dataType match {
      case _: LongType => df
      case _: IntegerType =>
        df.withColumn(tmpName, int2Long(df(structField.name)))
          .drop(structField.name)
          .withColumnRenamed(tmpName, structField.name)
      case _: StringType =>
        df.withColumn(tmpName, string2Long(df(structField.name)))
          .drop(structField.name)
          .withColumnRenamed(tmpName, structField.name)
      case t => throw new Exception(s"$t can't convert to Long")
    }
  }

  def load(input: String, isWeighted: Boolean,
           srcIndex: Int = 0, dstIndex: Int = 1, weightIndex: Int = 2,
           sep: String = " "): DataFrame = {
    val ss = SparkSession.builder().getOrCreate()

    val schema = if (isWeighted) {
      StructType(Seq(
        StructField("src", LongType, nullable = false),
        StructField("dst", LongType, nullable = false),
        StructField("weight", FloatType, nullable = false)
      ))
    } else {
      StructType(Seq(
        StructField("src", LongType, nullable = false),
        StructField("dst", LongType, nullable = false)
      ))
    }
    ss.read
      .option("sep", sep)
      .option("header", "false")
      .schema(schema)
      .csv(input)
  }

  def save(df: DataFrame, output: String, seq: String = "\t"): Unit = {
    df.printSchema()
    df.write
      .mode(SaveMode.Overwrite)
      .option(HEADER, "false")
      .option(DELIMITER, seq)
      .csv(output)
  }

  def defaultCheckpointDir: Option[String] = {
    val sparkContext = SparkContext.getOrCreate()
    sparkContext.getConf.getOption("spark.yarn.stagingDir")
      .map { base =>
        new Path(base, s".sparkStaging/${sparkContext.getConf.getAppId}").toString
      }
  }
} 
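
A short usage sketch for GraphIO, assuming placeholder input and output paths and a local master: load expects one edge per line ("src<sep>dst" plus an optional weight column), and save writes tab-separated CSV with SaveMode.Overwrite.

import com.tencent.angel.sona.graph.utils.GraphIO
import org.apache.spark.sql.SparkSession

object GraphIOExample {
  def main(args: Array[String]): Unit = {
    // GraphIO.load calls SparkSession.builder().getOrCreate() internally, so configure the session first.
    SparkSession.builder().master("local[*]").appName("GraphIOExample").getOrCreate()

    // Placeholder paths; each input line is "src<sep>dst" with an optional weight column.
    val edges = GraphIO.load("/tmp/edges.txt", isWeighted = true, sep = " ")
    GraphIO.save(edges, "/tmp/edges_out")  // tab-separated CSV written with SaveMode.Overwrite
  }
}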
Example 101
Source File: BigFileDatasource.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow.sql

import java.net.URI
import java.util.ServiceLoader

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

import io.projectglow.common.{GlowLogging, WithUtils}


  def write(rdd: RDD[Array[Byte]], path: String): Unit = {
    val uri = new URI(path)
    uploaders.find(_.canUpload(rdd.sparkContext.hadoopConfiguration, path)) match {
      case Some(uploader) => uploader.upload(rdd, path)
      case None =>
        logger.info(s"Could not find a parallel uploader for $path, uploading from the driver")
        writeFileFromDriver(new Path(uri), rdd)
    }
  }

  private def writeFileFromDriver(path: Path, byteRdd: RDD[Array[Byte]]): Unit = {
    val sc = byteRdd.sparkContext
    val fs = path.getFileSystem(sc.hadoopConfiguration)
    WithUtils.withCloseable(fs.create(path)) { stream =>
      WithUtils.withCachedRDD(byteRdd) { cachedRdd =>
        cachedRdd.count()
        cachedRdd.toLocalIterator.foreach { chunk =>
          stream.write(chunk)
        }
      }
    }
  }
} 
Example 102
Source File: BigFileDatasourceSuite.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow.sql

import java.nio.file.{Files, Paths}

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SaveMode}

class BigFileDatasourceSuite extends GlowBaseTest {
  test("save mode: append") {
    val outFile = Files.createTempFile("tmp", ".tmp").toString
    val e = intercept[RuntimeException] {
      spark
        .emptyDataFrame
        .write
        .mode(SaveMode.Append)
        .format("io.projectglow.sql.DummyBigFileDatasource")
        .save(outFile)
    }
    assert(
      e.getMessage
        .contains("Append mode is not supported by io.projectglow.sql.DummyBigFileDatasource"))
  }

  test("save mode: overwrite") {
    val outDir = Files.createTempDirectory("tmp").toString
    spark
      .emptyDataFrame
      .write
      .mode(SaveMode.Overwrite)
      .format("io.projectglow.sql.DummyBigFileDatasource")
      .save(outDir)

    val filePath = Paths.get(outDir)
    assert(Files.isRegularFile(filePath))
    val writtenBytes = Files.readAllBytes(filePath)
    assert(writtenBytes.toSeq == Seq(0, 1, 2).map(_.toByte))
  }

  test("save mode: error if exists") {
    val outFile = Files.createTempFile("tmp", ".tmp").toString
    val e = intercept[RuntimeException] {
      spark
        .emptyDataFrame
        .write
        .mode(SaveMode.ErrorIfExists)
        .format("io.projectglow.sql.DummyBigFileDatasource")
        .save(outFile)
    }
    assert(e.getMessage.contains(s"Path $outFile already exists"))
  }

  test("save mode: ignore") {
    val outDir = Files.createTempDirectory("tmp").toString
    spark
      .emptyDataFrame
      .write
      .mode(SaveMode.Ignore)
      .format("io.projectglow.sql.DummyBigFileDatasource")
      .save(outDir)

    val dirPath = Paths.get(outDir)
    assert(Files.isDirectory(dirPath))
  }
}

class DummyBigFileDatasource extends BigFileDatasource {
  override def serializeDataFrame(
      options: Map[String, String],
      data: DataFrame): RDD[Array[Byte]] = {
    data.sqlContext.sparkContext.parallelize(Seq(Array(0, 1, 2).map(_.toByte)))
  }
} 
Example 103
Source File: CreateZipcodesSpark.scala    From apache-spark-test   with Apache License 2.0 5 votes vote down vote up
package com.github.dnvriend

import akka.actor.ActorSystem
import akka.event.{ Logging, LoggingAdapter }
import akka.stream.{ ActorMaterializer, Materializer }
import org.apache.spark.sql.{ SaveMode, SparkSession }

import scala.concurrent.ExecutionContext

object CreateZipcodesSpark extends App {
  implicit val system: ActorSystem = ActorSystem()
  implicit val mat: Materializer = ActorMaterializer()
  implicit val ec: ExecutionContext = system.dispatcher
  implicit val log: LoggingAdapter = Logging(system, this.getClass)

  val spark = SparkSession.builder()
    .config("spark.sql.warehouse.dir", "file:/tmp/spark-warehouse")
    .config("spark.cores.max", "4")
    .config("spark.scheduler.mode", "FAIR")
    .config("spark.sql.crossJoin.enabled", "true")
    .master("local[*]") // use as many threads as cores
    .appName("CreateZipcodesSpark").getOrCreate()

  import spark.implicits._

  // define a Dataset for the district range
  val districts = spark.sparkContext.parallelize(1000 to 9000).map(_.toString).toDS
  // create temp view
  districts.createOrReplaceTempView("districts")

  // define a Dataset with a range for the letters
  val l1 = spark.sparkContext.parallelize('A' to 'Z').map(_.toString).toDS
  l1.createOrReplaceTempView("l1")

  // join the letters
  val letters = spark.sql("SELECT concat(a.value, b.value) letters from l1 a join l1 b")
  // define temp view
  letters.createOrReplaceTempView("letters")

  // define a Dataset for the house numbers
  val houses = spark.sparkContext.makeRDD(1 to 399).toDS
  // create temp view
  houses.createOrReplaceTempView("houses")

  // join letters and houses
  val lettersWithHouseNr = spark.sql(
    """
      |SELECT CONCAT(letters, '-', nr) letterswithhousenr FROM letters
      |JOIN
      |(SELECT format_string("%03d", value) nr FROM houses)
    """.stripMargin
  )
  // create temp view
  lettersWithHouseNr.createOrReplaceTempView("lwh")

  // join the districts with the letter/house-number combinations
  val tickets = spark.sql("SELECT concat(value, letterswithhousenr) value FROM districts JOIN lwh LIMIT 5000000")
  tickets.write.mode(SaveMode.Overwrite).parquet("/tmp/tickets_spark.parquet")
  shutdown

  def shutdown: Unit = {
    spark.stop()
    system.terminate()
  }
  sys.addShutdownHook(shutdown)
} 
Example 104
Source File: DefaultSource.scala    From spark-vector   with Apache License 2.0 5 votes vote down vote up
package com.actian.spark_vector.sql

import org.apache.spark.sql.{ DataFrame, SQLContext, SaveMode }
import org.apache.spark.sql.sources.{ BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider, SchemaRelationProvider }
import org.apache.spark.sql.types.StructType

import com.actian.spark_vector.util.Logging
import com.actian.spark_vector.vector.VectorJDBC

class DefaultSource extends DataSourceRegister with RelationProvider with SchemaRelationProvider with CreatableRelationProvider with Logging {
  override def shortName(): String = "vector"

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation =
    VectorRelation(TableRef(parameters), sqlContext, parameters)

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation =
    VectorRelation(TableRef(parameters), Some(schema), sqlContext, parameters)

  override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = {
    val tableRef = TableRef(parameters)
    val table = VectorRelation(tableRef, sqlContext, parameters)

    mode match {
      case SaveMode.Overwrite =>
        table.insert(data, true)
      case SaveMode.ErrorIfExists =>
        val isEmpty = VectorJDBC.withJDBC(tableRef.toConnectionProps) { _.isTableEmpty(tableRef.table) }
        if (isEmpty) {
          table.insert(data, false)
        } else {
          throw new UnsupportedOperationException("Writing to a non-empty Vector table is not allowed with mode ErrorIfExists.")
        }
      case SaveMode.Append =>
        table.insert(data, false)
      case SaveMode.Ignore =>
        val isEmpty = VectorJDBC.withJDBC(tableRef.toConnectionProps) { _.isTableEmpty(tableRef.table) }
        if (isEmpty) {
          table.insert(data, false)
        }
    }

    table
  }
} 
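
A hedged usage sketch for the Vector provider above: the format name "vector" comes from shortName(), but the "table" option key shown is an assumption, since the snippet only shows parameters being passed to TableRef.

import org.apache.spark.sql.{SaveMode, SparkSession}

object VectorWriteExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("VectorWriteExample").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "x"), (2, "y")).toDF("id", "value")
    df.write
      .format("vector")             // shortName() declared by the DefaultSource above
      .option("table", "my_table")  // assumed option key; TableRef(parameters) defines what the connector actually reads
      .mode(SaveMode.Append)        // handled above as table.insert(data, false)
      .save()
  }
}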
Example 105
Source File: SparkSqlRunner.scala    From amaterasu   with Apache License 2.0 5 votes vote down vote up
package org.apache.amaterasu.executor.execution.actions.runners.spark.SparkSql

import java.io.File

import org.apache.amaterasu.common.execution.actions.Notifier
import org.apache.amaterasu.common.logging.Logging
import org.apache.amaterasu.common.runtime.Environment
import org.apache.commons.io.FilenameUtils
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession}



  def findFileType(folderName: File): Array[String] = {
    // get all the files from a directory
    val files: Array[File] = folderName.listFiles()
    val extensions: Array[String] = files.map(file => FilenameUtils.getExtension(file.toString))
    extensions
  }

}

object SparkSqlRunner {

  def apply(env: Environment,
            jobId: String,
            actionName: String,
            notifier: Notifier,
            sc: SparkContext): SparkSqlRunner = {

    val sparkSqlRunnerObj = new SparkSqlRunner

    sparkSqlRunnerObj.env = env
    sparkSqlRunnerObj.jobId = jobId
    sparkSqlRunnerObj.actionName = actionName
    sparkSqlRunnerObj.notifier = notifier
    sparkSqlRunnerObj.sc = sc
    sparkSqlRunnerObj.spark = SparkSession.builder().config(sc.getConf).enableHiveSupport().getOrCreate()
    sparkSqlRunnerObj
  }
} 
Example 106
Source File: SparkSQL.scala    From MaxCompute-Spark   with Apache License 2.0 5 votes vote down vote up
package com.aliyun.odps.spark.examples.sparksql

import org.apache.spark.sql.{SaveMode, SparkSession}

object SparkSQL {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder()
      .appName("SparkSQL-on-MaxCompute")
      .config("spark.sql.broadcastTimeout", 20 * 60)
      .config("spark.sql.crossJoin.enabled", true)
      .config("odps.exec.dynamic.partition.mode", "nonstrict")
      .config("spark.sql.catalogImplementation", "odps")
      .getOrCreate()

    val project = spark.conf.get("odps.project.name")

    import spark._
    import sqlContext.implicits._
    val tableName = "mc_test_table"
    val ptTableName = "mc_test_pt_table"
    // Drop Create
    sql(s"DROP TABLE IF EXISTS ${tableName}")
    sql(s"DROP TABLE IF EXISTS ${ptTableName}")

    sql(s"CREATE TABLE ${tableName} (name STRING, num BIGINT)")
    sql(s"CREATE TABLE ${ptTableName} (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)")

    val df = spark.sparkContext.parallelize(0 to 99, 2).map(f => {
      (s"name-$f", f)
    }).toDF("name", "num")

    val ptDf = spark.sparkContext.parallelize(0 to 99, 2).map(f => {
      (s"name-$f", f, "2018", "0601")
    }).toDF("name", "num", "pt1", "pt2")

    // Write to the non-partitioned table
    df.write.insertInto(tableName) // insertInto semantics
    df.write.mode("overwrite").insertInto(tableName) // insertOverwrite semantics

    // Write to the partitioned table
    // DataFrameWriter cannot write into a specific partition directly; register a temp view and insert into the target partition with SQL
    df.createOrReplaceTempView(s"${ptTableName}_tmp_view")
    sql(s"insert into table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view")
    sql(s"insert overwrite table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view")

    ptDf.write.insertInto(ptTableName) // dynamic partitions, insertInto semantics
    ptDf.write.mode("overwrite").insertInto(ptTableName) // dynamic partitions, insertOverwrite semantics

    // Read the non-partitioned table
    val rdf = sql(s"select name, num from $tableName")
    println(s"rdf count, ${rdf.count()}")
    rdf.printSchema()

    // Read the partitioned table
    val rptdf = sql(s"select name, num, pt1, pt2 from $ptTableName where pt1 = '2018' and pt2 = '0601'")
    println(s"rptdf count, ${rptdf.count()}")
    rptdf.printSchema()
  }
} 
Example 107
Source File: DataFrameToFileWriter.scala    From seahorse-workflow-executor   with Apache License 2.0 5 votes vote down vote up
package io.deepsense.deeplang.doperations.readwritedataframe.filestorage

import org.apache.spark.SparkException
import io.deepsense.commons.utils.LoggerForCallerClass
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.doperations.exceptions.WriteFileException
import io.deepsense.deeplang.doperations.inout.OutputFileFormatChoice.Csv
import io.deepsense.deeplang.doperations.inout.OutputStorageTypeChoice
import io.deepsense.deeplang.doperations.readwritedataframe.{FilePath, FilePathFromLibraryPath, FileScheme}
import io.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv.CsvSchemaStringifierBeforeCsvWriting
import io.deepsense.deeplang.exceptions.DeepLangException
import io.deepsense.deeplang.{ExecutionContext, FileSystemClient}
import org.apache.spark.sql.SaveMode

object DataFrameToFileWriter {

  val logger = LoggerForCallerClass()

  def writeToFile(
      fileChoice: OutputStorageTypeChoice.File,
      context: ExecutionContext,
      dataFrame: DataFrame): Unit = {
    implicit val ctx = context

    val path = FileSystemClient.replaceLeadingTildeWithHomeDirectory(fileChoice.getOutputFile())
    val filePath = FilePath(path)
    val saveMode = if (fileChoice.getShouldOverwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists

    try {
      val preprocessed = fileChoice.getFileFormat() match {
        case _: Csv => CsvSchemaStringifierBeforeCsvWriting.preprocess(dataFrame)
        case _ => dataFrame
      }
      writeUsingProvidedFileScheme(fileChoice, preprocessed, filePath, saveMode)
    } catch {
      case e: SparkException =>
        logger.error(s"WriteDataFrame error: Spark problem. Unable to write file to $path", e)
        throw WriteFileException(path, e)
    }
  }

  private def writeUsingProvidedFileScheme(
      fileChoice: OutputStorageTypeChoice.File, dataFrame: DataFrame, path: FilePath, saveMode: SaveMode
    )(implicit context: ExecutionContext): Unit = {
    import FileScheme._
    path.fileScheme match {
      case Library =>
        val filePath = FilePathFromLibraryPath(path)
        val FilePath(_, libraryPath) = filePath
        new java.io.File(libraryPath).getParentFile.mkdirs()
        writeUsingProvidedFileScheme(fileChoice, dataFrame, filePath, saveMode)
      case FileScheme.File => DriverFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode)
      case HDFS => ClusterFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode)
      case HTTP | HTTPS | FTP => throw NotSupportedScheme(path.fileScheme)
    }
  }

  case class NotSupportedScheme(fileScheme: FileScheme)
    extends DeepLangException(s"Not supported file scheme ${fileScheme.pathPrefix}")

} 
Example 108
Source File: WriteDataFrame.scala    From seahorse-workflow-executor   with Apache License 2.0 5 votes vote down vote up
package io.deepsense.deeplang.doperations

import java.io.IOException
import java.util.Properties

import scala.reflect.runtime.{universe => ru}
import io.deepsense.commons.utils.Version
import io.deepsense.deeplang.DOperation.Id
import io.deepsense.deeplang._
import io.deepsense.deeplang.documentation.OperationDocumentation
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.doperations.exceptions.DeepSenseIOException
import io.deepsense.deeplang.doperations.inout._
import io.deepsense.deeplang.doperations.readwritedataframe.filestorage.DataFrameToFileWriter
import io.deepsense.deeplang.doperations.readwritedataframe.googlestorage.DataFrameToGoogleSheetWriter
import io.deepsense.deeplang.doperations.readwritedataframe.validators.{FilePathHasValidFileScheme, ParquetSupportedOnClusterOnly}
import io.deepsense.deeplang.inference.{InferContext, InferenceWarnings}
import io.deepsense.deeplang.params.choice.ChoiceParam
import io.deepsense.deeplang.params.{Param, Params}
import org.apache.spark.sql.SaveMode

class WriteDataFrame()
  extends DOperation1To0[DataFrame]
  with Params
  with OperationDocumentation {

  override val id: Id = "9e460036-95cc-42c5-ba64-5bc767a40e4e"
  override val name: String = "Write DataFrame"
  override val description: String = "Writes a DataFrame to a file or database"

  override val since: Version = Version(0, 4, 0)

  @transient
  override lazy val tTagTI_0: ru.TypeTag[DataFrame] = ru.typeTag[DataFrame]

  val storageType = ChoiceParam[OutputStorageTypeChoice](
    name = "data storage type",
    description = Some("Storage type."))

  def getStorageType(): OutputStorageTypeChoice = $(storageType)
  def setStorageType(value: OutputStorageTypeChoice): this.type = set(storageType, value)

  val params: Array[Param[_]] = Array(storageType)
  setDefault(storageType, new OutputStorageTypeChoice.File())

  override def execute(dataFrame: DataFrame)(context: ExecutionContext): Unit = {
    import OutputStorageTypeChoice._
    try {
      getStorageType() match {
        case jdbcChoice: Jdbc => writeToJdbc(jdbcChoice, context, dataFrame)
        case googleSheetChoice: GoogleSheet => DataFrameToGoogleSheetWriter.writeToGoogleSheet(
          googleSheetChoice, context, dataFrame
        )
        case fileChoice: File => DataFrameToFileWriter.writeToFile(fileChoice, context, dataFrame)
      }
    } catch {
      case e: IOException =>
        logger.error(s"WriteDataFrame error. Could not write file to designated storage", e)
        throw DeepSenseIOException(e)
    }
  }

  private def writeToJdbc(
      jdbcChoice: OutputStorageTypeChoice.Jdbc,
      context: ExecutionContext,
      dataFrame: DataFrame): Unit = {
    val properties = new Properties()
    properties.setProperty("driver", jdbcChoice.getJdbcDriverClassName)

    val jdbcUrl = jdbcChoice.getJdbcUrl
    val jdbcTableName = jdbcChoice.getJdbcTableName
    val saveMode = if (jdbcChoice.getShouldOverwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists

    dataFrame.sparkDataFrame.write.mode(saveMode).jdbc(jdbcUrl, jdbcTableName, properties)
  }

  override def inferKnowledge(k0: DKnowledge[DataFrame])(context: InferContext): (Unit, InferenceWarnings) = {
    FilePathHasValidFileScheme.validate(this)
    ParquetSupportedOnClusterOnly.validate(this)
    super.inferKnowledge(k0)(context)
  }
}
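
The JDBC branch above reduces to the standard DataFrameWriter.jdbc call. A standalone sketch outside the Seahorse operation wrappers, with placeholder URL, table name, and driver class:

import java.util.Properties
import org.apache.spark.sql.{SaveMode, SparkSession}

object JdbcWriteExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("JdbcWriteExample").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

    val properties = new Properties()
    properties.setProperty("driver", "org.postgresql.Driver")  // placeholder driver class

    // Overwrite vs. ErrorIfExists, mirroring the getShouldOverwrite branch above.
    val saveMode = SaveMode.Overwrite
    df.write.mode(saveMode).jdbc("jdbc:postgresql://localhost:5432/mydb", "my_table", properties)
  }
}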