org.apache.spark.sql.streaming.DataStreamReader Scala Examples

The following examples show how to use org.apache.spark.sql.streaming.DataStreamReader. You can go to the original project or source file by following the links above each example.
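For orientation: a DataStreamReader is obtained from SparkSession.readStream; format and option calls configure the streaming source, and load() returns an unbounded DataFrame. A minimal sketch, for example in spark-shell where the SparkSession spark is already available (the socket host and port are illustrative):

import org.apache.spark.sql.DataFrame

// spark.readStream returns a DataStreamReader; format/option configure the source, load() returns a streaming DataFrame
val lines: DataFrame = spark.readStream
  .format("socket")
  .option("host", "localhost")
  .option("port", 9999)
  .load()

// a streaming DataFrame only starts running once it is handed to a DataStreamWriter and started
val query = lines.writeStream.format("console").start()
query.awaitTermination()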
Example 1
Source File: SparkImplicits.scala    From apache-spark-test    with Apache License 2.0
package com.github.dnvriend.spark.datasources

import java.util.Properties

import akka.NotUsed
import akka.stream.Materializer
import akka.stream.scaladsl.{ Sink, Source }
import org.apache.spark.sql._
import org.apache.spark.sql.streaming.DataStreamReader

import scala.collection.immutable._
import scala.concurrent.duration.{ FiniteDuration, _ }
import scala.concurrent.{ Await, Future }
import scala.reflect.runtime.universe._
import slick.driver.PostgresDriver.api._

object SparkImplicits {
  // Adds shorthand load methods to DataFrameReader for the custom "helloworld" and "person" sources and for JDBC tables
  implicit class DataSourceOps(dfr: DataFrameReader) {
    def helloworld(path: String): DataFrame = dfr.format("helloworld").load(path)
    def person(path: String): DataFrame = dfr.format("person").load(path)
    def jdbc(table: String)(implicit jdbcOptions: Map[String, String]): DataFrame =
      dfr.format("jdbc").options(jdbcOptions ++ Map("dbtable" -> table)).load()
  }

  // Adds shorthand load methods to DataStreamReader for the custom streaming sources backed by the Akka Persistence JDBC read journal
  implicit class DataStreamReaderOps(dsr: DataStreamReader) {
    def currentPersistenceIds(path: String = "jdbc-read-journal"): DataFrame = dsr.format("current-persistence-id").load(path)
    def eventsByPersistenceId(path: String = "jdbc-read-journal"): DataFrame = dsr.format("current-events-by-persistence-id").load(path)
  }

  implicit class DataFrameWriterOps[T](dfw: DataFrameWriter[T]) {
    def ignore: DataFrameWriter[T] = dfw.mode(SaveMode.Ignore)

    def jdbc(table: String)(implicit jdbcOptions: Map[String, String]) = {
      val properties = jdbcOptions.foldLeft(new Properties) { case (prop, (k, v)) => prop.put(k, v); prop }
      dfw.jdbc(jdbcOptions("url"), table, properties)
      // does not (yet) work see: https://issues.apache.org/jira/browse/SPARK-7646
      // dfw.format("jdbc").mode(SaveMode.Overwrite).options(jdbcOptions ++ Map("dbtable" -> table))
    }
  }

  trait DataFrameQueryGenerator[A] {
    def upsert: String
  }

  // Adds an upsert helper to DataFrame: executes the generated statement against a Slick Database and returns the DataFrame for chaining
  implicit class DatasetOps(df: DataFrame) {
    def withSession[A](db: Database)(f: Session => A): A = {
      val session = db.createSession()
      try f(session) finally session.close()
    }

    def withStatement[A](db: Database)(f: java.sql.Statement => A): A =
      withSession(db)(session => session.withStatement()(f))

    def upsert[A](table: String)(implicit db: Database, dfq: DataFrameQueryGenerator[A]): DataFrame = withStatement(db) { stmt =>
      stmt.executeUpdate(dfq.upsert)
      df
    }
  }

  implicit class SparkSessionOps(spark: SparkSession) {
    // Blocks until the Future completes (at most `_timeout`, defaulting to 15 minutes) and converts the result into a DataFrame
    def fromFuture[A <: Product: TypeTag](data: Future[Seq[A]])(implicit _timeout: FiniteDuration = null): DataFrame =
      spark.createDataFrame(Await.result(data, Option(_timeout).getOrElse(15.minutes)))

    def fromSource[A <: Product: TypeTag](data: Source[A, NotUsed])(implicit _timeout: FiniteDuration = null, mat: Materializer): DataFrame =
      fromFuture(data.runWith(Sink.seq))
  }
}
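The implicit classes above are meant to be pulled in with a single import. A hypothetical usage sketch, not part of the original file: the connection settings, table names and app name are placeholders, and the custom "current-events-by-persistence-id" source must be registered on the classpath.

import com.github.dnvriend.spark.datasources.SparkImplicits._
import org.apache.spark.sql.SparkSession

object SparkImplicitsUsage extends App {
  val spark = SparkSession.builder().master("local[*]").appName("implicits-usage").getOrCreate()

  // DataStreamReaderOps: stream events from the Akka Persistence JDBC read journal (default path "jdbc-read-journal")
  val events = spark.readStream.eventsByPersistenceId()

  // DataSourceOps and DataFrameWriterOps pick up their connection settings from an implicit options map
  implicit val jdbcOptions: Map[String, String] = Map(
    "url"      -> "jdbc:postgresql://localhost:5432/docker",
    "user"     -> "docker",
    "password" -> "docker",
    "driver"   -> "org.postgresql.Driver"
  )
  val people = spark.read.jdbc("people")
  people.write.ignore.jdbc("people_copy")

  spark.stop()
}

Because the one-argument jdbc(table) signature does not exist on DataFrameReader or DataFrameWriter, the calls above resolve through the implicit extension classes, which merge the "dbtable" entry into the implicit options map.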