org.apache.spark.sql.ForeachWriter Scala Examples

The following examples show how to use org.apache.spark.sql.ForeachWriter. Each example notes the project and license it was taken from.
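A ForeachWriter implements a three-method contract: open(partitionId, version) acquires per-partition resources and returns whether the partition should be processed, process(value) is called once per row, and close(errorOrNull) releases resources (errorOrNull is non-null if processing failed). The minimal sketch below illustrates the contract and how a writer is attached via writeStream.foreach; the ConsoleRowWriter class and the streamingDF DataFrame are illustrative names, not taken from the examples.

import org.apache.spark.sql.{ForeachWriter, Row}

// A minimal writer that prints each row to stdout.
class ConsoleRowWriter extends ForeachWriter[Row] {
  override def open(partitionId: Long, version: Long): Boolean = {
    // Acquire per-partition resources here; return false to skip this partition.
    true
  }

  override def process(value: Row): Unit = {
    // Invoked once for every row in the partition.
    println(value)
  }

  override def close(errorOrNull: Throwable): Unit = {
    // Release resources; errorOrNull is non-null when processing of the partition failed.
  }
}

// Attaching the writer to a streaming DataFrame (streamingDF is assumed to exist):
// val query = streamingDF.writeStream.foreach(new ConsoleRowWriter).start()
// query.awaitTermination()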
Example 1
Source File: DNSstat.scala    From jdbcsink   with Apache License 2.0
import org.apache.spark.sql.SparkSession
import java.util.Properties
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.{from_json,window}
import java.sql.{Connection,Statement,DriverManager}
import org.apache.spark.sql.ForeachWriter
import org.apache.spark.sql.Row

class JDBCSink() extends ForeachWriter[Row] {
  val driver = "com.mysql.jdbc.Driver"
  var connection: Connection = _
  var statement: Statement = _

  def open(partitionId: Long, version: Long): Boolean = {
    Class.forName(driver)
    connection = DriverManager.getConnection("jdbc:mysql://10.88.1.102:3306/aptwebservice", "root", "mysqladmin")
    statement = connection.createStatement
    true
  }

  def process(value: Row): Unit = {
    statement.executeUpdate("replace into DNSStat(ip,domain,time,count) values("
      + "'" + value.getString(0) + "'" + ","    // ip
      + "'" + value.getString(1) + "'" + ","    // domain
      + "'" + value.getTimestamp(2) + "'" + "," // time
      + value.getLong(3)                        // count
      + ")")
  }

  def close(errorOrNull: Throwable): Unit = {
    connection.close()
  }
}

object DNSstatJob{

  val schema: StructType = StructType(
    Seq(
      StructField("Vendor", StringType, true),
      StructField("Id", IntegerType, true),
      StructField("Time", LongType, true),
      StructField("Conn", StructType(Seq(
        StructField("Proto", IntegerType, true),
        StructField("Sport", IntegerType, true),
        StructField("Dport", IntegerType, true),
        StructField("Sip", StringType, true),
        StructField("Dip", StringType, true)
      )), true),
      StructField("Dns", StructType(Seq(
        StructField("Domain", StringType, true),
        StructField("IpCount", IntegerType, true),
        StructField("Ip", StringType, true)
      )), true)
    ))

    def main(args: Array[String]) {
    val spark = SparkSession
          .builder
          .appName("DNSJob")
          .config("spark.some.config.option", "some-value")
          .getOrCreate()
    import spark.implicits._
    val connectionProperties = new Properties()
    connectionProperties.put("user", "root")
    connectionProperties.put("password", "mysqladmin")
    val bruteForceTab = spark.read
                .jdbc("jdbc:mysql://10.88.1.102:3306/aptwebservice", "DNSTab",connectionProperties)
    bruteForceTab.registerTempTable("DNSTab")
    val lines = spark
          .readStream
          .format("kafka")
          .option("kafka.bootstrap.servers", "10.94.1.110:9092")
          .option("subscribe","xdr")
          //.option("startingOffsets","earliest")
          .option("startingOffsets","latest")
          .load()
          .select(from_json($"value".cast(StringType),schema).as("jsonData"))
    lines.registerTempTable("xdr")
    val filterDNS = spark.sql("select CAST(from_unixtime(xdr.jsonData.Time DIV 1000000) as timestamp) as time,xdr.jsonData.Conn.Sip as sip, xdr.jsonData.Dns.Domain from xdr inner join DNSTab on xdr.jsonData.Dns.domain = DNSTab.domain")
    
    val windowedCounts = filterDNS
                        .withWatermark("time","5 minutes")
                        .groupBy(window($"time", "1 minutes", "1 minutes"),$"sip",$"domain")
                        .count()
                        .select($"sip",$"domain",$"window.start",$"count")

    val writer = new JDBCSink()
    val query = windowedCounts
      .writeStream
      .foreach(writer)
      .outputMode("update")
      .option("checkpointLocation", "/checkpoint/")
      .start()

    query.awaitTermination()
  }
}
Example 2
Source File: JDBCSink.scala    From BigData-News   with Apache License 2.0
package com.vita.spark

import java.sql.{Connection, ResultSet, SQLException, Statement}

import org.apache.log4j.{LogManager, Logger}
import org.apache.spark.sql.{ForeachWriter, Row}

/**
  * Writes data from Structured Streaming into MySQL.
  */
class JDBCSink(url: String, username: String, password: String) extends ForeachWriter[Row] {

  var statement: Statement = _
  var resultSet: ResultSet = _
  var connection: Connection = _

  override def open(partitionId: Long, version: Long): Boolean = {
    connection = new MySqlPool(url, username, password).getJdbcConn()
    statement = connection.createStatement()
    print("open")
    true
  }

  override def process(value: Row): Unit = {
    println("process step one")
    val titleName = value.getAs[String]("titleName").replaceAll("[\\[\\]]", "")
    val count = value.getAs[Long]("count")

    val querySql = "select 1 from webCount where titleName = '" + titleName + "'"
    val insertSql = "insert into webCount(titleName,count) values('" + titleName + "' , '" + count + "')"
    val updateSql = "update webCount set count = " + count + " where titleName = '" + titleName + "'"
    println("process step two")
    try {
      // Check whether the record already exists: update if so, insert otherwise.
      val resultSet = statement.executeQuery(querySql)
      if (resultSet.next()) {
        println("updateSql")
        statement.executeUpdate(updateSql)
      } else {
        println("insertSql")
        statement.execute(insertSql)
      }
    } catch {
      // More specific exception types must come before their supertypes,
      // otherwise the later cases are unreachable.
      case ex: SQLException =>
        println("SQLException")
      case ex: RuntimeException =>
        println("RuntimeException")
      case ex: Exception =>
        println("Exception")
      case ex: Throwable =>
        println("Throwable")
    }
  }

  override def close(errorOrNull: Throwable): Unit = {
    if (statement != null) {
      statement.close()
    }
    if (connection != null) {
      connection.close()
    }
  }
} 
Example 3
Source File: RedisWriteKafkaOffset.scala    From BigData-News   with Apache License 2.0
package com.vita.spark.streaming.writer

import com.vita.Constants
import com.vita.redies.RedisSingle
import org.apache.spark.sql.{ForeachWriter, Row}

class RedisWriteKafkaOffset extends ForeachWriter[Row] {
  var redisSingle: RedisSingle = _

  override def open(partitionId: Long, version: Long): Boolean = {
    redisSingle = new RedisSingle()
    redisSingle.init(Constants.IP, Constants.PORT)
    true
  }

  override def process(value: Row): Unit = {
    val offset = value.getAs[String]("offset")
    redisSingle.set(Constants.REDIDS_KEY, offset)
  }

  override def close(errorOrNull: Throwable): Unit = {
    redisSingle.getJedis().close()
    redisSingle.getPool().close()
  }
} 
Example 4
Source File: 4-jdbcsink.scala    From Azure-Databricks-NYC-Taxi-Workshop   with MIT License
// Databricks notebook source
// MAGIC %md
// MAGIC # JDBC Sink for Structured Streaming
// MAGIC Structured Streaming does not feature a JDBC sink currently.<br>
// MAGIC The following is a custom sink we will use in the lab.

// COMMAND ----------

import java.sql._
import org.apache.spark.sql.ForeachWriter

class JDBCSink(url: String, user: String, pwd: String) extends ForeachWriter[org.apache.spark.sql.Row] {
    val driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver"
    var connection: Connection = _
    var statement: Statement = _

    def open(partitionId: Long, version: Long): Boolean = {
        Class.forName(driver)
        connection = DriverManager.getConnection(url, user, pwd)
        statement = connection.createStatement
        true
    }

    def process(value: org.apache.spark.sql.Row): Unit = {        
    statement.executeUpdate("INSERT INTO chicago_crimes_curated_summary(case_id, primary_type, arrest_made,case_year, case_month, case_day_of_month) VALUES (" 
                 + "'" + value(0) + "'" + "," 
                 + "'" + value(1) + "'" + "," 
                 + "'" + value(2) + "'" + "," 
                 + "'" + value(3) + "'" + "," 
                 + "'" + value(4) + "'" + "," 
                 + "'" + value(5) + "'" + ");")
    }

    def close(errorOrNull:Throwable):Unit = {
        connection.close
    }
}


// COMMAND ---------- 
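The notebook cell above only defines the sink. A minimal sketch of how it might be wired into a streaming query could look like the following; the server URL, credentials, checkpoint path, and the streamingDF DataFrame are placeholders rather than values from the workshop.

// Hypothetical wiring: url, user, password, and streamingDF are placeholders.
val url = "jdbc:sqlserver://<server>:1433;database=<database>"
val writer = new JDBCSink(url, "<user>", "<password>")

val query = streamingDF
  .writeStream
  .foreach(writer)
  .outputMode("append")
  .option("checkpointLocation", "/mnt/checkpoints/chicago-crimes-summary/")
  .start()

query.awaitTermination()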
Example 5
Source File: CassandraForeachWriter.scala    From structured-streaming-application   with Apache License 2.0
package knolx.spark

import com.datastax.driver.core.{Cluster, Session}
import knolx.Config.{cassandraHosts, keyspace}
import org.apache.spark.sql.{ForeachWriter, Row}


object CassandraForeachWriter extends Serializable {
  val writeToCassandra = new ForeachWriter[Row] {
    private var cluster: Cluster = _
    private var session: Session = _

    override def process(row: Row): Unit = {
      val word = row.getString(0)
      val count = row.getLong(1)

      session.execute(s"insert into $keyspace.wordcount (word, count) values ('$word', $count);")
    }

    override def close(errorOrNull: Throwable): Unit = {
      session.close()
      session.getCluster.close()
    }

    override def open(partitionId: Long, version: Long): Boolean = {
      cluster = Cluster.builder.addContactPoints(cassandraHosts).build
      session = cluster.newSession()
      true
    }
  }
} 
Example 6
Source File: EventHubsForeachWriter.scala    From azure-event-hubs-spark   with Apache License 2.0
package org.apache.spark.sql.eventhubs

import com.microsoft.azure.eventhubs.EventData
import org.apache.spark.eventhubs.{ EventHubsConf, EventHubsUtils }
import org.apache.spark.eventhubs.client.Client
import org.apache.spark.eventhubs.utils.MetricPlugin
import org.apache.spark.internal.Logging
import org.apache.spark.sql.ForeachWriter


case class EventHubsForeachWriter(ehConf: EventHubsConf)
    extends ForeachWriter[String]
    with Logging {
  private lazy val metricPlugin: Option[MetricPlugin] = ehConf.metricPlugin()
  var client: Client = _
  var totalMessageSizeInBytes = 0
  var totalMessageCount = 0
  var writerOpenTime = 0L

  def open(partitionId: Long, version: Long): Boolean = {
    log.info(s"open is called. ${EventHubsUtils.getTaskContextSlim}")

    writerOpenTime = System.currentTimeMillis()
    client = EventHubsSourceProvider.clientFactory(ehConf.toMap)(ehConf)
    true
  }

  def process(body: String): Unit = {
    val event = EventData.create(s"$body".getBytes("UTF-8"))
    client.send(event)
    totalMessageCount += 1
    totalMessageSizeInBytes += event.getBytes.length
  }

  def close(errorOrNull: Throwable): Unit = {
    log.info(s"close is called. ${EventHubsUtils.getTaskContextSlim}")

    errorOrNull match {
      case t: Throwable =>
        log.warn(s"an error occurred. eventhub name = ${ehConf.name}, error = ${t.getMessage}")
        closeInner(false)
        throw t
      case _ => closeInner(true)
    }
  }

  private def closeInner(isSuccess: Boolean): Unit = {
    var success = false
    if (client != null) {
      try {
        client.close()
        success = true
      } catch {
        case e: Exception =>
          log.warn(s"an error occurred. eventhub name = ${ehConf.name}, error = ${e.getMessage}")
          throw e
      }
      client = null
    }

    metricPlugin.foreach(
      _.onSendMetric(EventHubsUtils.getTaskContextSlim,
                     ehConf.name,
                     totalMessageCount,
                     totalMessageSizeInBytes,
                     System.currentTimeMillis() - writerOpenTime,
                     isSuccess && success))
  }
} 
Example 7
Source File: KafkaContinuousSourceSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.Properties
import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._
import scala.collection.mutable
import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, ForeachWriter, Row}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.{StreamTest, Trigger}
import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession}

// Run tests in KafkaSourceSuiteBase in continuous execution mode.
class KafkaContinuousSourceSuite extends KafkaSourceSuiteBase with KafkaContinuousTest

class KafkaContinuousSourceTopicDeletionSuite extends KafkaContinuousTest {
  import testImplicits._

  override val brokerProps = Map("auto.create.topics.enable" -> "false")

  test("subscribing topic by pattern with topic deletions") {
    val topicPrefix = newTopic()
    val topic = topicPrefix + "-seems"
    val topic2 = topicPrefix + "-bad"
    testUtils.createTopic(topic, partitions = 5)
    testUtils.sendMessages(topic, Array("-1"))
    require(testUtils.getLatestOffsets(Set(topic)).size === 5)

    val reader = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", testUtils.brokerAddress)
      .option("kafka.metadata.max.age.ms", "1")
      .option("subscribePattern", s"$topicPrefix-.*")
      .option("failOnDataLoss", "false")

    val kafka = reader.load()
      .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
      .as[(String, String)]
    val mapped = kafka.map(kv => kv._2.toInt + 1)

    testStream(mapped)(
      makeSureGetOffsetCalled,
      AddKafkaData(Set(topic), 1, 2, 3),
      CheckAnswer(2, 3, 4),
      Execute { query =>
        testUtils.deleteTopic(topic)
        testUtils.createTopic(topic2, partitions = 5)
        eventually(timeout(streamingTimeout)) {
          assert(
            query.lastExecution.logical.collectFirst {
              case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
            }.exists { r =>
              // Ensure the new topic is present and the old topic is gone.
              r.knownPartitions.exists(_.topic == topic2)
            },
            s"query never reconfigured to new topic $topic2")
        }
      },
      AddKafkaData(Set(topic2), 4, 5, 6),
      CheckAnswer(2, 3, 4, 5, 6, 7)
    )
  }
}

class KafkaContinuousSourceStressForDontFailOnDataLossSuite
    extends KafkaSourceStressForDontFailOnDataLossSuite {
  override protected def startStream(ds: Dataset[Int]) = {
    ds.writeStream
      .format("memory")
      .queryName("memory")
      .trigger(Trigger.Continuous("1 second"))
      .start()
  }
} 
Example 8
Source File: CassandraSinkForeach.scala    From Spark-Structured-Streaming-Examples   with Apache License 2.0
package cassandra.foreachSink

import cassandra.CassandraDriver
import log.LazyLogger
import org.apache.spark.sql.ForeachWriter
import radio.SimpleSongAggregation


class CassandraSinkForeach() extends ForeachWriter[SimpleSongAggregation] with LazyLogger {
  private def cqlRadio(record: SimpleSongAggregation): String = s"""
       insert into ${CassandraDriver.namespace}.${CassandraDriver.foreachTableSink} (title, artist, radio, count)
       values('${record.title}', '${record.artist}', '${record.radio}', ${record.count})"""

  def open(partitionId: Long, version: Long): Boolean = {
    // open connection
    //@TODO command to check if cassandra cluster is up
    true
  }

  //https://github.com/datastax/spark-cassandra-connector/blob/master/doc/1_connecting.md#connection-pooling
  def process(record: SimpleSongAggregation) = {
    log.warn(s"Saving record: $record")
    CassandraDriver.connector.withSessionDo(session =>
      session.execute(cqlRadio(record))
    )
  }

  //https://github.com/datastax/spark-cassandra-connector/blob/master/doc/reference.md#cassandra-connection-parameters

  def close(errorOrNull: Throwable): Unit = {
    // close the connection
    //connection.keep_alive_ms	--> 5000ms :	Period of time to keep unused connections open
  }
} 
Example 9
Source File: StructuredIdentity.scala    From Swallow   with Apache License 2.0
package com.intel.hibench.sparkbench.structuredstreaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.structuredstreaming.util.SparkBenchConfig
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.ForeachWriter
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

class StructuredIdentity() extends StructuredBenchBase {

  override def process(ds: DataFrame, config: SparkBenchConfig) = {

    // Get the singleton instance of SparkSession
    val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate()
    import spark.implicits._

    val query = ds.writeStream
      .foreach(new ForeachWriter[Row] {
        var reporter: KafkaReporter = _

        def open(partitionId: Long, version: Long): Boolean = {
          val reportTopic = config.reporterTopic
          val brokerList = config.brokerList
          reporter = new KafkaReporter(reportTopic, brokerList)
          true
        }

        def close(errorOrNull: Throwable): Unit = {}

        def process(record: Row): Unit = {
          val inTime = record(0).asInstanceOf[String].toLong
          val outTime = System.currentTimeMillis()
          reporter.report(inTime, outTime)
        }
      })
      .start()

    query.awaitTermination()
  }
} 
Example 10
Source File: StructuredRepartition.scala    From Swallow   with Apache License 2.0
package com.intel.hibench.sparkbench.structuredstreaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.structuredstreaming.util.SparkBenchConfig
import org.apache.spark.sql.Column
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.ForeachWriter
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

class StructuredRepartition() extends StructuredBenchBase {

  override def process(ds: DataFrame, config: SparkBenchConfig) = {

    // Get the singleton instance of SparkSession
    val spark = SparkSession.builder.appName("structured " + config.benchName).getOrCreate()
    import spark.implicits._

    val results = ds.repartition(config.coreNumber)
    
    val query = results.writeStream
      .foreach(new ForeachWriter[Row] {
        var reporter: KafkaReporter = _

        def open(partitionId: Long, version: Long): Boolean = {
          val reportTopic = config.reporterTopic
          val brokerList = config.brokerList
          reporter = new KafkaReporter(reportTopic, brokerList)
          true
        }

        def close(errorOrNull: Throwable): Unit = {}

        def process(record: Row): Unit = {
          val inTime = record(0).asInstanceOf[String].toLong
          val outTime = System.currentTimeMillis()
          reporter.report(inTime, outTime)
        }
      })
      .start()

    query.awaitTermination()
  }
}