/*
 * Modifications copyright (C) 2019 Cloudera Inc
 */
package com.cloudera.streaming.refapp.kudu

import org.apache.kudu.spark.kudu.KuduContext
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.slf4j.LoggerFactory

import scala.util.control.NonFatal

object KuduSink {
  def withDefaultContext(sqlContext: SQLContext, parameters: Map[String, String]): KuduSink =
    new KuduSink(new KuduContext(parameters("kudu.master"), sqlContext.sparkContext), parameters)
}

/**
 * A simple Structured Streaming sink which writes the data frame to Kudu.
 * It preserves exactly-once semantics, as it is idempotent in the face of
 * multiple attempts to add the same batch.
 *
 * It uses the following parameters:
 * kudu.master - host:port pair of a Kudu master node
 * kudu.table - full table name
 * checkpointLocation - where the checkpoint will be stored
 * retries - how many times a failed upsert is retried (optional, defaults to 1)
 */
class KuduSink(initKuduContext: => KuduContext, parameters: Map[String, String]) extends Sink {

  private val logger = LoggerFactory.getLogger(getClass)

  private var kuduContext = initKuduContext

  private val tablename = parameters("kudu.table")

  private val retries = parameters.getOrElse("retries", "1").toInt
  require(retries >= 0, "retries must be non-negative")

  logger.info(s"Created Kudu sink writing to table $tablename")

  override def addBatch(batchId: Long, data: DataFrame): Unit = {
    // Upsert the micro-batch into Kudu; on a transient failure, recreate the
    // KuduContext and retry up to `retries` times before giving up.
    for (attempt <- 0 to retries) {
      try {
        kuduContext.upsertRows(data, tablename)
        return
      } catch {
        case NonFatal(e) =>
          if (attempt < retries) {
            logger.warn("Kudu upsert error, retrying...", e)
            kuduContext = initKuduContext
          } else {
            logger.error("Kudu upsert error, retries exhausted", e)
            throw e
          }
      }
    }
  }
}
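/*
 * Sketch only, not part of the original file: one possible way to make this sink usable
 * from spark.writeStream is to expose it through Spark's StreamSinkProvider SPI.
 * The provider class below (and its name "KuduSinkProvider") is an assumption for
 * illustration; the surrounding project may register its sink differently.
 */
import org.apache.spark.sql.sources.StreamSinkProvider
import org.apache.spark.sql.streaming.OutputMode

class KuduSinkProvider extends StreamSinkProvider {
  // Delegates to KuduSink.withDefaultContext so the sink picks up kudu.master,
  // kudu.table and retries from the writeStream options map.
  override def createSink(sqlContext: SQLContext,
                          parameters: Map[String, String],
                          partitionColumns: Seq[String],
                          outputMode: OutputMode): Sink =
    KuduSink.withDefaultContext(sqlContext, parameters)
}

/*
 * Hypothetical usage (host, table and checkpoint path are placeholders):
 *
 *   df.writeStream
 *     .format("com.cloudera.streaming.refapp.kudu.KuduSinkProvider")
 *     .option("kudu.master", "kudu-master-host:7051")
 *     .option("kudu.table", "impala::default.my_table")
 *     .option("checkpointLocation", "/tmp/checkpoints/my_table")
 *     .outputMode("update")
 *     .start()
 */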