com.amazonaws.services.kinesis.model.Record Scala Examples
The following examples show how to use com.amazonaws.services.kinesis.model.Record.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: KinesisInputDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.{Duration, StreamingContext, Time}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo

/**
 * Receiver-based input DStream for Kinesis.
 *
 * Every constructor argument except `_ssc` is forwarded unchanged to the
 * `KinesisReceiver` created by `getReceiver()`. `regionName`, `endpointUrl`,
 * `messageHandler` and `awsCredentialsOption` are additionally passed to the
 * `KinesisBackedBlockRDD` created by `createBlockRDD`.
 *
 * @param _ssc                    streaming context handed to the `ReceiverInputDStream` superclass
 * @param streamName              forwarded to the receiver
 * @param endpointUrl             forwarded to the receiver and to the block RDD
 * @param regionName              forwarded to the receiver and to the block RDD
 * @param initialPositionInStream forwarded to the receiver
 * @param checkpointAppName       forwarded to the receiver
 * @param checkpointInterval      forwarded to the receiver
 * @param storageLevel            forwarded to the receiver
 * @param messageHandler          `Record => T` function forwarded to the receiver and to the block RDD
 * @param awsCredentialsOption    optional credentials forwarded to the receiver and to the block RDD
 * @tparam T element type of the stream, i.e. the result type of `messageHandler`
 */
private[kinesis] class KinesisInputDStream[T: ClassTag](
    _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  /**
   * Builds the RDD for one batch.
   *
   * When every block info carries metadata (vacuously true for an empty batch), a
   * `KinesisBackedBlockRDD` is created from the block ids, the metadata cast to
   * `SequenceNumberRanges`, and the per-block validity flags. Otherwise a warning is
   * logged and the superclass implementation is used.
   *
   * @param time       batch time; only used in the debug log message here
   * @param blockInfos block infos received for this batch
   * @return the RDD backing this batch
   */
  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.forall(_.metadataOption.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      // `.get` cannot fail here: the guard above established that every option is defined.
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges]
      }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
        s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        // The batch duration (in milliseconds) is used as the retry timeout.
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  /** Creates the `KinesisReceiver` for this stream from the constructor arguments. */
  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption)
  }
}
Example 2
Source File: Streaming.scala From scala-spark-cab-rides-predictions with MIT License | 5 votes |
import java.nio.charset.StandardCharsets

import com.amazonaws.services.dynamodbv2.document.internal.InternalUtils
import com.amazonaws.services.dynamodbv2.streamsadapter.model.RecordAdapter
import com.amazonaws.services.kinesis.model.Record
import com.google.gson.Gson
import org.apache.spark.sql._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.kinesis.dynamostream.KinesisInitialPositions.Latest
import org.apache.spark.streaming.kinesis.dynamostream.KinesisInputDStream
import org.apache.spark.streaming.{Milliseconds, Seconds, StreamingContext}

/**
 * Local Spark Streaming trial: builds two Kinesis-style input streams ("cab_rides" and
 * "weather"), converts each cab-ride record to a JSON string, reads every micro-batch
 * into a DataFrame with the `CabPrice` schema and prints it.
 */
object Trials extends App {

  import org.apache.log4j.{Level, Logger}

  // Silence everything below ERROR from the "org" and "akka" loggers.
  Logger.getLogger("org").setLevel(Level.ERROR)
  Logger.getLogger("akka").setLevel(Level.ERROR)

  //session setup
  System.setProperty("hadoop.home.dir", "C:\\winutils")
  val sparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("test")
    .getOrCreate()
  val sc = sparkSession.sparkContext
  val ssc = new StreamingContext(sc, Seconds(10))
  val sqlContext = sparkSession.sqlContext

  /**
   * Creates an array of strings from the raw byte array by splitting on commas.
   *
   * The bytes are decoded explicitly as UTF-8 so the result does not depend on the
   * JVM's platform-default charset.
   */
  def kinesisRecordHandler: Record => Array[String] =
    (record: Record) => new String(record.getData.array(), StandardCharsets.UTF_8).split(",")

  /**
   * Converts a record to a map of key/value pairs and then to a JSON string.
   *
   * The record is cast to `RecordAdapter`; its internal object's new image is turned
   * into a simple map and serialized with Gson.
   */
  def recordHandler = (record: Record) => {
    val gson = new Gson
    val sRecord = record.asInstanceOf[RecordAdapter].getInternalObject
    val map = InternalUtils.toSimpleMapValue(sRecord.getDynamodb.getNewImage)
    gson.toJson(map)
  }

  /** Row shape used to derive the schema for the cab-ride JSON; every field is a string. */
  case class CabPrice(
      cab_type: String,
      product_id: String,
      name: String,
      price: String,
      distance: String,
      surge_multiplier: String,
      time_stamp: String,
      source: String,
      destination: String,
      id: String)

  val stream_cab = KinesisInputDStream.builder
    .streamingContext(ssc)
    .streamName("cab_rides")
    .regionName("us-east-1")
    .initialPosition(new Latest())
    .checkpointAppName("cab_rides-app")
    .checkpointInterval(Milliseconds(1000))
    .storageLevel(StorageLevel.MEMORY_AND_DISK_2)
    .buildWithMessageHandler(recordHandler)

  // NOTE(review): this stream reuses the checkpoint application name "cab_rides-app" of
  // stream_cab, and no output operation is registered on it in this file. Confirm both
  // are intended.
  val stream_weather = KinesisInputDStream.builder
    .streamingContext(ssc)
    .streamName("weather")
    .regionName("us-east-1")
    .initialPosition(new Latest())
    .checkpointAppName("cab_rides-app")
    .checkpointInterval(Milliseconds(1000))
    .storageLevel(StorageLevel.MEMORY_AND_DISK_2)
    .buildWithMessageHandler(recordHandler)

  //creating dataframe, can be stored as temp view
  val cabSchema = Encoders.product[CabPrice].schema

  stream_cab.foreachRDD(rdd => {
    import sqlContext.implicits._
    //val xx: Dataset[String] = rdd.toDS()
    val df: DataFrame = sqlContext.read.schema(cabSchema).json(rdd.toDS())
    df.show()
  })

  ssc.start()
  ssc.awaitTermination()
}
Example 3
Source File: KinesisInputDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.{Duration, StreamingContext, Time}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo

/**
 * Receiver-based input DStream for Kinesis.
 *
 * Every constructor argument except `_ssc` is forwarded unchanged to the
 * `KinesisReceiver` created by `getReceiver()`. `regionName`, `endpointUrl`,
 * `messageHandler` and `awsCredentialsOption` are additionally passed to the
 * `KinesisBackedBlockRDD` created by `createBlockRDD`.
 *
 * @param _ssc                    streaming context handed to the `ReceiverInputDStream` superclass
 * @param streamName              forwarded to the receiver
 * @param endpointUrl             forwarded to the receiver and to the block RDD
 * @param regionName              forwarded to the receiver and to the block RDD
 * @param initialPositionInStream forwarded to the receiver
 * @param checkpointAppName       forwarded to the receiver
 * @param checkpointInterval      forwarded to the receiver
 * @param storageLevel            forwarded to the receiver
 * @param messageHandler          `Record => T` function forwarded to the receiver and to the block RDD
 * @param awsCredentialsOption    optional credentials forwarded to the receiver and to the block RDD
 * @tparam T element type of the stream, i.e. the result type of `messageHandler`
 */
private[kinesis] class KinesisInputDStream[T: ClassTag](
    _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  /**
   * Builds the RDD for one batch.
   *
   * When every block info carries metadata (vacuously true for an empty batch), a
   * `KinesisBackedBlockRDD` is created from the block ids, the metadata cast to
   * `SequenceNumberRanges`, and the per-block validity flags. Otherwise a warning is
   * logged and the superclass implementation is used.
   *
   * @param time       batch time; only used in the debug log message here
   * @param blockInfos block infos received for this batch
   * @return the RDD backing this batch
   */
  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.forall(_.metadataOption.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      // `.get` cannot fail here: the guard above established that every option is defined.
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges]
      }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
        s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        // The batch duration (in milliseconds) is used as the retry timeout.
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  /** Creates the `KinesisReceiver` for this stream from the constructor arguments. */
  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption)
  }
}
Example 4
Source File: KinesisInputDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.{Duration, StreamingContext, Time}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo

/**
 * Receiver-based input DStream for Kinesis.
 *
 * Every constructor argument except `_ssc` is forwarded unchanged to the
 * `KinesisReceiver` created by `getReceiver()`. `regionName`, `endpointUrl`,
 * `messageHandler` and `awsCredentialsOption` are additionally passed to the
 * `KinesisBackedBlockRDD` created by `createBlockRDD`.
 *
 * @param _ssc                    streaming context handed to the `ReceiverInputDStream` superclass
 * @param streamName              forwarded to the receiver
 * @param endpointUrl             forwarded to the receiver and to the block RDD
 * @param regionName              forwarded to the receiver and to the block RDD
 * @param initialPositionInStream forwarded to the receiver
 * @param checkpointAppName       forwarded to the receiver
 * @param checkpointInterval      forwarded to the receiver
 * @param storageLevel            forwarded to the receiver
 * @param messageHandler          `Record => T` function forwarded to the receiver and to the block RDD
 * @param awsCredentialsOption    optional credentials forwarded to the receiver and to the block RDD
 * @tparam T element type of the stream, i.e. the result type of `messageHandler`
 */
private[kinesis] class KinesisInputDStream[T: ClassTag](
    _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  /**
   * Builds the RDD for one batch.
   *
   * When every block info carries metadata (vacuously true for an empty batch), a
   * `KinesisBackedBlockRDD` is created from the block ids, the metadata cast to
   * `SequenceNumberRanges`, and the per-block validity flags. Otherwise a warning is
   * logged and the superclass implementation is used.
   *
   * @param time       batch time; only used in the debug log message here
   * @param blockInfos block infos received for this batch
   * @return the RDD backing this batch
   */
  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.forall(_.metadataOption.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      // `.get` cannot fail here: the guard above established that every option is defined.
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges]
      }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
        s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        // The batch duration (in milliseconds) is used as the retry timeout.
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  /** Creates the `KinesisReceiver` for this stream from the constructor arguments. */
  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption)
  }
}
Example 5
Source File: KinesisRecordToUnsafeRowConverter.scala From kinesis-sql with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kinesis

import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.unsafe.types.UTF8String

/**
 * Converts Kinesis `Record`s into five-field `UnsafeRow`s.
 *
 * A single `UnsafeRowWriter` is held by the instance and reset on every call.
 */
private[kinesis] class KinesisRecordToUnsafeRowConverter {

  private val rowWriter = new UnsafeRowWriter(5)

  /**
   * Writes one record into the shared row writer and returns the resulting row.
   *
   * Field layout: 0 = record data bytes, 1 = stream name, 2 = partition key,
   * 3 = sequence number, 4 = approximate arrival timestamp converted through
   * `DateTimeUtils.fromJavaTimestamp`.
   *
   * @param record     the Kinesis record to convert
   * @param streamName value written to field 1
   * @return the row obtained from the writer after all five fields are written
   */
  def toUnsafeRow(record: Record, streamName: String): UnsafeRow = {
    rowWriter.reset()

    // Each value is computed immediately before it is written, in field order.
    val payload = record.getData.array()
    rowWriter.write(0, payload)

    val stream = UTF8String.fromString(streamName)
    rowWriter.write(1, stream)

    val partitionKey = UTF8String.fromString(record.getPartitionKey)
    rowWriter.write(2, partitionKey)

    val sequenceNumber = UTF8String.fromString(record.getSequenceNumber)
    rowWriter.write(3, sequenceNumber)

    val arrival = new java.sql.Timestamp(record.getApproximateArrivalTimestamp.getTime)
    rowWriter.write(4, DateTimeUtils.fromJavaTimestamp(arrival))

    rowWriter.getRow
  }
}
Example 6
Source File: KinesisInputDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo
import org.apache.spark.streaming.{Duration, StreamingContext, Time}

/**
 * Receiver-based input DStream for Kinesis.
 *
 * Every constructor argument except `_ssc` is forwarded unchanged to the
 * `KinesisReceiver` created by `getReceiver()`. `regionName`, `endpointUrl`,
 * `messageHandler` and `awsCredentialsOption` are additionally passed to the
 * `KinesisBackedBlockRDD` created by `createBlockRDD`.
 *
 * @param _ssc                    streaming context (marked `@transient`) handed to the
 *                                `ReceiverInputDStream` superclass
 * @param streamName              forwarded to the receiver
 * @param endpointUrl             forwarded to the receiver and to the block RDD
 * @param regionName              forwarded to the receiver and to the block RDD
 * @param initialPositionInStream forwarded to the receiver
 * @param checkpointAppName       forwarded to the receiver
 * @param checkpointInterval      forwarded to the receiver
 * @param storageLevel            forwarded to the receiver
 * @param messageHandler          `Record => T` function forwarded to the receiver and to the block RDD
 * @param awsCredentialsOption    optional credentials forwarded to the receiver and to the block RDD
 * @tparam T element type of the stream, i.e. the result type of `messageHandler`
 */
private[kinesis] class KinesisInputDStream[T: ClassTag](
    @transient _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  /**
   * Builds the RDD for one batch.
   *
   * When every block info carries metadata (vacuously true for an empty batch), a
   * `KinesisBackedBlockRDD` is created from the block ids, the metadata cast to
   * `SequenceNumberRanges`, and the per-block validity flags. Otherwise a warning is
   * logged and the superclass implementation is used.
   *
   * @param time       batch time; only used in the debug log message here
   * @param blockInfos block infos received for this batch
   * @return the RDD backing this batch
   */
  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.forall(_.metadataOption.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      // `.get` cannot fail here: the guard above established that every option is defined.
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges]
      }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
        s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        // The batch duration (in milliseconds) is used as the retry timeout.
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  /** Creates the `KinesisReceiver` for this stream from the constructor arguments. */
  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption)
  }
}
Example 7
Source File: RecordProcessor.scala From cave with MIT License | 5 votes |
package com.cave.metrics.data.kinesis

import java.nio.charset.StandardCharsets
import java.util.{List => JList}

import com.amazonaws.services.kinesis.clientlibrary.interfaces.{IRecordProcessor, IRecordProcessorCheckpointer}
import com.amazonaws.services.kinesis.clientlibrary.types.ShutdownReason
import com.amazonaws.services.kinesis.model.Record
import com.cave.metrics.data._
import org.apache.commons.logging.LogFactory
import play.api.libs.json.Json

import scala.collection.JavaConverters._
import scala.util.{Success, Try}

/**
 * Kinesis record processor that parses each record's payload as a JSON `Metric` and
 * sends fully parsed batches to a `DataSink`.
 *
 * @param config AWS configuration; not referenced inside this class body
 * @param sink   destination that receives the parsed metrics
 */
class RecordProcessor(config: AwsConfig, sink: DataSink)
  extends IRecordProcessor with ExponentialBackOff {

  private[this] var shardId: String = _
  private var nextCheckpointTimeMillis: Long = _

  private[this] val log = LogFactory.getLog(classOf[RecordProcessor])

  // Back off and retry settings for checkpoint
  override val MaxBackOffTimeInMillis = 10000L
  override val ShouldLogErrors: Boolean = true
  private val NumRetries = 10
  private val CheckpointIntervalInMillis = 1000L

  /** Remembers the shard id so it can be included in checkpoint failure logs. */
  override def initialize(shardId: String): Unit = {
    this.shardId = shardId
  }

  /** Checkpoints on shutdown only when the reason is `TERMINATE`. */
  override def shutdown(check: IRecordProcessorCheckpointer, reason: ShutdownReason): Unit = {
    if (reason == ShutdownReason.TERMINATE) {
      checkpoint(check)
    }
  }

  /**
   * Parses a batch of records and forwards it to the sink.
   *
   * The batch is sent only when every record parses successfully; if any record fails,
   * an error is logged and nothing from the batch is sent. After that, a checkpoint is
   * taken whenever the current time is past `nextCheckpointTimeMillis`, and the next
   * checkpoint time is moved forward by `CheckpointIntervalInMillis`.
   *
   * @param records the records delivered for this shard
   * @param check   checkpointer used for the periodic checkpoint
   */
  override def processRecords(records: JList[Record], check: IRecordProcessorCheckpointer): Unit = {
    val metrics = (records.asScala map convert).filter(_.isSuccess)

    if (metrics.size == records.size()) {
      // all metrics successfully converted
      log.info(s"Received $metrics")
      sink.sendMetrics(for (Success(metric) <- metrics) yield metric)
    } else {
      log.error("Failed to parse records into Metric objects.")
    }

    if (System.currentTimeMillis() > nextCheckpointTimeMillis) {
      checkpoint(check)
      nextCheckpointTimeMillis = System.currentTimeMillis() + CheckpointIntervalInMillis
    }
  }

  /**
   * Parses one record's payload as a JSON `Metric`.
   *
   * The bytes are decoded explicitly as UTF-8 so parsing does not depend on the JVM's
   * platform-default charset. Any exception raised while decoding or parsing is
   * captured in the returned `Try`.
   */
  private[this] def convert(record: Record): Try[Metric] =
    Try(Json.parse(new String(record.getData.array(), StandardCharsets.UTF_8)).as[Metric])

  /**
   * Checkpoints through `retryUpTo(NumRetries)`; an `Exception` escaping that call is
   * logged as a warning instead of being rethrown.
   */
  private[this] def checkpoint(check: IRecordProcessorCheckpointer): Unit = {
    Try {
      retryUpTo(NumRetries) {
        check.checkpoint()
      }
    } recover {
      case e: Exception =>
        log.warn(s"Failed to checkpoint shard $shardId: ${e.getMessage}")
    }
  }
}