org.apache.avro.generic.GenericDatumReader Scala Examples

The following examples show how to use org.apache.avro.generic.GenericDatumReader. Each example is drawn from an open-source Scala project; the originating project, source file, and license are noted above the code.
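Before looking at the project code, here is a minimal, self-contained sketch of the pattern most of the examples below share: write a GenericRecord with GenericDatumWriter and a binary encoder, then read the bytes back with GenericDatumReader and a binary decoder. The schema and record used here are hypothetical placeholders, not taken from any of the projects listed below.

import java.io.ByteArrayOutputStream

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

object GenericDatumReaderSketch extends App {
  // Hypothetical schema, used only for illustration.
  val schema = new Schema.Parser().parse(
    """{"type":"record","name":"User","fields":[{"name":"name","type":"string"}]}"""
  )

  // Build a record and serialize it to bytes.
  val record = new GenericData.Record(schema)
  record.put("name", "Ada")
  val out = new ByteArrayOutputStream()
  val encoder = EncoderFactory.get().binaryEncoder(out, null)
  new GenericDatumWriter[GenericRecord](schema).write(record, encoder)
  encoder.flush()

  // Deserialize the bytes back into a GenericRecord.
  val decoder = DecoderFactory.get().binaryDecoder(out.toByteArray, null)
  val roundTripped = new GenericDatumReader[GenericRecord](schema).read(null, decoder)
  println(roundTripped.get("name")) // prints "Ada" (as an org.apache.avro.util.Utf8)
}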
Example 1
Source File: AvroTypeSpec.scala    From shapeless-datatype   with Apache License 2.0
package shapeless.datatype.avro

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.net.URI
import java.nio.ByteBuffer

import com.google.protobuf.ByteString
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.joda.time.Instant
import org.scalacheck.Prop.forAll
import org.scalacheck.ScalacheckShapeless._
import org.scalacheck._
import shapeless._
import shapeless.datatype.record._

import scala.reflect.runtime.universe._

object AvroTypeSpec extends Properties("AvroType") {
  import shapeless.datatype.test.Records._
  import shapeless.datatype.test.SerializableUtils._

  implicit def compareByteArrays(x: Array[Byte], y: Array[Byte]) = java.util.Arrays.equals(x, y)
  implicit def compareIntArrays(x: Array[Int], y: Array[Int]) = java.util.Arrays.equals(x, y)

  def roundTrip[A: TypeTag, L <: HList](m: A)(implicit
    gen: LabelledGeneric.Aux[A, L],
    fromL: FromAvroRecord[L],
    toL: ToAvroRecord[L],
    mr: MatchRecord[L]
  ): Boolean = {
    val t = ensureSerializable(AvroType[A])
    val f1: SerializableFunction[A, GenericRecord] =
      new SerializableFunction[A, GenericRecord] {
        override def apply(m: A): GenericRecord = t.toGenericRecord(m)
      }
    val f2: SerializableFunction[GenericRecord, Option[A]] =
      new SerializableFunction[GenericRecord, Option[A]] {
        override def apply(m: GenericRecord): Option[A] = t.fromGenericRecord(m)
      }
    val toFn = ensureSerializable(f1)
    val fromFn = ensureSerializable(f2)
    val copy = fromFn(roundTripRecord(toFn(m)))
    val rm = RecordMatcher[A]
    copy.exists(rm(_, m))
  }

  def roundTripRecord(r: GenericRecord): GenericRecord = {
    val writer = new GenericDatumWriter[GenericRecord](r.getSchema)
    val baos = new ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(baos, null)
    writer.write(r, encoder)
    encoder.flush()
    baos.close()
    val bytes = baos.toByteArray

    val reader = new GenericDatumReader[GenericRecord](r.getSchema)
    val bais = new ByteArrayInputStream(bytes)
    val decoder = DecoderFactory.get().binaryDecoder(bais, null)
    reader.read(null, decoder)
  }

  implicit val byteStringAvroType = AvroType.at[ByteString](Schema.Type.BYTES)(
    v => ByteString.copyFrom(v.asInstanceOf[ByteBuffer]),
    v => ByteBuffer.wrap(v.toByteArray)
  )
  implicit val instantAvroType =
    AvroType.at[Instant](Schema.Type.LONG)(v => new Instant(v.asInstanceOf[Long]), _.getMillis)
  property("required") = forAll { m: Required => roundTrip(m) }
  property("optional") = forAll { m: Optional => roundTrip(m) }
  property("repeated") = forAll { m: Repeated => roundTrip(m) }
  property("mixed") = forAll { m: Mixed => roundTrip(m) }
  property("nested") = forAll { m: Nested => roundTrip(m) }
  property("seqs") = forAll { m: Seqs => roundTrip(m) }

  implicit val uriAvroType =
    AvroType.at[URI](Schema.Type.STRING)(v => URI.create(v.toString), _.toString)
  property("custom") = forAll { m: Custom => roundTrip(m) }
} 
Example 2
Source File: AvroConverter.scala    From kafka-connect-common   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.converters.source

import java.io.File
import java.util.Collections

import com.datamountaineer.streamreactor.connect.converters.MsgKey
import io.confluent.connect.avro.AvroData
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import org.apache.avro.{Schema => AvroSchema}
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.kafka.connect.source.SourceRecord
import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException


class AvroConverter extends Converter {
  private val avroData = new AvroData(8)
  private var sourceToSchemaMap: Map[String, AvroSchema] = Map.empty
  private var avroReadersMap: Map[String, GenericDatumReader[GenericRecord]] = Map.empty

  override def convert(kafkaTopic: String,
                       sourceTopic: String,
                       messageId: String,
                       bytes: Array[Byte],
                       keys: Seq[String] = Seq.empty,
                       keyDelimiter: String = "."): SourceRecord = {
    Option(bytes) match {
      case None =>
        new SourceRecord(Collections.singletonMap(Converter.TopicKey, sourceTopic),
          null,
          kafkaTopic,
          avroData.toConnectSchema(sourceToSchemaMap(sourceTopic)),
          null)
      case Some(_) =>
        val reader = avroReadersMap.getOrElse(sourceTopic.toLowerCase, throw new ConfigException(s"Invalid ${AvroConverter.SCHEMA_CONFIG} is not configured for $sourceTopic"))
        val decoder = DecoderFactory.get().binaryDecoder(bytes, null)
        val record = reader.read(null, decoder)
        val schemaAndValue = avroData.toConnectData(sourceToSchemaMap(sourceTopic.toLowerCase), record)
        val value = schemaAndValue.value()
        value match {
          case s: Struct if keys.nonEmpty =>
            val keysValue = keys.flatMap { key =>
              Option(KeyExtractor.extract(s, key.split('.').toVector)).map(_.toString)
            }.mkString(keyDelimiter)
            new SourceRecord(
              Collections.singletonMap(Converter.TopicKey, sourceTopic),
              null,
              kafkaTopic,
              Schema.STRING_SCHEMA,
              keysValue,
              schemaAndValue.schema(),
              schemaAndValue.value())
          case _ =>
            new SourceRecord(
              Collections.singletonMap(Converter.TopicKey, sourceTopic),
              null,
              kafkaTopic,
              MsgKey.schema,
              MsgKey.getStruct(sourceTopic, messageId),
              schemaAndValue.schema(),
              schemaAndValue.value())
        }

    }
  }

  override def initialize(config: Map[String, String]): Unit = {
    sourceToSchemaMap = AvroConverter.getSchemas(config)
    avroReadersMap = sourceToSchemaMap.map { case (key, schema) =>
      key -> new GenericDatumReader[GenericRecord](schema)
    }
  }
}

object AvroConverter {
  val SCHEMA_CONFIG = "connect.source.converter.avro.schemas"

  def getSchemas(config: Map[String, String]): Map[String, AvroSchema] = {
    config.getOrElse(SCHEMA_CONFIG, throw new ConfigException(s"$SCHEMA_CONFIG is not provided"))
      .toString
      .split(';')
      .filter(_.trim.nonEmpty)
      .map(_.split("="))
      .map {
        case Array(source, path) =>
          val file = new File(path)
          if (!file.exists()) {
            throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The file $path doesn't exist!")
          }
          val s = source.trim.toLowerCase()
          if (s.isEmpty) {
            throw new ConfigException(s"Invalid $SCHEMA_CONFIG. The topic is not valid for entry containing $path")
          }
          s -> new AvroSchema.Parser().parse(file)
        case other => throw new ConfigException(s"$SCHEMA_CONFIG is not properly set. The format is Mqtt_Source->AVRO_FILE")
      }.toMap
  }
} 
Example 3
Source File: AvroSerializer.scala    From kafka-connect-common   with Apache License 2.0
package com.datamountaineer.streamreactor.connect.serialization

import java.io.{ByteArrayOutputStream, InputStream, OutputStream}

import com.sksamuel.avro4s.{RecordFormat, SchemaFor}
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}

object AvroSerializer {
  def write[T <: Product](t: T)(implicit os: OutputStream, formatter: RecordFormat[T], schemaFor: SchemaFor[T]): Unit = write(apply(t), schemaFor())

  def write(record: GenericRecord, schema: Schema)(implicit os: OutputStream) = {
    val writer = new GenericDatumWriter[GenericRecord](schema)
    val encoder = EncoderFactory.get().binaryEncoder(os, null)

    writer.write(record, encoder)
    encoder.flush()
    os.flush()
  }

  def getBytes[T <: Product](t: T)(implicit recordFormat: RecordFormat[T], schemaFor: SchemaFor[T]): Array[Byte] = getBytes(recordFormat.to(t), schemaFor())

  def getBytes(record: GenericRecord, schema: Schema): Array[Byte] = {
    implicit val output = new ByteArrayOutputStream()
    write(record, schema)
    output.toByteArray
  }

  def read(is: InputStream, schema: Schema): GenericRecord = {
    val reader = new GenericDatumReader[GenericRecord](schema)
    val decoder = DecoderFactory.get().binaryDecoder(is, null)
    reader.read(null, decoder)
  }

  def read[T <: Product](is: InputStream)(implicit schemaFor: SchemaFor[T], recordFormat: RecordFormat[T]): T = recordFormat.from(read(is, schemaFor()))

  def apply[T <: Product](t: T)(implicit formatter: RecordFormat[T]): GenericRecord = formatter.to(t)
} 
Example 4
Source File: ProtobufUtilTest.scala    From scio   with Apache License 2.0
package com.spotify.scio.util

import java.io.File
import java.nio.channels.Channels
import java.nio.file.Files

import com.spotify.scio.ScioContext
import com.spotify.scio.avro._
import com.spotify.scio.coders.Coder
import com.spotify.scio.proto.Track.TrackPB
import org.apache.avro.file.DataFileStream
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.beam.sdk.io.{FileSystems, LocalResources}
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

import scala.jdk.CollectionConverters._

class ProtobufUtilTest extends AnyFlatSpec with Matchers {

  "ProtobufUtil" should "convert Message -> GenericRecords that can be written and read" in {
    val sc = ScioContext()

    val dir = Files.createTempDirectory("protobuf-util-")
    val (path1, path2) = (new File(s"$dir/1"), new File(s"$dir/2"))
    path1.deleteOnExit()
    path2.deleteOnExit()
    dir.toFile.deleteOnExit()

    implicit val grCoder: Coder[GenericRecord] = ProtobufUtil.AvroMessageCoder

    val messages = sc
      .parallelize(1 to 10)
      .map(i => TrackPB.newBuilder().setTrackId(i.toString).build())

    messages
      .map(ProtobufUtil.toAvro[TrackPB])
      .saveAsAvroFile(
        path1.getPath,
        suffix = ".protobuf",
        metadata = ProtobufUtil.schemaMetadataOf[TrackPB],
        schema = ProtobufUtil.AvroMessageSchema,
        numShards = 1
      )

    val protoWriteTap = messages.saveAsProtobufFile(path2.getPath, numShards = 1)

    val result = sc.run().waitUntilDone()

    val (tapFromAvroWrite, tapFromProtoWrite) = (
      ObjectFileTap[TrackPB](ScioUtil.addPartSuffix(path1.getPath)),
      protoWriteTap.get(result)
    )

    tapFromAvroWrite.value.toList should contain theSameElementsAs tapFromProtoWrite.value.toList
    getMetadata(path1) should contain theSameElementsAs getMetadata(path2)
  }

  private def getMetadata(dir: File): Map[String, AnyRef] = {
    val files = dir.listFiles()
    if (files.length != 1) {
      fail(s"Directory $dir should contain 1 Avro file. Instead, found ${files.toList}")
    }

    val dfs = new DataFileStream[GenericRecord](
      Channels.newInputStream(FileSystems.open(LocalResources.fromFile(files(0), false))),
      new GenericDatumReader[GenericRecord]
    )

    dfs.getMetaKeys.asScala.map(k => (k, dfs.getMetaString(k))).toMap
  }
} 
Example 5
Source File: Sedes.scala    From shc   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.hbase

import java.io.ByteArrayInputStream

import org.apache.avro.Schema
import org.apache.avro.Schema.Type._
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io._
import org.apache.commons.io.output.ByteArrayOutputStream
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.sql.types._

trait Sedes {
  def serialize(value: Any): Array[Byte]
  def deserialize(bytes: Array[Byte], start: Int, end: Int): Any
}

class DoubleSedes extends Sedes {
  override def serialize(value: Any): Array[Byte] = Bytes.toBytes(value.asInstanceOf[Double])
  override def deserialize(bytes: Array[Byte], start: Int, end: Int): Any = {
    Bytes.toLong(bytes, start)
  }
} 
Example 6
Source File: SparkAvroDecoder.scala    From cloudflow   with Apache License 2.0
package cloudflow.spark.avro

import org.apache.log4j.Logger

import java.io.ByteArrayOutputStream

import scala.reflect.runtime.universe._

import org.apache.avro.generic.{ GenericDatumReader, GenericDatumWriter, GenericRecord }
import org.apache.avro.io.{ DecoderFactory, EncoderFactory }
import org.apache.spark.sql.{ Dataset, Encoder, Row }
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder }
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.StructType
import org.apache.avro.Schema

import cloudflow.spark.sql.SQLImplicits._

case class EncodedKV(key: String, value: Array[Byte])

case class SparkAvroDecoder[T: Encoder: TypeTag](avroSchema: String) {

  val encoder: Encoder[T]                           = implicitly[Encoder[T]]
  val sqlSchema: StructType                         = encoder.schema
  val encoderForDataColumns: ExpressionEncoder[Row] = RowEncoder(sqlSchema)
  @transient lazy val _avroSchema                   = new Schema.Parser().parse(avroSchema)
  @transient lazy val rowConverter                  = SchemaConverters.createConverterToSQL(_avroSchema, sqlSchema)
  @transient lazy val datumReader                   = new GenericDatumReader[GenericRecord](_avroSchema)
  @transient lazy val decoder                       = DecoderFactory.get
  def decode(bytes: Array[Byte]): Row = {
    val binaryDecoder = decoder.binaryDecoder(bytes, null)
    val record        = datumReader.read(null, binaryDecoder)
    rowConverter(record).asInstanceOf[GenericRow]
  }

}


case class SparkAvroEncoder[T: Encoder: TypeTag](avroSchema: String) {

  @transient lazy val log = Logger.getLogger(getClass.getName)

  val BufferSize = 5 * 1024 // 5 Kb

  val encoder                     = implicitly[Encoder[T]]
  val sqlSchema                   = encoder.schema
  @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema)

  val recordName                = "topLevelRecord" // ???
  val recordNamespace           = "recordNamespace" // ???
  @transient lazy val converter = AvroConverter.createConverterToAvro(sqlSchema, recordName, recordNamespace)

  // Risk: This process is memory intensive. Might require thread-level buffers to optimize memory usage
  def rowToBytes(row: Row): Array[Byte] = {
    val genRecord = converter(row).asInstanceOf[GenericRecord]
    if (log.isDebugEnabled) log.debug(s"genRecord = $genRecord")
    val datumWriter   = new GenericDatumWriter[GenericRecord](_avroSchema)
    val avroEncoder   = EncoderFactory.get
    val byteArrOS     = new ByteArrayOutputStream(BufferSize)
    val binaryEncoder = avroEncoder.binaryEncoder(byteArrOS, null)
    datumWriter.write(genRecord, binaryEncoder)
    binaryEncoder.flush()
    byteArrOS.toByteArray
  }

  def encode(dataset: Dataset[T]): Dataset[Array[Byte]] =
    dataset.toDF().mapPartitions(rows ⇒ rows.map(rowToBytes)).as[Array[Byte]]

  // Note to self: I'm not sure how heavy this chain of transformations is
  def encodeWithKey(dataset: Dataset[T], keyFun: T ⇒ String): Dataset[EncodedKV] = {
    val encoder             = encoderFor[T]
    implicit val rowEncoder = RowEncoder(encoder.schema).resolveAndBind()
    dataset.map { value ⇒
      val key         = keyFun(value)
      val internalRow = encoder.toRow(value)
      val row         = rowEncoder.fromRow(internalRow)
      val bytes       = rowToBytes(row)
      EncodedKV(key, bytes)
    }
  }

} 
Example 7
Source File: Decoding.scala    From avro4s   with Apache License 2.0
package benchmarks

import java.io.ByteArrayOutputStream
import java.nio.ByteBuffer
import java.util.Collections

import benchmarks.record._
import com.sksamuel.avro4s._
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.util.ByteBufferInputStream
import org.openjdk.jmh.annotations._
import org.openjdk.jmh.infra.Blackhole

object Decoding extends BenchmarkHelpers {
  @State(Scope.Thread)
  class Setup {
    val avroBytes = {
      import benchmarks.record.generated.AttributeValue._
      import benchmarks.record.generated._
      new RecordWithUnionAndTypeField(new ValidInt(255, t)).toByteBuffer
    }

    val avro4sBytes = encode(RecordWithUnionAndTypeField(AttributeValue.Valid[Int](255, t)))

    val (handrolledDecoder, handrolledReader) = {
      import benchmarks.handrolled_codecs._
      implicit val codec: Codec[AttributeValue[Int]] = AttributeValueCodec[Int]
      implicit val schemaFor: SchemaFor[AttributeValue[Int]] = SchemaFor[AttributeValue[Int]](codec.schema)
      val recordSchemaFor = SchemaFor[RecordWithUnionAndTypeField]
      val decoder = Decoder[RecordWithUnionAndTypeField].withSchema(recordSchemaFor)
      val reader = new GenericDatumReader[GenericRecord](recordSchemaFor.schema)
      (decoder, reader)
    }

    val (avro4sDecoder, avro4sReader) = {
      val decoder = Decoder[RecordWithUnionAndTypeField]
      val reader = new GenericDatumReader[GenericRecord](decoder.schema)
      (decoder, reader)
    }
  }

  def encode[T: Encoder: SchemaFor](value: T): ByteBuffer = {
    val outputStream = new ByteArrayOutputStream(512)
    val encoder = Encoder[T]
    val schema = AvroSchema[T]
    val record = encoder.encode(value).asInstanceOf[GenericRecord]
    val writer = new GenericDatumWriter[GenericRecord](schema)
    val enc = EncoderFactory.get().directBinaryEncoder(outputStream, null)
    writer.write(record, enc)
    ByteBuffer.wrap(outputStream.toByteArray)
  }
}

class Decoding extends CommonParams with BenchmarkHelpers {

  import Decoding._

  def decode[T](bytes: ByteBuffer, decoder: Decoder[T], reader: GenericDatumReader[GenericRecord]): T = {
    val dec =
      DecoderFactory.get().binaryDecoder(new ByteBufferInputStream(Collections.singletonList(bytes.duplicate)), null)
    val record = reader.read(null, dec)
    decoder.decode(record)
  }


  @Benchmark
  def avroSpecificRecord(setup: Setup, blackhole: Blackhole) = {
    import benchmarks.record.generated._
    blackhole.consume(RecordWithUnionAndTypeField.fromByteBuffer(setup.avroBytes.duplicate))
  }

  @Benchmark
  def avro4sHandrolled(setup: Setup, blackhole: Blackhole) =
    blackhole.consume(decode(setup.avro4sBytes, setup.handrolledDecoder, setup.handrolledReader))

  @Benchmark
  def avro4sGenerated(setup: Setup, blackhole: Blackhole) =
    blackhole.consume(decode(setup.avro4sBytes, setup.avro4sDecoder, setup.avro4sReader))
} 
Example 8
Source File: OutputStreamTest.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.streams.output

import java.io.ByteArrayOutputStream

import com.sksamuel.avro4s._
import org.apache.avro.file.{DataFileReader, SeekableByteArrayInput}
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

trait OutputStreamTest extends AnyFunSuite with Matchers {

  def readData[T: SchemaFor](out: ByteArrayOutputStream): GenericRecord = readData(out.toByteArray)
  def readData[T: SchemaFor](bytes: Array[Byte]): GenericRecord = {
    val datumReader = new GenericDatumReader[GenericRecord](AvroSchema[T])
    val dataFileReader = new DataFileReader[GenericRecord](new SeekableByteArrayInput(bytes), datumReader)
    dataFileReader.next
  }

  def writeData[T: Encoder : SchemaFor](t: T): ByteArrayOutputStream = {
    val out = new ByteArrayOutputStream
    val avro = AvroOutputStream.data[T].to(out).build()
    avro.write(t)
    avro.close()
    out
  }

  def readBinary[T: SchemaFor](out: ByteArrayOutputStream): GenericRecord = readBinary(out.toByteArray)
  def readBinary[T: SchemaFor](bytes: Array[Byte]): GenericRecord = {
    val datumReader = new GenericDatumReader[GenericRecord](AvroSchema[T])
    val decoder = DecoderFactory.get().binaryDecoder(new SeekableByteArrayInput(bytes), null)
    datumReader.read(null, decoder)
  }

  def writeBinary[T: Encoder : SchemaFor](t: T): ByteArrayOutputStream = {
    val out = new ByteArrayOutputStream
    val avro = AvroOutputStream.binary[T].to(out).build()
    avro.write(t)
    avro.close()
    out
  }

  def readJson[T: SchemaFor](out: ByteArrayOutputStream): GenericRecord = readJson(out.toByteArray)
  def readJson[T: SchemaFor](bytes: Array[Byte]): GenericRecord = {
    val schema = AvroSchema[T]
    val datumReader = new GenericDatumReader[GenericRecord](schema)
    val decoder = DecoderFactory.get().jsonDecoder(schema, new SeekableByteArrayInput(bytes))
    datumReader.read(null, decoder)
  }

  def writeJson[T: Encoder : SchemaFor](t: T): ByteArrayOutputStream = {
    val out = new ByteArrayOutputStream
    val avro = AvroOutputStream.json[T].to(out).build()
    avro.write(t)
    avro.close()
    out
  }

  def writeRead[T: Encoder : SchemaFor](t: T)(fn: GenericRecord => Any): Unit = {
    {
      val out = writeData(t)
      val record = readData(out)
      fn(record)
    }
    {
      val out = writeBinary(t)
      val record = readBinary(out)
      fn(record)
    }
    {
      val out = writeJson(t)
      val record = readJson(out)
      fn(record)
    }
  }
} 
Example 9
Source File: GithubIssue235.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.github

import java.io.ByteArrayOutputStream

import com.sksamuel.avro4s.{Decoder, Encoder, RecordFormat, SchemaFor}
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

case class Label(value: String) extends AnyVal
case class Value[A](label: Label, value: A)

sealed trait OneOrTwo[A]
case class One[A](value: Value[A]) extends OneOrTwo[A]
case class Two[A](first: Value[A], second: Value[A]) extends OneOrTwo[A]
case class OneOrTwoWrapper[A](t: OneOrTwo[A])

object Bug {

  def apply[T <: Product](a: T)(
    implicit schemaFor: SchemaFor[T],
    encoder: Encoder[T],
    decoder: Decoder[T]
  ): Unit = {

    val format = RecordFormat[T]
    val schema = schemaFor.schema
    val datumReader = new GenericDatumReader[GenericRecord](schema)
    val datumWriter = new GenericDatumWriter[GenericRecord](schema)

    val stream = new ByteArrayOutputStream()
    val bEncoder = EncoderFactory.get().binaryEncoder(stream, null)

    datumWriter.write(format.to(a), bEncoder)
    bEncoder.flush()

    val bytes = stream.toByteArray
    val bDecoder = DecoderFactory.get().binaryDecoder(bytes, null)
    val record = datumReader.read(null, bDecoder)
    require(format.from(record) == a)
  }

}

class GithubIssue235 extends AnyFunSuite with Matchers {
  test("Broken typeclass derivation upgrading from 1.9.0 to 2.0.1 #235") {
    val o = OneOrTwoWrapper(One(Value(Label("lbl"), "foo")))
    Bug(o)
  }
} 
Example 10
Source File: GithubIssue191.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s.github

import java.io.ByteArrayOutputStream

import com.sksamuel.avro4s.{AvroOutputStream, AvroSchema}
import org.apache.avro.file.{DataFileReader, SeekableByteArrayInput}
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.util.Utf8
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

final case class SN(value: String) extends AnyVal
final case class SimpleUser(name: String, sn: Option[SN])

class GithubIssue191 extends AnyFunSuite with Matchers {

  test("writing out AnyVal in an option") {
    implicit val schema = AvroSchema[SimpleUser]
    val bytes = new ByteArrayOutputStream
    val out = AvroOutputStream.data[SimpleUser].to(bytes).build()
    out.write(SimpleUser("Tom", Some(SN("123"))))
    out.close()

    val datumReader = new GenericDatumReader[GenericRecord](schema)
    val dataFileReader = new DataFileReader[GenericRecord](new SeekableByteArrayInput(bytes.toByteArray), datumReader)
    val record = new Iterator[GenericRecord] {
      override def hasNext: Boolean = dataFileReader.hasNext
      override def next(): GenericRecord = dataFileReader.next
    }.toList.head
    record.getSchema shouldBe schema
    record.get("name") shouldBe new Utf8("Tom")
    record.get("sn") shouldBe new Utf8("123")
  }
} 
Example 11
Source File: DefaultAwareDatumReader.scala    From avro4s   with Apache License 2.0
package com.sksamuel.avro4s

import org.apache.avro.generic.GenericDatumReader
import org.apache.avro.io.ResolvingDecoder
import org.apache.avro.{AvroTypeException, Schema}

class DefaultAwareDatumReader[T](writer: Schema, reader: Schema)
  extends GenericDatumReader[T](writer, reader, new DefaultAwareGenericData) {
  override def readField(r: scala.Any,
                         f: Schema.Field,
                         oldDatum: scala.Any,
                         in: ResolvingDecoder,
                         state: scala.Any): Unit = {
    try {
      super.readField(r, f, oldDatum, in, state)
    } catch {
      case t: AvroTypeException =>
        if (f.defaultVal == null) throw t else getData.setField(r, f.name, f.pos, f.defaultVal)
    }
  }
}

object DefaultAwareDatumReader {
  def apply[T](writerSchema: Schema): DefaultAwareDatumReader[T] = new DefaultAwareDatumReader[T](writerSchema, writerSchema)
} 
Example 12
Source File: AvroIO.scala    From ratatool   with Apache License 2.0
package com.spotify.ratatool.io

import java.io.{File, InputStream, OutputStream}
import java.nio.ByteBuffer
import java.nio.channels.SeekableByteChannel

import com.google.common.io.ByteStreams
import org.apache.avro.Schema
import org.apache.avro.file.{DataFileReader, DataFileWriter, SeekableByteArrayInput, SeekableInput}
import org.apache.avro.generic.{GenericDatumReader, GenericDatumWriter, GenericRecord}
import org.apache.avro.io.{DatumReader, DatumWriter}
import org.apache.avro.reflect.{ReflectDatumReader, ReflectDatumWriter}
import org.apache.avro.specific.{SpecificDatumReader, SpecificDatumWriter, SpecificRecord}
import org.apache.beam.sdk.io.FileSystems
import org.apache.beam.sdk.io.fs.MatchResult.Metadata

import scala.jdk.CollectionConverters._
import scala.reflect.ClassTag


// The excerpt below comes from `object AvroIO` in ratatool; members referenced but not
// shown here (createDatumWriter, FileStorage, the rest of the object) are defined
// elsewhere in the original file.
object AvroIO {

  def writeToOutputStream[T: ClassTag](data: Iterable[T],
                                       schema: Schema,
                                       os: OutputStream): Unit = {
    val fileWriter = new DataFileWriter(createDatumWriter[T]).create(schema, os)
    data.foreach(fileWriter.append)
    fileWriter.close()
  }

  def getAvroSchemaFromFile(path: String): Schema = {
    require(FileStorage(path).exists, s"File `$path` does not exist!")
    val files = FileStorage(path).listFiles.filter(_.resourceId.getFilename.endsWith(".avro"))
    require(files.nonEmpty, s"File `$path` does not contain avro files")
    val reader = new GenericDatumReader[GenericRecord]()
    val dfr = new DataFileReader[GenericRecord](AvroIO.getAvroSeekableInput(files.head), reader)
    dfr.getSchema
  }

  private def getAvroSeekableInput(meta: Metadata): SeekableInput = new SeekableInput {
    require(meta.isReadSeekEfficient)
    private val in = FileSystems.open(meta.resourceId()).asInstanceOf[SeekableByteChannel]
    override def read(b: Array[Byte], off: Int, len: Int): Int =
      in.read(ByteBuffer.wrap(b, off, len))
    override def tell(): Long = in.position()
    override def length(): Long = in.size()
    override def seek(p: Long): Unit = in.position(p)
    override def close(): Unit = in.close()
  }

} 
Example 13
Source File: AvroDecoder.scala    From cuesheet   with Apache License 2.0
package com.kakao.cuesheet.convert

import java.util.Arrays.copyOfRange

import kafka.serializer.Decoder
import kafka.utils.VerifiableProperties
import org.apache.avro.Schema
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}


sealed trait AvroDecoder[T] extends Decoder[T] {

  def props: VerifiableProperties

  protected val schema = new Schema.Parser().parse(props.getString(Avro.SCHEMA))
  protected val skipBytes = props.getInt(Avro.SKIP_BYTES, 0)

  protected val reader = new GenericDatumReader[GenericRecord](schema)
  protected val decoder = Avro.recordDecoder(reader)

  private def skip(bytes: Array[Byte], size: Int): Array[Byte] = {
    val length = bytes.length
    length - size match {
      case remaining if remaining > 0 => copyOfRange(bytes, size, length)
      case _ => new Array[Byte](0)
    }
  }

  def parse(bytes: Array[Byte]): GenericRecord = {
    val data = if (skipBytes == 0) bytes else skip(bytes, skipBytes)
    decoder(data)
  }
}

class AvroRecordDecoder(val props: VerifiableProperties) extends AvroDecoder[GenericRecord] {
  override def fromBytes(bytes: Array[Byte]): GenericRecord = parse(bytes)
}

class AvroMapDecoder(val props: VerifiableProperties) extends AvroDecoder[Map[String, Any]] {
  override def fromBytes(bytes: Array[Byte]): Map[String, Any] = Avro.toMap(parse(bytes))
}

class AvroJsonDecoder(val props: VerifiableProperties) extends AvroDecoder[String] {
  override def fromBytes(bytes: Array[Byte]): String = Avro.toJson(parse(bytes))
} 
Example 14
Source File: AvroSEBasicTest.scala    From akka-serialization-test   with Apache License 2.0
package com.github.dnvriend.serializer.avro4s

import com.github.dnvriend.TestSpec
import com.github.dnvriend.domain.BookStore.{ ChangedBookV1, ChangedBookV2, ChangedBookV3, ChangedBookV4 }
import com.github.dnvriend.serializer.avro.{ BookSerializerV1, BookSerializerV2, BookSerializerV3 }
import com.sksamuel.avro4s.{ AvroSchema, RecordFormat }
import org.apache.avro.Schema
import org.apache.avro.file.SeekableByteArrayInput
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord }
import org.apache.avro.io.DecoderFactory

// SE stands for Schema Evolution
class AvroSEBasicTest extends TestSpec {

  @Override
  def fromBytes(bytes: Array[Byte], schema: Schema): GenericRecord = {
    val serveReader = new GenericDatumReader[GenericRecord](schema)
    serveReader.read(null, DecoderFactory.get().binaryDecoder(bytes, null))
  }

  val title = "Moby-Dick; or, The Whale"
  val year = 1851
  val editor = "Scala Books"

  "AvroSEBasicTest" should "deserialize old class with renamed field" in {
    // in this case, two different serializers can be used

    val obj = ChangedBookV1(title, year)
    val serializerV1 = new BookSerializerV1
    val bytes: Array[Byte] = serializerV1.toBinary(obj)
    val serializerV2 = new BookSerializerV2

    serializerV2.fromBinary(bytes) should matchPattern {
      case ChangedBookV2(`title`, `year`) ⇒
    }
  }

  it should "deserialize old class without new field" in {

    val obj = ChangedBookV2(title, year)
    val serializerV2 = new BookSerializerV2
    val bytes: Array[Byte] = serializerV2.toBinary(obj)

    val in = new SeekableByteArrayInput(bytes)

    val schema2 = AvroSchema[ChangedBookV2]
    val schema3 = AvroSchema[ChangedBookV3]

    val gdr = new GenericDatumReader[GenericRecord](schema2, schema3)
    val binDecoder = DecoderFactory.get().binaryDecoder(in, null)
    val record: GenericRecord = gdr.read(null, binDecoder)
    val format = RecordFormat[ChangedBookV3]
    val r = format.from(record)

    r should matchPattern {
      case ChangedBookV3(`title`, `year`, "") ⇒
    }

  }

  it should "deserialize old class with dropped field" in {

    val obj = ChangedBookV3(title, year, editor)
    val serializerV3 = new BookSerializerV3
    val bytes: Array[Byte] = serializerV3.toBinary(obj)

    val in = new SeekableByteArrayInput(bytes)

    val schema3 = AvroSchema[ChangedBookV3]
    val schema4 = AvroSchema[ChangedBookV4]

    val gdr = new GenericDatumReader[GenericRecord](schema3, schema4)
    val binDecoder = DecoderFactory.get().binaryDecoder(in, null)
    val record: GenericRecord = gdr.read(null, binDecoder)
    val format = RecordFormat[ChangedBookV4]
    val r = format.from(record)

    r should matchPattern {
      case ChangedBookV4(`title`, `editor`) ⇒
    }

  }

} 
Example 15
Source File: DefaultFrameReader.scala    From mleap   with Apache License 2.0
package ml.combust.mleap.avro

import java.nio.charset.Charset

import org.apache.avro.file.{DataFileReader, SeekableByteArrayInput}
import org.apache.avro.generic.{GenericData, GenericDatumReader}
import SchemaConverter._
import ml.combust.mleap.runtime.serialization.{BuiltinFormats, FrameReader}
import ml.combust.mleap.core.types.StructType
import ml.combust.mleap.runtime.frame.{ArrayRow, DefaultLeapFrame, Row}

import scala.collection.mutable
import scala.util.Try


class DefaultFrameReader extends FrameReader {
  val valueConverter = ValueConverter()

  override def fromBytes(bytes: Array[Byte], charset: Charset = BuiltinFormats.charset): Try[DefaultLeapFrame] = Try {
    val datumReader = new GenericDatumReader[GenericData.Record]()
    val reader = new DataFileReader[GenericData.Record](new SeekableByteArrayInput(bytes), datumReader)
    val avroSchema = reader.getSchema
    val schema = avroSchema: StructType
    val readers = schema.fields.map(_.dataType).map(valueConverter.avroToMleap)

    var record = new GenericData.Record(avroSchema)
    var rows = mutable.Seq[Row]()
    while(Try(reader.hasNext).getOrElse(false)) {
      record = reader.next(record)
      val row = ArrayRow(new Array[Any](schema.fields.length))
      for(i <- schema.fields.indices) { row.set(i, readers(i)(record.get(i))) }
      rows :+= row
    }

    DefaultLeapFrame(schema, rows)
  }
} 
Example 16
Source File: DefaultRowReader.scala    From mleap   with Apache License 2.0
package ml.combust.mleap.avro

import java.nio.charset.Charset

import org.apache.avro.Schema
import org.apache.avro.generic.{GenericData, GenericDatumReader}
import org.apache.avro.io.{BinaryDecoder, DecoderFactory}
import SchemaConverter._
import ml.combust.mleap.runtime.serialization.{BuiltinFormats, RowReader}
import ml.combust.mleap.core.types.StructType
import ml.combust.mleap.runtime.frame.{ArrayRow, Row}

import scala.util.Try


class DefaultRowReader(override val schema: StructType) extends RowReader {
  val valueConverter = ValueConverter()
  lazy val readers = schema.fields.map(_.dataType).map(valueConverter.avroToMleap)
  val avroSchema = schema: Schema
  val datumReader = new GenericDatumReader[GenericData.Record](avroSchema)
  var decoder: BinaryDecoder = null
  var record = new GenericData.Record(avroSchema)

  override def fromBytes(bytes: Array[Byte], charset: Charset = BuiltinFormats.charset): Try[Row] = Try {
    decoder = DecoderFactory.get().binaryDecoder(bytes, decoder)
    record = datumReader.read(record, decoder)
    val row = ArrayRow(new Array[Any](schema.fields.length))
    for(i <- schema.fields.indices) { row.set(i, readers(i)(record.get(i))) }
    row
  }
} 
Example 17
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file);
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close();
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
Example 18
Source File: SpecificDefautValuesSpec.scala    From sbt-avrohugger   with Apache License 2.0
import test._
import org.specs2.mutable.Specification
import java.io.File
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.file.DataFileReader

import DefaultEnum._

class SpecificDefaultValuesSpec extends Specification {

  "A case class with default values" should {
    "deserialize correctly" in {
      val record = DefaultTest()
      val records = List(record)
      
      val fileName = s"${records.head.getClass.getName}"
      val fileEnding = "avro"
      val file = File.createTempFile(fileName, fileEnding)
      file.deleteOnExit()
      SpecificTestUtil.write(file, records)
      
      val dummyRecord = new GenericDatumReader[GenericRecord]
      val schema = new DataFileReader(file, dummyRecord).getSchema
      val userDatumReader = new SpecificDatumReader[DefaultTest](schema)
      val dataFileReader = new DataFileReader[DefaultTest](file, userDatumReader)
      val sameRecord = dataFileReader.next

      sameRecord.suit === SPADES
      sameRecord.number === 0
      sameRecord.str === "str"
      sameRecord.optionString === None
      sameRecord.optionStringValue === Some("default")
      sameRecord.embedded === Embedded(1)
      sameRecord.defaultArray === Vector(1,3,4,5)
      sameRecord.optionalEnum === None
      sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas")
      sameRecord.byt === "\u00FF".getBytes
    }
  }

} 
Example 19
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file);
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close();
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord.equals(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
Example 20
Source File: SpecificDefautValuesSpec.scala    From sbt-avrohugger   with Apache License 2.0
import test._
import org.specs2.mutable.Specification
import java.io.File
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.file.DataFileReader

import DefaultEnum._

class SpecificDefaultValuesSpec extends Specification {

  "A case class with default values" should {
    "deserialize correctly" in {
      val record = DefaultTest()
      val records = List(record)
      
      val fileName = s"${records.head.getClass.getName}"
      val fileEnding = "avro"
      val file = File.createTempFile(fileName, fileEnding)
      file.deleteOnExit()
      SpecificTestUtil.write(file, records)
      
      val dummyRecord = new GenericDatumReader[GenericRecord]
      val schema = new DataFileReader(file, dummyRecord).getSchema
      val userDatumReader = new SpecificDatumReader[DefaultTest](schema)
      val dataFileReader = new DataFileReader[DefaultTest](file, userDatumReader)
      val sameRecord = dataFileReader.next

      sameRecord.suit === SPADES
      sameRecord.number === 0
      sameRecord.str === "str"
      sameRecord.optionString === None
      sameRecord.optionStringValue === Some("default")
      sameRecord.embedded === Embedded(1)
      sameRecord.defaultArray === List(1,3,4,5)
      sameRecord.optionalEnum === None
      sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas")
      sameRecord.byt === "\u00FF".getBytes
    }
  }

} 
Example 21
Source File: SpecificTestUtil.scala    From sbt-avrohugger   with Apache License 2.0
package test

import java.io.File

import org.apache.avro.io.{DecoderFactory, EncoderFactory}
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.Schema
import org.apache.avro.file.{ DataFileReader, DataFileWriter }

import org.specs2.mutable.Specification

object SpecificTestUtil extends Specification {

  def write[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val userDatumWriter = new SpecificDatumWriter[T]()
    val dataFileWriter = new DataFileWriter[T](userDatumWriter)
    dataFileWriter.create(records.head.getSchema, file)
    records.foreach(record => dataFileWriter.append(record))
    dataFileWriter.close()
  }

  def read[T <: SpecificRecordBase](file: File, records: List[T]) = {
    val dummyRecord = new GenericDatumReader[GenericRecord]
    val schema = new DataFileReader(file, dummyRecord).getSchema
    val userDatumReader = new SpecificDatumReader[T](schema)
    val dataFileReader = new DataFileReader[T](file, userDatumReader)
    // Adapted from: https://github.com/tackley/avrohugger-list-issue/blob/master/src/main/scala/net/tackley/Reader.scala
    // This isn't great scala, but represents how org.apache.avro.mapred.AvroInputFormat
    // (via org.apache.avro.file.DataFileStream) interacts with the SpecificDatumReader.
    var record: T = null.asInstanceOf[T]
    var sameRecord: T = null.asInstanceOf[T]
    val recordIter = records.iterator
    while (dataFileReader.hasNext) {
      sameRecord = dataFileReader.next(sameRecord)
      record = recordIter.next
    }
    dataFileReader.close()
    sameRecord must ===(record)
  }

  def verifyWriteAndRead[T <: SpecificRecordBase](records: List[T]) = {
    val fileName = s"${records.head.getClass.getName}"
    val fileEnding = "avro"
    val file = File.createTempFile(fileName, fileEnding)
    file.deleteOnExit()
    write(file, records)
    read(file, records)
  }

  def verifyEncodeDecode[T <: SpecificRecordBase](record: T) = {
    val schema = record.getSchema
    val writer = new SpecificDatumWriter[T](schema)
    val out = new java.io.ByteArrayOutputStream()
    val encoder = EncoderFactory.get().binaryEncoder(out, null)
    writer.write(record, encoder)
    encoder.flush
    val ba = out.toByteArray
    ba.size must ===(1)
    ba(0) must ===(0)
    out.close
    val reader = new SpecificDatumReader[T](schema)
    val decoder = DecoderFactory.get().binaryDecoder(ba, null)
    val decoded = reader.read(record, decoder)
    decoded must ===(record)
  }

} 
Example 22
Source File: SpecificDefautValuesSpec.scala    From sbt-avrohugger   with Apache License 2.0
import test._
import org.specs2.mutable.Specification
import java.io.File
import org.apache.avro.generic.{ GenericDatumReader, GenericRecord}
import org.apache.avro.specific.{
  SpecificDatumReader,
  SpecificDatumWriter,
  SpecificRecordBase
}
import org.apache.avro.file.DataFileReader

class SpecificDefaultValuesSpec extends Specification {

  "A case class with default values" should {
    "deserialize correctly" in {
      val record = DefaultTest()
      val records = List(record)
      
      val fileName = s"${records.head.getClass.getName}"
      val fileEnding = "avro"
      val file = File.createTempFile(fileName, fileEnding)
      file.deleteOnExit()
      SpecificTestUtil.write(file, records)
      
      val dummyRecord = new GenericDatumReader[GenericRecord]
      val schema = new DataFileReader(file, dummyRecord).getSchema
      val userDatumReader = new SpecificDatumReader[DefaultTest](schema)
      val dataFileReader = new DataFileReader[DefaultTest](file, userDatumReader)
      val sameRecord = dataFileReader.next

      sameRecord.suit === "SPADES"
      sameRecord.number === 0
      sameRecord.str === "str"
      sameRecord.optionString === None
      sameRecord.optionStringValue === Some("default")
      sameRecord.embedded === Embedded(1)
      sameRecord.defaultArray === Array(1,3,4,5)
      sameRecord.optionalEnum === None
      sameRecord.defaultMap === Map("Hello" -> "world", "Merry" -> "Christmas")
      sameRecord.byt === "\u00FF".getBytes
    }
  }

} 
Example 23
Source File: StringToGenericRecord.scala    From hydra   with Apache License 2.0
package hydra.avro.convert

import java.util.UUID

import org.apache.avro.{LogicalTypes, Schema}
import org.apache.avro.generic.{GenericDatumReader, GenericRecord}
import org.apache.avro.io.DecoderFactory
import cats.implicits._
import org.apache.avro.util.Utf8

import scala.util.{Failure, Success, Try}

object StringToGenericRecord {

  final case class ValidationExtraFieldsError(fields: Set[String]) extends RuntimeException(
    s"Extra fields ${fields.mkString(",")} found with Strict Validation Strategy"
  )

  final case class InvalidLogicalTypeError(expected: String, received: AnyRef) extends RuntimeException(
    s"Invalid logical type. Expected $expected but received $received"
  )

  implicit class ConvertToGenericRecord(s: String) {

    private def isUuidValid(s: String): Boolean =
      Try(UUID.fromString(s)).isSuccess

    private def checkLogicalTypes(record: GenericRecord): Try[Unit] = {
      import collection.JavaConverters._
      def checkAll(avroField: AnyRef, fieldSchema: Option[Schema]): Try[Unit] = avroField match {
        case g: GenericRecord => g.getSchema.getFields.asScala.toList
          .traverse(f => checkAll(g.get(f.name), f.schema.some)).void
        case u: Utf8 if fieldSchema.exists(f => Option(f.getLogicalType).exists(_.getName == LogicalTypes.uuid.getName)) =>
          if (isUuidValid(u.toString)) Success(()) else Failure(InvalidLogicalTypeError("UUID", u.toString))
        case _ => Success(())
      }
      val fields = record.getSchema.getFields.asScala.toList
      fields.traverse(f => checkAll(record.get(f.name), f.schema.some)).void
    }

    private def getAllPayloadFieldNames: Set[String] = {
      import spray.json._
      def loop(cur: JsValue, extraName: Option[String]): Set[String] = cur match {
        case JsObject(f) => f.flatMap { case (k: String, v: JsValue) =>
          loop(v, k.some) ++ Set(extraName.getOrElse("") + k)
        }.toSet
        case _ => Set.empty
      }
      loop(s.parseJson, None)
    }

    private def getAllSchemaFieldNames(schema: Schema): Set[String] = {
      import Schema.Type._
      import collection.JavaConverters._
      def loop(sch: Schema, extraName: Option[String]): Set[String] = sch.getType match {
        case RECORD => sch.getFields.asScala.toSet.flatMap { f: Schema.Field =>
          loop(f.schema, f.name.some) ++ Set(extraName.getOrElse("") + f.name)
        }
        case _ => Set.empty
      }
      loop(schema, None)
    }

    def toGenericRecord(schema: Schema, useStrictValidation: Boolean): Try[GenericRecord] = Try {
      if (useStrictValidation) {
        val diff = getAllPayloadFieldNames diff getAllSchemaFieldNames(schema)
        if (diff.nonEmpty) throw ValidationExtraFieldsError(diff)
      }
      val decoderFactory = new DecoderFactory
      val decoder = decoderFactory.jsonDecoder(schema, s)
      val reader = new GenericDatumReader[GenericRecord](schema)
      reader.read(null, decoder)
    }.flatTap(checkLogicalTypes)
  }

}