org.apache.parquet.schema.MessageType Scala Examples

The following examples show how to use org.apache.parquet.schema.MessageType. They are collected from several open-source projects; each example lists its source file, the project it comes from, and that project's license.
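Before the project examples, a minimal self-contained sketch (not taken from any of the projects below) shows the two common ways to obtain a MessageType: building it programmatically with the Types builder, or parsing it from the textual schema format with MessageTypeParser.

import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, INT64}
import org.apache.parquet.schema.Type.Repetition.{OPTIONAL, REQUIRED}
import org.apache.parquet.schema.{MessageType, MessageTypeParser, OriginalType, Types}

object MessageTypeSketch extends App {

  // Build a schema programmatically with the Types builder API.
  val built: MessageType = Types.buildMessage()
    .addField(Types.primitive(INT64, REQUIRED).named("id"))
    .addField(Types.primitive(BINARY, OPTIONAL).as(OriginalType.UTF8).named("name"))
    .named("user")

  // Parse the equivalent schema from its textual representation.
  val parsed: MessageType = MessageTypeParser.parseMessageType(
    """message user {
      |  required int64 id;
      |  optional binary name (UTF8);
      |}""".stripMargin)

  // Both paths should describe the same schema.
  println(built)
  println(parsed)
}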
Example 1
Source File: WriteAndReadGenericApp.scala    From parquet4s   with MIT License
package com.github.mjakubowski84.parquet4s.core

import java.time.{LocalDate, ZoneOffset}
import java.util.TimeZone

import com.github.mjakubowski84.parquet4s.{ParquetReader, ParquetWriter, RowParquetRecord, ValueCodecConfiguration}
import com.google.common.io.Files
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.{BINARY, INT32, INT64}
import org.apache.parquet.schema.Type.Repetition.{OPTIONAL, REQUIRED}
import org.apache.parquet.schema.{MessageType, OriginalType, Types}

object WriteAndReadGenericApp extends App {

  val ID = "id"
  val Name = "name"
  val Birthday = "birthday"
  val Schema = "user_schema"

  val path = Files.createTempDir().getAbsolutePath
  val vcc = ValueCodecConfiguration(TimeZone.getTimeZone(ZoneOffset.UTC))

  val users = List(
    (1L, "Alice", LocalDate.of(2000, 1, 1)),
    (2L, "Bob", LocalDate.of(1980, 2, 28)),
    (3L, "Cecilia", LocalDate.of(1977, 3, 15))
  ).map { case (id, name, birthday) =>
    RowParquetRecord.empty
      .add(ID, id, vcc)
      .add(Name, name, vcc)
      .add(Birthday, birthday, vcc)
  }

  // write
  implicit val schema: MessageType = Types.buildMessage()
    .addField(Types.primitive(INT64, REQUIRED).as(OriginalType.INT_64).named(ID))
    .addField(Types.primitive(BINARY, OPTIONAL).as(OriginalType.UTF8).named(Name))
    .addField(Types.primitive(INT32, OPTIONAL).as(OriginalType.DATE).named(Birthday))
    .named(Schema)

  ParquetWriter.writeAndClose(s"$path/users.parquet", users)

  // read
  val readData = ParquetReader.read[RowParquetRecord](path)
  try {
    readData.foreach { record =>
      val id = record.get[Long](ID, vcc)
      val name = record.get[String](Name, vcc)
      val birthday = record.get[LocalDate](Birthday, vcc)
      println(s"User[$ID=$id,$Name=$name,$Birthday=$birthday]")
    }
  } finally readData.close()

} 
Example 2
Source File: UnorderedParallelParquetSink.scala    From parquet4s   with MIT License
package com.github.mjakubowski84.parquet4s

import java.util.UUID

import akka.Done
import akka.stream.scaladsl.{Flow, Keep, Sink}
import org.apache.hadoop.fs.Path
import org.apache.parquet.schema.MessageType
import org.slf4j.{Logger, LoggerFactory}

import scala.concurrent.Future

private[parquet4s] object UnorderedParallelParquetSink extends IOOps {

  protected val logger: Logger = LoggerFactory.getLogger(this.getClass)

  def apply[T: ParquetRecordEncoder : ParquetSchemaResolver](path: Path,
                                                             parallelism: Int,
                                                             options: ParquetWriter.Options = ParquetWriter.Options()
                                                            ): Sink[T, Future[Done]] = {
    val schema = ParquetSchemaResolver.resolveSchema[T]
    val valueCodecConfiguration = options.toValueCodecConfiguration

    validateWritePath(path, options)

    def encode(data: T): RowParquetRecord = ParquetRecordEncoder.encode[T](data, valueCodecConfiguration)

    Flow[T]
      .zipWithIndex
      .groupBy(parallelism, elemAndIndex => Math.floorMod(elemAndIndex._2, parallelism))
      .map(elemAndIndex => encode(elemAndIndex._1))
      .fold(UnorderedChunk(path, schema, options))(_.write(_))
      .map(_.close())
      .async
      .mergeSubstreamsWithParallelism(parallelism)
      .toMat(Sink.ignore)(Keep.right)
  }

  private trait UnorderedChunk {

    def write(record: RowParquetRecord): UnorderedChunk

    def close(): Unit

  }

  private object UnorderedChunk {

    def apply(basePath: Path,
              schema: MessageType,
              options: ParquetWriter.Options): UnorderedChunk = new PendingUnorderedChunk(basePath, schema, options)

    private[UnorderedChunk] class PendingUnorderedChunk(basePath: Path,
                                        schema: MessageType,
                                        options: ParquetWriter.Options) extends UnorderedChunk {
      override def write(record: RowParquetRecord): UnorderedChunk = {
        val chunkPath = Path.mergePaths(basePath, new Path(s"/part-${UUID.randomUUID()}.parquet"))
        val writer = ParquetWriter.internalWriter(chunkPath, schema, options)
        writer.write(record)
        new StartedUnorderedChunk(chunkPath, writer, acc = 1)
      }

      override def close(): Unit = ()
    }

    private[UnorderedChunk] class StartedUnorderedChunk(chunkPath: Path,
                                        writer: ParquetWriter.InternalWriter,
                                        acc: Long
                                       ) extends UnorderedChunk {
      override def write(record: RowParquetRecord): UnorderedChunk = {
        writer.write(record)
        new StartedUnorderedChunk(chunkPath, writer, acc = acc + 1)
      }

      override def close(): Unit = {
        if (logger.isDebugEnabled) logger.debug(s"$acc records were successfully written to $chunkPath")
        writer.close()
      }
    }
  }

} 
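The sink above distributes records across parallelism substreams by taking each element's index modulo parallelism. The standalone sketch below isolates that partitioning with plain Akka Streams operators (names are illustrative; it assumes Akka 2.6+, where the implicit ActorSystem provides the Materializer):

import akka.actor.ActorSystem
import akka.stream.scaladsl.{Sink, Source}

object GroupByPartitioningSketch extends App {
  implicit val system: ActorSystem = ActorSystem("partitioning-sketch")
  import system.dispatcher

  val parallelism = 3

  // Elements land in one of `parallelism` substreams based on index modulo parallelism,
  // mirroring the groupBy used by UnorderedParallelParquetSink.
  Source(1 to 10)
    .zipWithIndex
    .groupBy(parallelism, pair => Math.floorMod(pair._2, parallelism))
    .fold(Vector.empty[Int])((acc, pair) => acc :+ pair._1)
    .mergeSubstreamsWithParallelism(parallelism)
    .runWith(Sink.foreach(chunk => println(chunk)))
    .onComplete(_ => system.terminate())
}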
Example 3
Source File: SequentialFileSplittingParquetSink.scala    From parquet4s   with MIT License
package com.github.mjakubowski84.parquet4s

import akka.Done
import akka.stream.scaladsl.{Flow, Keep, Sink}
import org.apache.hadoop.fs.Path
import org.apache.parquet.schema.MessageType
import org.slf4j.{Logger, LoggerFactory}

import scala.concurrent.Future

private[parquet4s] object SequentialFileSplittingParquetSink extends IOOps {

  protected val logger: Logger = LoggerFactory.getLogger(this.getClass)

  def apply[T: ParquetRecordEncoder : ParquetSchemaResolver](path: Path,
                                                             maxRecordsPerFile: Long,
                                                             options: ParquetWriter.Options = ParquetWriter.Options()
                                                            ): Sink[T, Future[Done]] = {
    val schema = ParquetSchemaResolver.resolveSchema[T]
    val valueCodecConfiguration = options.toValueCodecConfiguration

    validateWritePath(path, options)

    def encode(data: T): RowParquetRecord = ParquetRecordEncoder.encode[T](data, valueCodecConfiguration)

    Flow[T]
      .zipWithIndex
      .map { case (elem, index) => OrderedChunkElem(encode(elem), index) }
      .fold(OrderedChunk(path, schema, maxRecordsPerFile, options))(_.write(_))
      .map(_.close())
      .toMat(Sink.ignore)(Keep.right)
  }

  private case class OrderedChunkElem(record: RowParquetRecord, index: Long) {
    def isSplit(maxRecordsPerFile: Long): Boolean = index % maxRecordsPerFile == 0
  }

  private trait OrderedChunk {
    def write(elem: OrderedChunkElem): OrderedChunk
    def close(): Unit
  }

  private object OrderedChunk {

    def apply(basePath: Path,
              schema: MessageType,
              maxRecordsPerFile: Long,
              options: ParquetWriter.Options): OrderedChunk = new PendingOrderedChunk(basePath, schema, maxRecordsPerFile, options)


    private[OrderedChunk] class PendingOrderedChunk(basePath: Path,
                                                    schema: MessageType,
                                                    maxRecordsPerFile: Long,
                                                    options: ParquetWriter.Options) extends OrderedChunk {
      override def write(elem: OrderedChunkElem): OrderedChunk = {
        val chunkNumber: Int = Math.floorDiv(elem.index, maxRecordsPerFile).toInt
        val chunkPath = Path.mergePaths(basePath, new Path(chunkFileName(chunkNumber)))
        val writer = ParquetWriter.internalWriter(chunkPath, schema, options)
        writer.write(elem.record)
        new StartedOrderedChunk(basePath, schema, maxRecordsPerFile, options, chunkPath, writer, acc = 1)
      }

      override def close(): Unit = ()

      private def chunkFileName(chunkNumber: Int): String = f"/part-$chunkNumber%05d.parquet"
    }

    private[OrderedChunk] class StartedOrderedChunk(basePath: Path,
                                                    schema: MessageType,
                                                    maxRecordsPerFile: Long,
                                                    options: ParquetWriter.Options,
                                                    chunkPath: Path,
                                                    writer: ParquetWriter.InternalWriter,
                                                    acc: Long) extends OrderedChunk {
      override def write(elem: OrderedChunkElem): OrderedChunk = {
        if (elem.isSplit(maxRecordsPerFile)) {
          this.close()
          new PendingOrderedChunk(basePath, schema, maxRecordsPerFile, options).write(elem)
        } else {
          writer.write(elem.record)
          new StartedOrderedChunk(basePath, schema, maxRecordsPerFile, options, chunkPath, writer, acc = acc + 1)
        }
      }

      override def close(): Unit = {
        if (logger.isDebugEnabled) logger.debug(s"$acc records were successfully written to $chunkPath")
        writer.close()
      }
    }
  }

} 
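The file-splitting logic above reduces to two pieces of integer arithmetic: a record with a given index belongs to chunk Math.floorDiv(index, maxRecordsPerFile), and a new file is started whenever index % maxRecordsPerFile == 0. A tiny standalone sketch of that numbering:

object ChunkNumberingSketch extends App {
  val maxRecordsPerFile = 3L

  // Mirrors OrderedChunk: indices 0..2 map to part-00000, 3..5 to part-00001, and so on;
  // a split (new file) happens exactly when index % maxRecordsPerFile == 0.
  (0L until 8L).foreach { index =>
    val chunkNumber = Math.floorDiv(index, maxRecordsPerFile).toInt
    val isSplit = index % maxRecordsPerFile == 0
    println(f"index=$index%d -> part-$chunkNumber%05d.parquet (split=$isSplit%b)")
  }
}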
Example 4
Source File: ParquetPublisher.scala    From eel-sdk   with Apache License 2.0
package io.eels.component.parquet

import java.util.concurrent.atomic.AtomicBoolean

import com.sksamuel.exts.Logging
import com.sksamuel.exts.OptionImplicits._
import com.sksamuel.exts.io.Using
import io.eels.component.parquet.util.ParquetIterator
import io.eels.datastream.{DataStream, Publisher, Subscriber, Subscription}
import io.eels.schema.StructType
import io.eels.{Predicate, Row}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.format.converter.ParquetMetadataConverter
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.schema.MessageType

class ParquetPublisher(path: Path,
                       predicate: Option[Predicate],
                       projection: Seq[String],
                       caseSensitive: Boolean,
                       dictionaryFiltering: Boolean)
                      (implicit conf: Configuration) extends Publisher[Seq[Row]] with Logging with Using {

  def readSchema: Option[MessageType] = {
    if (projection.isEmpty) None
    else {

      val fileSchema = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER).getFileMetaData.getSchema
      val structType = ParquetSchemaFns.fromParquetMessageType(fileSchema)

      if (caseSensitive) {
        assert(
          structType.fieldNames.toSet.size == structType.fieldNames.map(_.toLowerCase).toSet.size,
          "Cannot use case sensitive = true when this would result in a clash of field names"
        )
      }

      val projectionSchema = StructType(projection.map { field =>
        structType.field(field, caseSensitive).getOrError(s"Requested field $field does not exist in the parquet schema")
      })

      ParquetSchemaFns.toParquetMessageType(projectionSchema).some
    }
  }

  override def subscribe(subscriber: Subscriber[Seq[Row]]): Unit = {
    try {
      using(RowParquetReaderFn(path, predicate, readSchema, dictionaryFiltering)) { reader =>
        val running = new AtomicBoolean(true)
        subscriber.subscribed(Subscription.fromRunning(running))
        ParquetIterator(reader)
          .grouped(DataStream.DefaultBatchSize)
          .takeWhile(_ => running.get)
          .foreach(subscriber.next)
        subscriber.completed()
      }
    } catch {
      case t: Throwable => subscriber.error(t)
    }
  }
} 
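readSchema above prunes the footer schema down to the requested fields using eel's schema helpers. The same idea can be sketched with plain parquet-mr APIs; the helper below is illustrative (its name and signature are not part of eel-sdk):

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.format.converter.ParquetMetadataConverter
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.schema.{MessageType, Types}

import scala.collection.JavaConverters._

object ProjectionSchemaSketch {

  // Hypothetical helper: read the footer schema of a Parquet file and build a
  // projection MessageType containing only the requested columns.
  def projectionSchema(path: Path, columns: Set[String])(implicit conf: Configuration): MessageType = {
    val fileSchema = ParquetFileReader
      .readFooter(conf, path, ParquetMetadataConverter.NO_FILTER)
      .getFileMetaData
      .getSchema
    fileSchema.getFields.asScala
      .filter(field => columns.contains(field.getName))
      .foldLeft(Types.buildMessage())(_.addField(_))
      .named(fileSchema.getName)
  }
}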
Example 5
Source File: RowWriteSupport.scala    From eel-sdk   with Apache License 2.0
package io.eels.component.parquet

import com.sksamuel.exts.Logging
import io.eels.Row
import org.apache.hadoop.conf.Configuration
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext
import org.apache.parquet.io.api.RecordConsumer
import org.apache.parquet.schema.MessageType

import scala.collection.JavaConverters._
import scala.math.BigDecimal.RoundingMode.RoundingMode

// implementation of WriteSupport for Rows, used by the native ParquetWriter
class RowWriteSupport(schema: MessageType,
                      roundingMode: RoundingMode,
                      metadata: Map[String, String]) extends WriteSupport[Row] with Logging {
  logger.trace(s"Created parquet row write support for schema message type $schema")

  private var writer: RowWriter = _

  override def finalizeWrite(): FinalizedWriteContext = new FinalizedWriteContext(metadata.asJava)

  def init(configuration: Configuration): WriteSupport.WriteContext = {
    new WriteSupport.WriteContext(schema, new java.util.HashMap())
  }

  def prepareForWrite(record: RecordConsumer): Unit = {
    writer = new RowWriter(record, roundingMode)
  }

  def write(row: Row): Unit = {
    writer.write(row)
  }
}

class RowWriter(record: RecordConsumer, roundingMode: RoundingMode) {

  def write(row: Row): Unit = {
    record.startMessage()
    val writer = new StructRecordWriter(row.schema, roundingMode, false)
    writer.write(record, row.values)
    record.endMessage()
  }
} 
Example 6
Source File: RowParquetWriterFn.scala    From eel-sdk   with Apache License 2.0
package io.eels.component.parquet

import io.eels.Row
import io.eels.schema.StructType
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.column.ParquetProperties
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.{ParquetFileWriter, ParquetWriter}
import org.apache.parquet.schema.MessageType

import scala.math.BigDecimal.RoundingMode.RoundingMode


object RowParquetWriterFn {

  class RowParquetWriterBuilder(path: Path,
                                schema: MessageType,
                                roundingMode: RoundingMode,
                                metadata: Map[String, String])
    extends ParquetWriter.Builder[Row, RowParquetWriterBuilder](path) {
    override def getWriteSupport(conf: Configuration): WriteSupport[Row] = new RowWriteSupport(schema, roundingMode, metadata)
    override def self(): RowParquetWriterBuilder = this
  }

  def apply(path: Path,
            schema: StructType,
            metadata: Map[String, String],
            dictionary: Boolean,
            roundingMode: RoundingMode,
            fsConfig: Configuration): ParquetWriter[Row] = {
    val config = ParquetWriterConfig()
    val messageType = ParquetSchemaFns.toParquetMessageType(schema)
    new RowParquetWriterBuilder(path, messageType, roundingMode, metadata)
      .withCompressionCodec(config.compressionCodec)
      .withDictionaryEncoding(dictionary)
      .withPageSize(config.pageSize)
      .withRowGroupSize(config.blockSize)
      .withValidation(config.validating)
      .withWriteMode(ParquetFileWriter.Mode.CREATE)
      .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
      .withConf(fsConfig)
      .build()
  }
} 
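RowParquetWriterBuilder extends parquet-mr's generic ParquetWriter.Builder. For comparison, a minimal sketch using ExampleParquetWriter from parquet-hadoop, which follows the same builder pattern but writes the bundled Group model against a MessageType (path and values are illustrative):

import org.apache.hadoop.fs.Path
import org.apache.parquet.example.data.simple.SimpleGroupFactory
import org.apache.parquet.hadoop.example.ExampleParquetWriter
import org.apache.parquet.schema.MessageTypeParser

object GroupWriterSketch extends App {

  val schema = MessageTypeParser.parseMessageType(
    """message user {
      |  required int64 id;
      |  optional binary name (UTF8);
      |}""".stripMargin)

  // Same builder pattern as RowParquetWriterBuilder, but with the bundled Group model.
  val writer = ExampleParquetWriter.builder(new Path("/tmp/users-sketch.parquet"))
    .withType(schema)
    .build()

  val groups = new SimpleGroupFactory(schema)
  try writer.write(groups.newGroup().append("id", 1L).append("name", "Alice"))
  finally writer.close()
}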
Example 7
Source File: DirectParquetWriter.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import scala.collection.JavaConverters._

import org.apache.hadoop.conf
import org.apache.hadoop.fs.Path
import org.apache.parquet.hadoop.ParquetWriter
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
import org.apache.parquet.io.api.RecordConsumer
import org.apache.parquet.schema.{MessageType, MessageTypeParser}

object DirectParquetWriter {

  // A RecordBuilder writes a single record by driving Parquet's low-level RecordConsumer.
  type RecordBuilder = RecordConsumer => Unit

  private class DirectWriteSupport(schema: MessageType, metadata: Map[String, String])
    extends WriteSupport[RecordBuilder] {

    private var recordConsumer: RecordConsumer = _
    // initialization
    override def init(configuration: conf.Configuration): WriteContext = {
      new WriteContext(schema, metadata.asJava)
    }
    // write a record
    override def write(buildRecord: RecordBuilder): Unit = {
      recordConsumer.startMessage()
      buildRecord(recordConsumer)
      recordConsumer.endMessage()
    }
    // prepare for write
    override def prepareForWrite(recordConsumer: RecordConsumer): Unit = {
      this.recordConsumer = recordConsumer
    }
  }
  // write directly to a Parquet file
  def writeDirect
      (path: String, schema: String, metadata: Map[String, String] = Map.empty)
      (f: ParquetWriter[RecordBuilder] => Unit): Unit = {
    //println("==1111==")
    val messageType = MessageTypeParser.parseMessageType(schema)
    val writeSupport = new DirectWriteSupport(messageType, metadata)
    // println("==2222==")
    val parquetWriter = new ParquetWriter[RecordBuilder](new Path(path), writeSupport)
     // println("==3333==")
    try f(parquetWriter) finally parquetWriter.close()
  }
  // write one message (record)
  def message(writer: ParquetWriter[RecordBuilder])(builder: RecordBuilder): Unit = {
    writer.write(builder)
  }
  // write a group
  def group(consumer: RecordConsumer)(f: => Unit): Unit = {
    consumer.startGroup()
    f
    consumer.endGroup()
  }
  // write a field
  def field(consumer: RecordConsumer, name: String, index: Int = 0)(f: => Unit): Unit = {
    consumer.startField(name, index)
    f
    consumer.endField(name, index)
  }
} 
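A short usage sketch for the helpers above, assuming they live in the DirectParquetWriter object as reconstructed here; it writes one record with a single int64 field through the low-level RecordConsumer API:

object DirectParquetWriterUsageSketch extends App {
  import DirectParquetWriter._

  val schema =
    """message user {
      |  required int64 id;
      |}""".stripMargin

  // writeDirect parses the schema, opens a ParquetWriter and hands it to the block;
  // each message(...) call writes one record built against the RecordConsumer.
  writeDirect("/tmp/direct-sketch.parquet", schema) { writer =>
    message(writer) { consumer =>
      field(consumer, "id") {
        consumer.addLong(1L)
      }
    }
  }
}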
Example 8
Source File: ParquetCompatibilityTest.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import scala.collection.JavaConversions._

import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.schema.MessageType

import org.apache.spark.sql.QueryTest


private[sql] abstract class ParquetCompatibilityTest extends QueryTest with ParquetTest {
  protected def readParquetSchema(path: String): MessageType = {
    readParquetSchema(path, { path => !path.getName.startsWith("_") })
  }
  // read the Parquet schema
  protected def readParquetSchema(path: String, pathFilter: Path => Boolean): MessageType = {
    val fsPath = new Path(path)
    val fs = fsPath.getFileSystem(configuration)
    val parquetFiles = fs.listStatus(fsPath, new PathFilter {
      override def accept(path: Path): Boolean = pathFilter(path)
    }).toSeq

    val footers = ParquetFileReader.readAllFootersInParallel(configuration, parquetFiles, true)
    footers.head.getParquetMetadata.getFileMetaData.getSchema
  }

  protected def logParquetSchema(path: String): Unit = {
    logInfo(
      // schema of the Parquet file written by parquet-avro
      s"""Schema of the Parquet file written by parquet-avro:
         |${readParquetSchema(path)}
       """.stripMargin)
  }
}
// companion object for the Parquet compatibility test
object ParquetCompatibilityTest {
  def makeNullable[T <: AnyRef](i: Int)(f: => T): T = {
    if (i % 3 == 0) null.asInstanceOf[T] else f
  }
} 
Example 9
Source File: ParquetCompatibilityTest.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.parquet

import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsJavaMapConverter, seqAsJavaListConverter}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetWriter}
import org.apache.parquet.io.api.RecordConsumer
import org.apache.parquet.schema.{MessageType, MessageTypeParser}

import org.apache.spark.sql.QueryTest


// Direct-write helpers analogous to Example 7; DirectWriteSupport is assumed to be defined as shown there.
object ParquetCompatibilityTest {

  def writeDirect(
      path: String,
      schema: String,
      metadata: Map[String, String],
      recordWriters: (RecordConsumer => Unit)*): Unit = {
    val messageType = MessageTypeParser.parseMessageType(schema)
    val writeSupport = new DirectWriteSupport(messageType, metadata)
    val parquetWriter = new ParquetWriter[RecordConsumer => Unit](new Path(path), writeSupport)
    try recordWriters.foreach(parquetWriter.write) finally parquetWriter.close()
  }
} 
Example 10
Source File: StructReadSupport.scala    From stream-reactor   with Apache License 2.0
package com.landoop.streamreactor.connect.hive.parquet

import java.util

import org.apache.hadoop.conf.Configuration
import org.apache.kafka.connect.data.Struct
import org.apache.parquet.hadoop.api.{InitContext, ReadSupport}
import org.apache.parquet.io.api.RecordMaterializer
import org.apache.parquet.schema.MessageType

class StructReadSupport extends ReadSupport[Struct] {

  override def prepareForRead(configuration: Configuration,
                              metaData: util.Map[String, String],
                              fileSchema: MessageType,
                              context: ReadSupport.ReadContext): RecordMaterializer[Struct] = {
    // the file schema in here comes from the footer of the parquet file
    val schema = ParquetSchemas.toKafka(fileSchema)
    new StructMaterializer(schema)
  }

  override def init(context: InitContext): ReadSupport.ReadContext = {
    new ReadSupport.ReadContext(context.getFileSchema)
  }
} 
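A custom ReadSupport such as the one above is typically wired into parquet-mr's ParquetReader via ParquetReader.builder. The helper below is an illustrative sketch, not part of stream-reactor, and assumes StructReadSupport and its materializer are on the classpath:

import org.apache.hadoop.fs.Path
import org.apache.kafka.connect.data.Struct
import org.apache.parquet.hadoop.ParquetReader

object StructReaderSketch {

  // Hypothetical helper: read every Struct from a Parquet file using StructReadSupport.
  def readAll(path: Path): Vector[Struct] = {
    val reader = ParquetReader.builder(new StructReadSupport, path).build()
    try Iterator.continually(reader.read()).takeWhile(_ != null).toVector
    finally reader.close()
  }
}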
Example 11
Source File: StructWriteSupport.scala    From stream-reactor   with Apache License 2.0
package com.landoop.streamreactor.connect.hive.parquet

import com.landoop.streamreactor.connect.hive._
import org.apache.hadoop.conf.Configuration
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.FinalizedWriteContext
import org.apache.parquet.io.api.{Binary, RecordConsumer}
import org.apache.parquet.schema.MessageType

import scala.collection.JavaConverters._

// derived from Apache Spark's parquet write support, archive and license here:
// https://github.com/apache/spark/blob/21a7bfd5c324e6c82152229f1394f26afeae771c/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetWriteSupport.scala
class StructWriteSupport(schema: Schema) extends WriteSupport[Struct] {

  private val logger = org.slf4j.LoggerFactory.getLogger(getClass.getName)
  private val schemaName = if (schema.name() == null) "schema" else schema.name()
  private val parquetSchema: MessageType = ParquetSchemas.toParquetMessage(schema, schemaName)

  private val metadata = new java.util.HashMap[String, String]()
  metadata.put("written_by", "streamreactor")

  // The Parquet `RecordConsumer` to which all structs are written
  private var consumer: RecordConsumer = _

  type ValueWriter = (Any) => Unit

  override def init(conf: Configuration): WriteSupport.WriteContext = new WriteSupport.WriteContext(parquetSchema, new java.util.HashMap[String, String])
  override def finalizeWrite(): WriteSupport.FinalizedWriteContext = new FinalizedWriteContext(metadata)
  override def prepareForWrite(consumer: RecordConsumer): Unit = this.consumer = consumer

  override def write(struct: Struct): Unit = {
    writeMessage {
      writeStructFields(struct)
    }
  }

  private def writeStructFields(struct: Struct): Unit = {
    for ((field, index) <- struct.schema.fields.asScala.zipWithIndex) {
      val value = struct.get(field)
      if (value != null) {
        val writer = valueWriter(field.schema())
        writeField(field.name, index) {
          writer(value)
        }
      }
    }
  }

  def valueWriter(schema: Schema): ValueWriter = {
    // todo perhaps introduce something like spark's SpecializedGetters
    schema.`type`() match {
      case Schema.Type.BOOLEAN => value => consumer.addBoolean(value.asInstanceOf[Boolean])
      case Schema.Type.INT8 | Schema.Type.INT16 | Schema.Type.INT32 => value => consumer.addInteger(value.toString.toInt)
      case Schema.Type.INT64 => value => consumer.addLong(value.toString.toLong)
      case Schema.Type.STRING => value => consumer.addBinary(Binary.fromReusedByteArray(value.toString.getBytes))
      case Schema.Type.FLOAT32 => value => consumer.addFloat(value.toString.toFloat)
      case Schema.Type.FLOAT64 => value => consumer.addDouble(value.toString.toDouble)
      case Schema.Type.STRUCT => value => {
        logger.debug(s"Writing nested struct")
        val struct = value.asInstanceOf[Struct]
        writeGroup {
          schema.fields.asScala
            .map { field => field -> struct.get(field) }
            .zipWithIndex.foreach { case ((field, v), k) =>
            writeField(field.name, k) {
              valueWriter(field.schema)(v)
            }
          }
        }
      }
      case _ => throw UnsupportedSchemaType(schema.`type`.toString)
    }
  }

  private def writeMessage(f: => Unit): Unit = {
    consumer.startMessage()
    f
    consumer.endMessage()
  }

  private def writeGroup(f: => Unit): Unit = {
    consumer.startGroup()
    // consumer.startMessage()
    f
    //consumer.endMessage()
    consumer.endGroup()
  }

  private def writeField(name: String, k: Int)(f: => Unit): Unit = {
    consumer.startField(name, k)
    f
    consumer.endField(name, k)
  }
} 
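To obtain a complete writer from the write support above, one can reuse the generic ParquetWriter.Builder pattern shown in Example 6. The builder below is a hypothetical sketch, not part of stream-reactor:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.kafka.connect.data.{Schema, Struct}
import org.apache.parquet.hadoop.ParquetWriter
import org.apache.parquet.hadoop.api.WriteSupport

// Hypothetical builder wiring StructWriteSupport into parquet-mr's generic ParquetWriter.
class StructParquetWriterBuilder(path: Path, schema: Schema)
  extends ParquetWriter.Builder[Struct, StructParquetWriterBuilder](path) {
  override def getWriteSupport(conf: Configuration): WriteSupport[Struct] = new StructWriteSupport(schema)
  override def self(): StructParquetWriterBuilder = this
}

// Usage: new StructParquetWriterBuilder(new Path("/tmp/structs.parquet"), schema).build()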