* Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *      http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * See the License for the specific language governing permissions and
 * limitations under the License.

package org.apache.hudi.hadoop.utils;

import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.io.storage.HoodieFileReaderFactory;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericArray;
import org.apache.avro.generic.GenericFixed;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;

public class HoodieRealtimeRecordReaderUtils {

   * Reads the schema from the base file.
  public static Schema readSchema(Configuration conf, Path filePath) {
    try {
      HoodieFileReader storageReader = HoodieFileReaderFactory.getFileReader(conf, filePath);
      return storageReader.getSchema();
    } catch (IOException e) {
      throw new HoodieIOException("Failed to read schema from " + filePath, e);

   * Prints a JSON representation of the ArrayWritable for easier debuggability.
  public static String arrayWritableToString(ArrayWritable writable) {
    if (writable == null) {
      return "null";
    StringBuilder builder = new StringBuilder();
    Writable[] values = writable.get();
    builder.append("\"values_" + Math.random() + "_" + values.length + "\": {");
    int i = 0;
    for (Writable w : values) {
      if (w instanceof ArrayWritable) {
        builder.append(arrayWritableToString((ArrayWritable) w)).append(",");
      } else {
        builder.append("\"value" + i + "\":\"" + w + "\"").append(",");
        if (w == null) {
          builder.append("\"type" + i + "\":\"unknown\"").append(",");
        } else {
          builder.append("\"type" + i + "\":\"" + w.getClass().getSimpleName() + "\"").append(",");
    builder.deleteCharAt(builder.length() - 1);
    return builder.toString();

   * Generate a reader schema off the provided writeSchema, to just project out the provided columns.
  public static Schema generateProjectionSchema(Schema writeSchema, Map<String, Schema.Field> schemaFieldsMap,
                                                List<String> fieldNames) {
     * Avro & Presto field names seems to be case sensitive (support fields differing only in case) whereas
     * Hive/Impala/SparkSQL(default) are case-insensitive. Spark allows this to be configurable using
     * spark.sql.caseSensitive=true
     * For a RT table setup with no delta-files (for a latest file-slice) -> we translate parquet schema to Avro Here
     * the field-name case is dependent on parquet schema. Hive (1.x/2.x/CDH) translate column projections to
     * lower-cases
    List<Schema.Field> projectedFields = new ArrayList<>();
    for (String fn : fieldNames) {
      Schema.Field field = schemaFieldsMap.get(fn.toLowerCase());
      if (field == null) {
        throw new HoodieException("Field " + fn + " not found in log schema. Query cannot proceed! "
            + "Derived Schema Fields: " + new ArrayList<>(schemaFieldsMap.keySet()));
      } else {
        projectedFields.add(new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal()));

    Schema projectedSchema = Schema.createRecord(writeSchema.getName(), writeSchema.getDoc(),
        writeSchema.getNamespace(), writeSchema.isError());
    return projectedSchema;

  public static Map<String, Schema.Field> getNameToFieldMap(Schema schema) {
    return schema.getFields().stream().map(r -> Pair.of(r.name().toLowerCase(), r))
        .collect(Collectors.toMap(Pair::getLeft, Pair::getRight));

   * Convert the projected read from delta record into an array writable.
  public static Writable avroToArrayWritable(Object value, Schema schema) {

    if (value == null) {
      return null;

    switch (schema.getType()) {
      case STRING:
        return new Text(value.toString());
      case BYTES:
        return new BytesWritable(((ByteBuffer)value).array());
      case INT:
        return new IntWritable((Integer) value);
      case LONG:
        return new LongWritable((Long) value);
      case FLOAT:
        return new FloatWritable((Float) value);
      case DOUBLE:
        return new DoubleWritable((Double) value);
      case BOOLEAN:
        return new BooleanWritable((Boolean) value);
      case NULL:
        return null;
      case RECORD:
        GenericRecord record = (GenericRecord) value;
        Writable[] recordValues = new Writable[schema.getFields().size()];
        int recordValueIndex = 0;
        for (Schema.Field field : schema.getFields()) {
          recordValues[recordValueIndex++] = avroToArrayWritable(record.get(field.name()), field.schema());
        return new ArrayWritable(Writable.class, recordValues);
      case ENUM:
        return new Text(value.toString());
      case ARRAY:
        GenericArray arrayValue = (GenericArray) value;
        Writable[] arrayValues = new Writable[arrayValue.size()];
        int arrayValueIndex = 0;
        for (Object obj : arrayValue) {
          arrayValues[arrayValueIndex++] = avroToArrayWritable(obj, schema.getElementType());
        // Hive 1.x will fail here, it requires values2 to be wrapped into another ArrayWritable
        return new ArrayWritable(Writable.class, arrayValues);
      case MAP:
        Map mapValue = (Map) value;
        Writable[] mapValues = new Writable[mapValue.size()];
        int mapValueIndex = 0;
        for (Object entry : mapValue.entrySet()) {
          Map.Entry mapEntry = (Map.Entry) entry;
          Writable[] nestedMapValues = new Writable[2];
          nestedMapValues[0] = new Text(mapEntry.getKey().toString());
          nestedMapValues[1] = avroToArrayWritable(mapEntry.getValue(), schema.getValueType());
          mapValues[mapValueIndex++] = new ArrayWritable(Writable.class, nestedMapValues);
        // Hive 1.x will fail here, it requires values3 to be wrapped into another ArrayWritable
        return new ArrayWritable(Writable.class, mapValues);
      case UNION:
        List<Schema> types = schema.getTypes();
        if (types.size() != 2) {
          throw new IllegalArgumentException("Only support union with 2 fields");
        Schema s1 = types.get(0);
        Schema s2 = types.get(1);
        if (s1.getType() == Schema.Type.NULL) {
          return avroToArrayWritable(value, s2);
        } else if (s2.getType() == Schema.Type.NULL) {
          return avroToArrayWritable(value, s1);
        } else {
          throw new IllegalArgumentException("Only support union with null");
      case FIXED:
        if (schema.getLogicalType() != null && schema.getLogicalType().getName().equals("decimal")) {
          LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) LogicalTypes.fromSchema(schema);
          HiveDecimalWritable writable = new HiveDecimalWritable(((GenericFixed) value).bytes(),
          return HiveDecimalWritable.enforcePrecisionScale(writable,
        return new BytesWritable(((GenericFixed) value).bytes());
        return null;

   * Given a comma separated list of field names and positions at which they appear on Hive, return
   * an ordered list of field names, that can be passed onto storage.
  public static List<String> orderFields(String fieldNameCsv, String fieldOrderCsv, List<String> partitioningFields) {
    // Need to convert the following to Set first since Hive does not handle duplicate field names correctly but
    // handles duplicate fields orders correctly.
    // Fields Orders -> {@link https://github
    // .com/apache/hive/blob/f37c5de6c32b9395d1b34fa3c02ed06d1bfbf6eb/serde/src/java
    // /org/apache/hadoop/hive/serde2/ColumnProjectionUtils.java#L188}
    // Field Names -> {@link https://github.com/apache/hive/blob/f37c5de6c32b9395d1b34fa3c02ed06d1bfbf6eb/serde/src/java
    // /org/apache/hadoop/hive/serde2/ColumnProjectionUtils.java#L229}
    String[] fieldOrdersWithDups = fieldOrderCsv.split(",");
    Set<String> fieldOrdersSet = new LinkedHashSet<>(Arrays.asList(fieldOrdersWithDups));
    String[] fieldOrders = fieldOrdersSet.toArray(new String[0]);
    List<String> fieldNames = Arrays.stream(fieldNameCsv.split(","))
        .filter(fn -> !partitioningFields.contains(fn)).collect(Collectors.toList());
    Set<String> fieldNamesSet = new LinkedHashSet<>(fieldNames);
    // Hive does not provide ids for partitioning fields, so check for lengths excluding that.
    if (fieldNamesSet.size() != fieldOrders.length) {
      throw new HoodieException(String
          .format("Error ordering fields for storage read. #fieldNames: %d, #fieldPositions: %d",
              fieldNames.size(), fieldOrders.length));
    TreeMap<Integer, String> orderedFieldMap = new TreeMap<>();
    String[] fieldNamesArray = fieldNamesSet.toArray(new String[0]);
    for (int ox = 0; ox < fieldOrders.length; ox++) {
      orderedFieldMap.put(Integer.parseInt(fieldOrders[ox]), fieldNamesArray[ox]);
    return new ArrayList<>(orderedFieldMap.values());

   * Hive implementation of ParquetRecordReader results in partition columns not present in the original parquet file to
   * also be part of the projected schema. Hive expects the record reader implementation to return the row in its
   * entirety (with un-projected column having null values). As we use writerSchema for this, make sure writer schema
   * also includes partition columns
   * @param schema Schema to be changed
  public static Schema addPartitionFields(Schema schema, List<String> partitioningFields) {
    final Set<String> firstLevelFieldNames =
    List<String> fieldsToAdd = partitioningFields.stream().map(String::toLowerCase)
        .filter(x -> !firstLevelFieldNames.contains(x)).collect(Collectors.toList());

    return HoodieAvroUtils.appendNullSchemaFields(schema, fieldsToAdd);