org.apache.orc.TypeDescription Java Examples

The following examples show how to use org.apache.orc.TypeDescription. They are drawn from open-source projects; the project and license are noted above each example.
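Before the examples, a minimal self-contained sketch of the TypeDescription API itself may help; the schema string and field names below are illustrative, not taken from any of the projects:

import java.util.List;
import org.apache.orc.TypeDescription;

public class TypeDescriptionPrimer {
  public static void main(String[] args) {
    // Parse an ORC schema from its string form.
    TypeDescription schema =
        TypeDescription.fromString("struct<name:string,age:int,scores:array<double>>");

    // The root is a STRUCT; field names and children are parallel lists.
    List<String> names = schema.getFieldNames();
    List<TypeDescription> children = schema.getChildren();
    for (int i = 0; i < names.size(); i++) {
      System.out.println(names.get(i) + " -> " + children.get(i).getCategory());
    }

    // Every node also carries a pre-order column ID, used by several examples below.
    System.out.println("max column id: " + schema.getMaximumId());
  }
}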
Example #1
Source File: TestORCSchemaUtil.java    From iceberg with Apache License 2.0
@Test
public void testInvalidTypePromotions() {
  Schema originalSchema = new Schema(
      optional(1, "a", Types.LongType.get())
  );

  TypeDescription orcSchema = ORCSchemaUtil.convert(originalSchema);
  Schema evolveSchema = new Schema(
      optional(1, "a", Types.IntegerType.get())
  );

  assertThrows("Should not allow invalid type promotion",
      IllegalArgumentException.class, "Can not promote", () -> {
        ORCSchemaUtil.buildOrcProjection(evolveSchema, orcSchema);
      });
}
 
Example #2
Source File: OrcSchemaWithTypeVisitor.java    From iceberg with Apache License 2.0
public static <T> T visit(Type iType, TypeDescription schema, OrcSchemaWithTypeVisitor<T> visitor) {
  switch (schema.getCategory()) {
    case STRUCT:
      return visitRecord(iType != null ? iType.asStructType() : null, schema, visitor);

    case UNION:
      throw new UnsupportedOperationException("Cannot handle " + schema);

    case LIST:
      Types.ListType list = iType != null ? iType.asListType() : null;
      return visitor.list(
          list, schema,
          visit(list != null ? list.elementType() : null, schema.getChildren().get(0), visitor));

    case MAP:
      Types.MapType map = iType != null ? iType.asMapType() : null;
      return visitor.map(
          map, schema,
          visit(map != null ? map.keyType() : null, schema.getChildren().get(0), visitor),
          visit(map != null ? map.valueType() : null, schema.getChildren().get(1), visitor));

    default:
      return visitor.primitive(iType != null ? iType.asPrimitiveType() : null, schema);
  }
}
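For orientation, a hedged sketch of a concrete visitor follows. It assumes the callback signatures implied by the dispatch above (primitive receives the Iceberg Type.PrimitiveType and the matching TypeDescription) and that unoverridden callbacks keep a default no-op behavior; the counting logic is illustrative:

// Hypothetical visitor: count primitive columns via side effects.
class PrimitiveCounter extends OrcSchemaWithTypeVisitor<Integer> {
  private int count = 0;

  @Override
  public Integer primitive(Type.PrimitiveType iPrimitive, TypeDescription primitive) {
    count += 1;  // only the primitive hook is overridden
    return count;
  }

  int count() {
    return count;
  }
}
// Usage: pass an instance as the third argument to visit(iType, orcSchema, visitor).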
 
Example #3
Source File: OrcRowInputFormat.java    From Flink-CEPplus with Apache License 2.0
@SuppressWarnings("unchecked")
private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
	batchSize = in.readInt();
	org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration();
	configuration.readFields(in);

	if (this.conf == null) {
		this.conf = configuration;
	}
	this.schema = TypeDescription.fromString(in.readUTF());

	this.selectedFields = new int[in.readInt()];
	for (int i = 0; i < selectedFields.length; i++) {
		this.selectedFields[i] = in.readInt();
	}

	this.conjunctPredicates = new ArrayList<>();
	int numPreds = in.readInt();
	for (int i = 0; i < numPreds; i++) {
		conjunctPredicates.add((Predicate) in.readObject());
	}
}
 
Example #4
Source File: OrcColumnarRowSplitReader.java    From flink with Apache License 2.0
public OrcColumnarRowSplitReader(
		OrcShim<BATCH> shim,
		Configuration conf,
		TypeDescription schema,
		int[] selectedFields,
		ColumnBatchGenerator<BATCH> batchGenerator,
		List<Predicate> conjunctPredicates,
		int batchSize,
		Path path,
		long splitStart,
		long splitLength) throws IOException {
	super(
			shim,
			conf,
			schema,
			selectedFields,
			conjunctPredicates,
			batchSize,
			path,
			splitStart,
			splitLength);

	this.columnarBatch = batchGenerator.generate(rowBatchWrapper.getBatch());
	this.row = new ColumnarRowData(columnarBatch);
}
 
Example #5
Source File: ORCSchemaUtil.java    From iceberg with Apache License 2.0
/**
 * Converts an ORC schema to an Iceberg schema. If the original Iceberg column IDs are present
 * in the ORC column attributes, they are preserved; otherwise, column IDs are assigned
 * following ORC's pre-order ID assignment.
 *
 * @return the Iceberg schema
 */
public static Schema convert(TypeDescription orcSchema) {
  List<TypeDescription> children = orcSchema.getChildren();
  List<String> childrenNames = orcSchema.getFieldNames();
  Preconditions.checkState(children.size() == childrenNames.size(),
      "Error in ORC file, children fields and names do not match.");

  List<Types.NestedField> icebergFields = Lists.newArrayListWithExpectedSize(children.size());
  AtomicInteger lastColumnId = new AtomicInteger(getMaxIcebergId(orcSchema));
  for (int i = 0; i < children.size(); i++) {
    icebergFields.add(convertOrcToIceberg(children.get(i), childrenNames.get(i),
        lastColumnId::incrementAndGet));
  }

  return new Schema(icebergFields);
}
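Together with the Schema-to-TypeDescription overload in Example #15 below, this forms a round trip. A minimal sketch, assuming the Types and optional(...) helpers used in the test examples:

// Hypothetical round trip: Iceberg -> ORC -> Iceberg.
Schema iceberg = new Schema(
    optional(1, "id", Types.LongType.get()),
    optional(2, "data", Types.StringType.get())
);
TypeDescription orc = ORCSchemaUtil.convert(iceberg);  // Iceberg -> ORC
Schema roundTripped = ORCSchemaUtil.convert(orc);      // ORC -> Iceberg
// Column IDs survive the trip via the ORC column attributes mentioned above.
System.out.println(roundTripped.asStruct());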
 
Example #6
Source File: TestBuildOrcProjection.java    From iceberg with Apache License 2.0
@Test
public void testProjectionPrimitive() {
  Schema originalSchema = new Schema(
      optional(1, "a", Types.IntegerType.get()),
      optional(2, "b", Types.StringType.get())
  );

  // Original mapping (stored in ORC)
  TypeDescription orcSchema = ORCSchemaUtil.convert(originalSchema);

  // Evolve schema
  Schema evolveSchema = new Schema(
      optional(2, "a", Types.StringType.get()),
      optional(3, "c", Types.DateType.get())  // will produce ORC column c_r3 (new)
  );

  TypeDescription newOrcSchema = ORCSchemaUtil.buildOrcProjection(evolveSchema, orcSchema);
  assertEquals(2, newOrcSchema.getChildren().size());
  assertEquals(1, newOrcSchema.findSubtype("b").getId());
  assertEquals(TypeDescription.Category.STRING, newOrcSchema.findSubtype("b").getCategory());
  assertEquals(2, newOrcSchema.findSubtype("c_r3").getId());
  assertEquals(TypeDescription.Category.DATE, newOrcSchema.findSubtype("c_r3").getCategory());
}
 
Example #7
Source File: TestBuildOrcProjection.java    From iceberg with Apache License 2.0
@Test
public void testProjectionNestedNoOp() {
  Types.StructType nestedStructType = Types.StructType.of(
      optional(2, "b", Types.StringType.get()),
      optional(3, "c", Types.DateType.get())
  );
  Schema originalSchema = new Schema(
      optional(1, "a", nestedStructType)
  );

  // Original mapping (stored in ORC)
  TypeDescription orcSchema = ORCSchemaUtil.convert(originalSchema);

  TypeDescription newOrcSchema = ORCSchemaUtil.buildOrcProjection(originalSchema, orcSchema);
  assertEquals(1, newOrcSchema.getChildren().size());
  assertEquals(TypeDescription.Category.STRUCT, newOrcSchema.findSubtype("a").getCategory());
  TypeDescription nestedCol = newOrcSchema.findSubtype("a");
  assertEquals(2, nestedCol.findSubtype("b").getId());
  assertEquals(TypeDescription.Category.STRING, nestedCol.findSubtype("b").getCategory());
  assertEquals(3, nestedCol.findSubtype("c").getId());
  assertEquals(TypeDescription.Category.DATE, nestedCol.findSubtype("c").getCategory());
}
 
Example #8
Source File: JsonORCFileReaderWriterFactory.java    From secor with Apache License 2.0
public JsonORCFileWriter(LogFilePath logFilePath, CompressionCodec codec)
        throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path(logFilePath.getLogFilePath());
    schema = schemaProvider.getSchema(logFilePath.getTopic(),
            logFilePath);
    if (schema == null) {
        String topic = logFilePath.getTopic();
        throw new IllegalArgumentException(
            String.format("No schema is provided for topic '%s'", topic));
    }
    List<TypeDescription> fieldTypes = schema.getChildren();
    converters = new JsonConverter[fieldTypes.size()];
    for (int c = 0; c < converters.length; ++c) {
        converters[c] = VectorColumnFiller.createConverter(fieldTypes
                .get(c));
    }

    writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf)
            .compress(resolveCompression(codec)).setSchema(schema));
    batch = schema.createRowBatch();
}
 
Example #9
Source File: OrcRowInputFormat.java    From flink with Apache License 2.0
/**
 * Creates an OrcRowInputFormat.
 *
 * @param path The path to read ORC files from.
 * @param orcSchema The schema of the ORC files as ORC TypeDescription.
 * @param orcConfig The configuration to read the ORC files with.
 * @param batchSize The number of Row objects to read in a batch.
 */
public OrcRowInputFormat(String path, TypeDescription orcSchema, Configuration orcConfig, int batchSize) {
	super(new Path(path));

	// configure OrcRowInputFormat
	this.schema = orcSchema;
	this.rowType = (RowTypeInfo) OrcBatchReader.schemaToTypeInfo(schema);
	this.conf = orcConfig;
	this.batchSize = batchSize;

	// set default selection mask, i.e., all fields.
	this.selectedFields = new int[this.schema.getChildren().size()];
	for (int i = 0; i < selectedFields.length; i++) {
		this.selectedFields[i] = i;
	}
}
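A hedged usage sketch for this constructor in a Flink batch job follows; the path and schema string are illustrative, and selectFields is assumed to be the projection setter that overrides the default all-fields mask initialized above:

// Hypothetical usage: read two of three columns as Flink Rows.
Configuration orcConfig = new Configuration();
TypeDescription schema =
    TypeDescription.fromString("struct<id:bigint,name:string,ts:timestamp>");
OrcRowInputFormat format =
    new OrcRowInputFormat("hdfs:///data/orc", schema, orcConfig, 1000);
format.selectFields(0, 1);  // project "id" and "name" only
DataSet<Row> rows = env.createInput(format);  // env: an ExecutionEnvironment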
 
Example #10
Source File: TestBuildOrcProjection.java    From iceberg with Apache License 2.0
@Test
public void testEvolutionAddContainerField() {
  Schema baseSchema = new Schema(
      required(1, "a", Types.IntegerType.get())
  );
  TypeDescription baseOrcSchema = ORCSchemaUtil.convert(baseSchema);

  Schema evolvedSchema = new Schema(
      required(1, "a", Types.IntegerType.get()),
      optional(2, "b", Types.StructType.of(
          required(3, "c", Types.LongType.get())
      ))
  );

  TypeDescription newOrcSchema = ORCSchemaUtil.buildOrcProjection(evolvedSchema, baseOrcSchema);
  assertEquals(2, newOrcSchema.getChildren().size());
  assertEquals(TypeDescription.Category.INT, newOrcSchema.findSubtype("a").getCategory());
  assertEquals(2, newOrcSchema.findSubtype("b_r2").getId());
  assertEquals(TypeDescription.Category.STRUCT, newOrcSchema.findSubtype("b_r2").getCategory());
  TypeDescription nestedCol = newOrcSchema.findSubtype("b_r2");
  assertEquals(3, nestedCol.findSubtype("c_r3").getId());
  assertEquals(TypeDescription.Category.LONG, nestedCol.findSubtype("c_r3").getCategory());
}
 
Example #11
Source File: OrcKeyComparatorTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testSimpleComparator() throws Exception {
  OrcKeyComparator comparator = new OrcKeyComparator();
  Configuration conf = new Configuration();
  String orcSchema = "struct<i:int,j:int>";
  TypeDescription schema = TypeDescription.fromString(orcSchema);
  conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), orcSchema);
  Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), orcSchema);
  comparator.setConf(conf);

  OrcStruct record0 = createSimpleOrcStruct(schema, 1, 2);
  OrcStruct record1 = createSimpleOrcStruct(schema, 3, 4);
  OrcStruct record2 = createSimpleOrcStruct(schema, 3, 4);

  OrcKey orcKey0 = new OrcKey();
  orcKey0.key = record0;
  OrcKey orcKey1 = new OrcKey();
  orcKey1.key = record1;
  OrcKey orcKey2 = new OrcKey();
  orcKey2.key = record2;

  Assert.assertTrue(comparator.compare(orcKey0, orcKey1) < 0);
  Assert.assertTrue(comparator.compare(orcKey1, orcKey2) == 0);
  Assert.assertTrue(comparator.compare(orcKey1, orcKey0) > 0);
}
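The createSimpleOrcStruct helper is not part of the listing; a plausible implementation, assuming org.apache.orc.mapred.OrcStruct and Hadoop's IntWritable, might look like this:

// Hypothetical helper: build a struct<i:int,j:int> record with the given values.
private OrcStruct createSimpleOrcStruct(TypeDescription structSchema, int i, int j) {
  OrcStruct struct = new OrcStruct(structSchema);
  struct.setFieldValue("i", new IntWritable(i));
  struct.setFieldValue("j", new IntWritable(j));
  return struct;
}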
 
Example #12
Source File: ORC.java    From iceberg with Apache License 2.0
public OrcIterator build() {
  Preconditions.checkNotNull(schema, "Schema is required");
  try {
    Path path = new Path(file.location());
    Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
    ColumnIdMap columnIds = new ColumnIdMap();
    TypeDescription orcSchema = TypeConversion.toOrc(schema, columnIds);
    Reader.Options options = reader.options();
    if (start != null) {
      options.range(start, length);
    }
    options.schema(orcSchema);
    return new OrcIterator(path, orcSchema, reader.rows(options));
  } catch (IOException e) {
    throw new RuntimeException("Can't open " + file.location(), e);
  }
}
 
Example #13
Source File: OrcShimV200.java    From flink with Apache License 2.0
/**
 * Computes the ORC projection mask of the fields to include from the selected fields.
 *
 * @return The ORC projection mask.
 */
public static boolean[] computeProjectionMask(TypeDescription schema, int[] selectedFields) {
	// mask with all fields of the schema
	boolean[] projectionMask = new boolean[schema.getMaximumId() + 1];
	// for each selected field
	for (int inIdx : selectedFields) {
		// set all nested fields of a selected field to true
		TypeDescription fieldSchema = schema.getChildren().get(inIdx);
		for (int i = fieldSchema.getId(); i <= fieldSchema.getMaximumId(); i++) {
			projectionMask[i] = true;
		}
	}
	return projectionMask;
}
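To make the ID arithmetic concrete, here is a small illustration of the mask this method produces for a nested schema, where the pre-order IDs are root=0, a=1, b=2, b.c=3:

// Selecting field index 1 ("b") marks its whole subtree.
TypeDescription schema =
    TypeDescription.fromString("struct<a:int,b:struct<c:bigint>>");
boolean[] mask = computeProjectionMask(schema, new int[]{1});
// mask == [false, false, true, true] -> ids 2 ("b") and 3 ("b.c") are included
System.out.println(java.util.Arrays.toString(mask));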
 
Example #14
Source File: SqlInterpreterTest.java    From zeppelin with Apache License 2.0
public File createORCFile(int[] values) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".orc");
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();
  conf.set("orc.compress", "snappy");
  TypeDescription schema = TypeDescription.fromString("struct<msg:int>");
  Writer writer = OrcFile.createWriter(path,
          OrcFile.writerOptions(conf)
                  .setSchema(schema));
  VectorizedRowBatch batch = schema.createRowBatch();
  LongColumnVector x = (LongColumnVector) batch.cols[0];
  for (int i = 0; i < values.length; ++i) {
    int row = batch.size++;
    x.vector[row] = values[i];
    // If the batch is full, write it out and start over.
    if (batch.size == batch.getMaxSize()) {
      writer.addRowBatch(batch);
      batch.reset();
    }
  }
  if (batch.size != 0) {
    writer.addRowBatch(batch);
    batch.reset();
  }
  writer.close();
  return file;
}
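A matching read-side sketch using the core ORC API (assuming org.apache.orc.Reader and RecordReader are imported) might look like this:

// Hypothetical read-back of the file written above.
Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
RecordReader rows = reader.rows();
VectorizedRowBatch readBatch = reader.getSchema().createRowBatch();
while (rows.nextBatch(readBatch)) {
  LongColumnVector msg = (LongColumnVector) readBatch.cols[0];
  for (int r = 0; r < readBatch.size; r++) {
    System.out.println(msg.vector[r]);
  }
}
rows.close();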
 
Example #15
Source File: ORCSchemaUtil.java    From iceberg with Apache License 2.0
public static TypeDescription convert(Schema schema) {
  final TypeDescription root = TypeDescription.createStruct();
  final Types.StructType schemaRoot = schema.asStruct();
  for (Types.NestedField field : schemaRoot.asStructType().fields()) {
    TypeDescription orcColumnType = convert(field.fieldId(), field.type(), field.isRequired());
    root.addField(field.name(), orcColumnType);
  }
  return root;
}
 
Example #16
Source File: OrcKeyComparatorTest.java    From incubator-gobblin with Apache License 2.0
/**
 * Create a {@link OrcList} repeating the given parameter inside the list for multiple times.
 */
private OrcList createOrcList(int element, TypeDescription schema, int num) {
  OrcList result = new OrcList(schema);
  for (int i = 0; i < num; i++) {
    result.add(new IntWritable(element));
  }
  return result;
}
 
Example #17
Source File: VectorColumnFiller.java    From secor with Apache License 2.0
public static JsonConverter createConverter(TypeDescription schema) {
    switch (schema.getCategory()) {
    case BYTE:
    case SHORT:
    case INT:
    case LONG:
        return new LongColumnConverter();
    case FLOAT:
    case DOUBLE:
        return new DoubleColumnConverter();
    case CHAR:
    case VARCHAR:
    case STRING:
        return new StringColumnConverter();
    case DECIMAL:
        return new DecimalColumnConverter();
    case TIMESTAMP:
        return new TimestampColumnConverter();
    case BINARY:
        return new BinaryColumnConverter();
    case BOOLEAN:
        return new BooleanColumnConverter();
    case STRUCT:
        return new StructColumnConverter(schema);
    case LIST:
        return new ListColumnConverter(schema);
    case MAP:
        return new MapColumnConverter(schema);
    case UNION:
        return new UnionColumnConverter(schema);
    default:
        throw new IllegalArgumentException("Unhandled type " + schema);
    }
}
 
Example #18
Source File: OrcShim.java    From flink with Apache License 2.0
/**
 * Create orc {@link RecordReader} from conf, schema and etc...
 */
RecordReader createRecordReader(
		Configuration conf,
		TypeDescription schema,
		int[] selectedFields,
		List<OrcSplitReader.Predicate> conjunctPredicates,
		org.apache.flink.core.fs.Path path,
		long splitStart,
		long splitLength) throws IOException;
 
Example #19
Source File: OrcMetrics.java    From iceberg with Apache License 2.0
private Set<TypeDescription> flatten(TypeDescription rootType) {
  if (rootType == null) {
    return ImmutableSet.of();
  }

  final Set<TypeDescription> flatTypes = Sets.newHashSetWithExpectedSize(rootType.getMaximumId());
  final Queue<TypeDescription> queue = Queues.newLinkedBlockingQueue();
  queue.add(rootType);
  while (!queue.isEmpty()) {
    TypeDescription type = queue.remove();
    flatTypes.add(type);
    queue.addAll(Optional.ofNullable(type.getChildren()).orElse(ImmutableList.of()));
  }
  return flatTypes;
}
 
Example #20
Source File: OrcInputFormat.java    From flink with Apache License 2.0
/**
 * Creates an OrcInputFormat.
 *
 * @param path The path to read ORC files from.
 * @param orcSchema The schema of the ORC files as ORC TypeDescription.
 * @param orcConfig The configuration to read the ORC files with.
 * @param batchSize The number of Row objects to read in a batch.
 */
public OrcInputFormat(Path path, TypeDescription orcSchema, Configuration orcConfig, int batchSize) {
	super(path);

	// configure OrcInputFormat
	this.schema = orcSchema;
	this.conf = orcConfig;
	this.batchSize = batchSize;

	// set default selection mask, i.e., all fields.
	this.selectedFields = new int[this.schema.getChildren().size()];
	for (int i = 0; i < selectedFields.length; i++) {
		this.selectedFields[i] = i;
	}
}
 
Example #21
Source File: TestBuildOrcProjection.java    From iceberg with Apache License 2.0
@Test
public void testProjectionNested() {
  Types.StructType nestedStructType = Types.StructType.of(
      optional(2, "b", Types.StringType.get()),
      optional(3, "c", Types.DateType.get())
  );
  Schema originalSchema = new Schema(
      optional(1, "a", nestedStructType)
  );

  // Original mapping (stored in ORC)
  TypeDescription orcSchema = ORCSchemaUtil.convert(originalSchema);

  // Evolve schema
  Types.StructType newNestedStructType = Types.StructType.of(
      optional(3, "cc", Types.DateType.get()),
      optional(2, "bb", Types.StringType.get())
  );
  Schema evolveSchema = new Schema(
      optional(1, "aa", newNestedStructType)
  );

  TypeDescription newOrcSchema = ORCSchemaUtil.buildOrcProjection(evolveSchema, orcSchema);
  assertEquals(1, newOrcSchema.getChildren().size());
  assertEquals(TypeDescription.Category.STRUCT, newOrcSchema.findSubtype("a").getCategory());
  TypeDescription nestedCol = newOrcSchema.findSubtype("a");
  assertEquals(2, nestedCol.findSubtype("c").getId());
  assertEquals(TypeDescription.Category.DATE, nestedCol.findSubtype("c").getCategory());
  assertEquals(3, nestedCol.findSubtype("b").getId());
  assertEquals(TypeDescription.Category.STRING, nestedCol.findSubtype("b").getCategory());
}
 
Example #22
Source File: OrcConverter.java    From pentaho-hadoop-shims with Apache License 2.0
public RowMetaAndData convertFromOrc( VectorizedRowBatch batch, int currentBatchRow,
                                      List<? extends IOrcInputField> dialogInputFields,
                                      TypeDescription typeDescription,
                                      Map<String, Integer> schemaToOrcSubcripts,
                                      List<? extends IOrcInputField> orcInputFields ) {
  return convertFromOrc( new RowMetaAndData(), batch, currentBatchRow, dialogInputFields, typeDescription,
    schemaToOrcSubcripts, orcInputFields );
}
 
Example #23
Source File: OrcIterable.java    From iceberg with Apache License 2.0
OrcIterable(InputFile file, Configuration config, Schema schema,
            Long start, Long length,
            Function<TypeDescription, OrcRowReader<?>> readerFunction, boolean caseSensitive, Expression filter) {
  this.schema = schema;
  this.readerFunction = readerFunction;
  this.file = file;
  this.start = start;
  this.length = length;
  this.config = config;
  this.caseSensitive = caseSensitive;
  this.filter = (filter == Expressions.alwaysTrue()) ? null : filter;
}
 
Example #24
Source File: SparkOrcWriter.java    From iceberg with Apache License 2.0
private static Converter buildConverter(TypeDescription schema) {
  switch (schema.getCategory()) {
    case BOOLEAN:
      return new BooleanConverter();
    case BYTE:
      return new ByteConverter();
    case SHORT:
      return new ShortConverter();
    case DATE:
    case INT:
      return new IntConverter();
    case LONG:
      return new LongConverter();
    case FLOAT:
      return new FloatConverter();
    case DOUBLE:
      return new DoubleConverter();
    case BINARY:
      return new BytesConverter();
    case STRING:
    case CHAR:
    case VARCHAR:
      return new StringConverter();
    case DECIMAL:
      return schema.getPrecision() <= 18 ?
          new Decimal18Converter(schema) :
          new Decimal38Converter(schema);
    case TIMESTAMP_INSTANT:
      return new TimestampTzConverter();
    case STRUCT:
      return new StructConverter(schema);
    case LIST:
      return new ListConverter(schema);
    case MAP:
      return new MapConverter(schema);
  }
  throw new IllegalArgumentException("Unhandled type " + schema);
}
 
Example #25
Source File: ORCSchemaUtil.java    From iceberg with Apache License 2.0
private static boolean isRequired(TypeDescription orcType) {
  String isRequiredStr = orcType.getAttributeValue(ICEBERG_REQUIRED_ATTRIBUTE);
  if (isRequiredStr != null) {
    return Boolean.parseBoolean(isRequiredStr);
  }
  return false;
}
 
Example #26
Source File: OrcSchemaConverter.java    From pentaho-hadoop-shims with Apache License 2.0
private int determineMetaType( TypeDescription subDescription ) {
  switch ( subDescription.getCategory().getName() ) {
    case "string":
    case "char":
    case "varchar":
      return ValueMetaInterface.TYPE_STRING;
    case "bigint":
    case "tinyint":
    case "smallint":
    case "int":
      return ValueMetaInterface.TYPE_INTEGER;
    case "double":
    case "float":
      return ValueMetaInterface.TYPE_NUMBER;
    case "decimal":
      return ValueMetaInterface.TYPE_BIGNUMBER;
    case "timestamp":
      return ValueMetaInterface.TYPE_TIMESTAMP;
    case "date":
      return ValueMetaInterface.TYPE_DATE;
    case "boolean":
      return ValueMetaInterface.TYPE_BOOLEAN;
    case "binary":
      return ValueMetaInterface.TYPE_BINARY;
  }
  //if none of the cases match, return -1
  return -1;
}
 
Example #27
Source File: OrcUtils.java    From incubator-gobblin with Apache License 2.0
/**
 * Recursively converts {@param oldStruct} into {@param newStruct}, whose schema is {@param targetSchema}.
 * This serves a similar purpose to Avro's GenericDatumReader, which accepts a reader schema and a writer
 * schema so that bytes can be deserialized into the reader's schema in a compatible way.
 * Calling this method has no side effects on {@param oldStruct}; the value of each field in
 * {@param oldStruct} is copied into {@param newStruct} recursively. Avoid unnecessary calls, as this can
 * be expensive when the struct schema is complicated or contains container objects like array/map.
 *
 * Note that if {@param newStruct} contains container types (List/Map), the up-conversion does two things:
 * 1. Clears all elements in the original containers.
 * 2. Populates the container elements of {@param oldStruct} into {@param newStruct}, converted to the
 * element type in {@param newStruct} if compatible.
 *
 * Limitations:
 * 1. Up-conversion of key types in Maps is not supported, because the primary upstream format is Avro,
 * which only allows string keys.
 * 2. A field A is converted to a field B only if
 * org.apache.gobblin.compaction.mapreduce.orc.OrcValueMapper#isEvolutionValid(A,B) returns true.
 */
@VisibleForTesting
public static void upConvertOrcStruct(OrcStruct oldStruct, OrcStruct newStruct, TypeDescription targetSchema) {

  // If the target schema is not equal to newStruct's schema, it is an illegal state and doesn't make sense to proceed.
  Preconditions.checkArgument(newStruct.getSchema().equals(targetSchema));

  int indexInNewSchema = 0;
  List<String> oldSchemaFieldNames = oldStruct.getSchema().getFieldNames();
  /* Construct a fieldName -> Index map to efficient access within the loop below. */
  Map<String, Integer> oldSchemaIndex = IntStream.range(0, oldSchemaFieldNames.size()).boxed()
      .collect(Collectors.toMap(oldSchemaFieldNames::get, Function.identity()));
  List<TypeDescription> oldSchemaTypes = oldStruct.getSchema().getChildren();
  List<TypeDescription> newSchemaTypes = targetSchema.getChildren();

  for (String fieldName : targetSchema.getFieldNames()) {
    if (oldSchemaFieldNames.contains(fieldName) && oldStruct.getFieldValue(fieldName) != null) {
      int fieldIndex = oldSchemaIndex.get(fieldName);

      TypeDescription oldFieldSchema = oldSchemaTypes.get(fieldIndex);
      TypeDescription newFieldSchema = newSchemaTypes.get(indexInNewSchema);

      if (isEvolutionValid(oldFieldSchema, newFieldSchema)) {
        WritableComparable oldField = oldStruct.getFieldValue(fieldName);
        WritableComparable newField = newStruct.getFieldValue(fieldName);
        newField = (newField == null) ? OrcUtils.createValueRecursively(newFieldSchema) : newField;
        newStruct.setFieldValue(fieldName, structConversionHelper(oldField, newField, newFieldSchema));
      } else {
        throw new SchemaEvolution.IllegalEvolutionException(String
            .format("ORC does not support type conversion from file" + " type %s to reader type %s ",
                oldFieldSchema.toString(), newFieldSchema.toString()));
      }
    } else {
      newStruct.setFieldValue(fieldName, null);
    }

    indexInNewSchema++;
  }
}
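A hedged usage sketch, assuming OrcUtils.createValueRecursively (referenced above) materializes an empty value for a given schema:

// Hypothetical: widen struct<a:int> records into an evolved struct<a:bigint,b:string>.
TypeDescription oldSchema = TypeDescription.fromString("struct<a:int>");
TypeDescription newSchema = TypeDescription.fromString("struct<a:bigint,b:string>");

OrcStruct oldStruct = (OrcStruct) OrcUtils.createValueRecursively(oldSchema);
oldStruct.setFieldValue("a", new IntWritable(42));
OrcStruct newStruct = (OrcStruct) OrcUtils.createValueRecursively(newSchema);

OrcUtils.upConvertOrcStruct(oldStruct, newStruct, newSchema);
// "a" is copied (int -> bigint is a valid evolution); the absent "b" is set to null.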
 
Example #28
Source File: ORCSchemaUtil.java    From iceberg with Apache License 2.0
private static boolean isSameType(TypeDescription orcType, Type icebergType) {
  if (icebergType.typeId() == Type.TypeID.TIMESTAMP) {
    Types.TimestampType tsType = (Types.TimestampType) icebergType;
    return Objects.equals(
        tsType.shouldAdjustToUTC() ? TypeDescription.Category.TIMESTAMP_INSTANT : TypeDescription.Category.TIMESTAMP,
        orcType.getCategory());
  } else {
    return Objects.equals(TYPE_MAPPING.get(icebergType.typeId()), orcType.getCategory());
  }
}
 
Example #29
Source File: ORCSchemaUtil.java    From iceberg with Apache License 2.0
private static Map<Integer, OrcField> icebergToOrcMapping(String name, TypeDescription orcType) {
  Map<Integer, OrcField> icebergToOrc = Maps.newHashMap();
  switch (orcType.getCategory()) {
    case STRUCT:
      List<String> childrenNames = orcType.getFieldNames();
      List<TypeDescription> children = orcType.getChildren();
      for (int i = 0; i < children.size(); i++) {
        icebergToOrc.putAll(icebergToOrcMapping(childrenNames.get(i), children.get(i)));
      }
      break;
    case LIST:
      icebergToOrc.putAll(icebergToOrcMapping("element", orcType.getChildren().get(0)));
      break;
    case MAP:
      icebergToOrc.putAll(icebergToOrcMapping("key", orcType.getChildren().get(0)));
      icebergToOrc.putAll(icebergToOrcMapping("value", orcType.getChildren().get(1)));
      break;
  }

  if (orcType.getId() > 0) {
    // Only add to non-root types.
    icebergID(orcType)
        .ifPresent(integer -> icebergToOrc.put(integer, new OrcField(name, orcType)));
  }

  return icebergToOrc;
}
 
Example #30
Source File: OrcBatchReader.java    From Flink-CEPplus with Apache License 2.0
/**
 * Fills an ORC batch into an array of Row.
 *
 * @param rows The batch of rows need to be filled.
 * @param schema The schema of the ORC data.
 * @param batch The ORC data.
 * @param selectedFields The list of selected ORC fields.
 * @return The number of rows that were filled.
 */
static int fillRows(Row[] rows, TypeDescription schema, VectorizedRowBatch batch, int[] selectedFields) {

	int rowsToRead = Math.min((int) batch.count(), rows.length);

	List<TypeDescription> fieldTypes = schema.getChildren();
	// read each selected field
	for (int fieldIdx = 0; fieldIdx < selectedFields.length; fieldIdx++) {
		int orcIdx = selectedFields[fieldIdx];
		readField(rows, fieldIdx, fieldTypes.get(orcIdx), batch.cols[orcIdx], rowsToRead);
	}
	return rowsToRead;
}
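A sketch of the loop that might drive fillRows, assuming package-private access and the core ORC RecordReader API:

// Hypothetical driver: pull vectorized batches and fill them into Row objects.
RecordReader orcRows = orcReader.rows(options);  // orcReader: an org.apache.orc.Reader
VectorizedRowBatch batch = schema.createRowBatch();
Row[] rows = new Row[batch.getMaxSize()];
while (orcRows.nextBatch(batch)) {
  int filled = OrcBatchReader.fillRows(rows, schema, batch, selectedFields);
  for (int i = 0; i < filled; i++) {
    // emit rows[i] downstream
  }
}
orcRows.close();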