Java Code Examples for org.apache.orc.TypeDescription

The following examples show how to use org.apache.orc.TypeDescription. They are extracted from open source projects; the source project, file, and license are noted above each example.
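Before diving into the project examples, here is a minimal self-contained sketch (not taken from any of the projects below) of the core TypeDescription workflow: parsing or building a schema, and allocating a matching row batch.

import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.TypeDescription;

public class TypeDescriptionSketch {
  public static void main(String[] args) {
    // Parse a schema from its string form...
    TypeDescription parsed = TypeDescription.fromString("struct<i:int,s:string>");

    // ...or build the equivalent schema programmatically.
    TypeDescription built = TypeDescription.createStruct()
        .addField("i", TypeDescription.createInt())
        .addField("s", TypeDescription.createString());

    System.out.println(parsed.equals(built)); // true
    System.out.println(built);                // struct<i:int,s:string>

    // A schema can allocate a vectorized row batch matching its columns.
    VectorizedRowBatch batch = parsed.createRowBatch();
    System.out.println(batch.numCols);        // 2
  }
}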
Example 1
Source Project: incubator-gobblin   Source File: OrcKeyComparatorTest.java    License: Apache License 2.0
@Test
public void testSimpleComparator() throws Exception {
  OrcKeyComparator comparator = new OrcKeyComparator();
  Configuration conf = new Configuration();
  String orcSchema = "struct<i:int,j:int>";
  TypeDescription schema = TypeDescription.fromString(orcSchema);
  conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), orcSchema);
  Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), orcSchema);
  comparator.setConf(conf);

  OrcStruct record0 = createSimpleOrcStruct(schema, 1, 2);
  OrcStruct record1 = createSimpleOrcStruct(schema, 3, 4);
  OrcStruct record2 = createSimpleOrcStruct(schema, 3, 4);

  OrcKey orcKey0 = new OrcKey();
  orcKey0.key = record0;
  OrcKey orcKey1 = new OrcKey();
  orcKey1.key = record1;
  OrcKey orcKey2 = new OrcKey();
  orcKey2.key = record2;

  Assert.assertTrue(comparator.compare(orcKey0, orcKey1) < 0);
  Assert.assertTrue(comparator.compare(orcKey1, orcKey2) == 0);
  Assert.assertTrue(comparator.compare(orcKey1, orcKey0) > 0);
}
 
Example 2
Source Project: secor   Source File: JsonORCFileReaderWriterFactory.java    License: Apache License 2.0
public JsonORCFileWriter(LogFilePath logFilePath, CompressionCodec codec)
        throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path(logFilePath.getLogFilePath());
    schema = schemaProvider.getSchema(logFilePath.getTopic(),
            logFilePath);
    if (schema == null) {
        String topic = logFilePath.getTopic();
        throw new IllegalArgumentException(
            String.format("No schema is provided for topic '%s'", topic));
    }
    List<TypeDescription> fieldTypes = schema.getChildren();
    converters = new JsonConverter[fieldTypes.size()];
    for (int c = 0; c < converters.length; ++c) {
        converters[c] = VectorColumnFiller.createConverter(fieldTypes
                .get(c));
    }

    writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf)
            .compress(resolveCompression(codec)).setSchema(schema));
    batch = schema.createRowBatch();
}
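For context, a writer and batch constructed this way are used in ORC's usual fill-and-flush pattern. The following is a minimal self-contained sketch of that pattern with an assumed single-column schema and a made-up output path; the Secor-specific JSON conversion is omitted.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class OrcWriteSketch {
  public static void main(String[] args) throws java.io.IOException {
    TypeDescription schema = TypeDescription.fromString("struct<x:bigint>");
    Writer writer = OrcFile.createWriter(new Path("/tmp/sketch.orc"),
        OrcFile.writerOptions(new Configuration()).setSchema(schema));
    VectorizedRowBatch batch = schema.createRowBatch();
    LongColumnVector x = (LongColumnVector) batch.cols[0];
    for (long row = 0; row < 10_000; row++) {
      int i = batch.size++;
      x.vector[i] = row;
      if (batch.size == batch.getMaxSize()) { // flush a full batch
        writer.addRowBatch(batch);
        batch.reset();
      }
    }
    if (batch.size != 0) { // flush the final partial batch
      writer.addRowBatch(batch);
    }
    writer.close();
  }
}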
 
Example 3
Source Project: Flink-CEPplus   Source File: OrcRowInputFormat.java    License: Apache License 2.0
@SuppressWarnings("unchecked")
private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
	batchSize = in.readInt();
	org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration();
	configuration.readFields(in);

	if (this.conf == null) {
		this.conf = configuration;
	}
	this.schema = TypeDescription.fromString(in.readUTF());

	this.selectedFields = new int[in.readInt()];
	for (int i = 0; i < selectedFields.length; i++) {
		this.selectedFields[i] = in.readInt();
	}

	this.conjunctPredicates = new ArrayList<>();
	int numPreds = in.readInt();
	for (int i = 0; i < numPreds; i++) {
		conjunctPredicates.add((Predicate) in.readObject());
	}
}
 
Example 4
Source Project: flink   Source File: OrcColumnarRowSplitReader.java    License: Apache License 2.0
public OrcColumnarRowSplitReader(
		OrcShim<BATCH> shim,
		Configuration conf,
		TypeDescription schema,
		int[] selectedFields,
		ColumnBatchGenerator<BATCH> batchGenerator,
		List<Predicate> conjunctPredicates,
		int batchSize,
		Path path,
		long splitStart,
		long splitLength) throws IOException {
	super(
			shim,
			conf,
			schema,
			selectedFields,
			conjunctPredicates,
			batchSize,
			path,
			splitStart,
			splitLength);

	this.columnarBatch = batchGenerator.generate(rowBatchWrapper.getBatch());
	this.row = new ColumnarRowData(columnarBatch);
}
 
Example 5
Source Project: flink   Source File: OrcRowInputFormat.java    License: Apache License 2.0
/**
 * Creates an OrcRowInputFormat.
 *
 * @param path The path to read ORC files from.
 * @param orcSchema The schema of the ORC files as ORC TypeDescription.
 * @param orcConfig The configuration to read the ORC files with.
 * @param batchSize The number of Row objects to read in a batch.
 */
public OrcRowInputFormat(String path, TypeDescription orcSchema, Configuration orcConfig, int batchSize) {
	super(new Path(path));

	// configure OrcRowInputFormat
	this.schema = orcSchema;
	this.rowType = (RowTypeInfo) OrcBatchReader.schemaToTypeInfo(schema);
	this.conf = orcConfig;
	this.batchSize = batchSize;

	// set default selection mask, i.e., all fields.
	this.selectedFields = new int[this.schema.getChildren().size()];
	for (int i = 0; i < selectedFields.length; i++) {
		this.selectedFields[i] = i;
	}
}
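A hypothetical call to this constructor could look like the following; the path and schema string are made up for illustration, and the import assumes the flink-orc module's package layout.

import org.apache.flink.orc.OrcRowInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.orc.TypeDescription;

// Illustrative only: read two columns from ORC files under the given path,
// up to 1000 rows per batch.
OrcRowInputFormat format = new OrcRowInputFormat(
    "hdfs:///data/events",
    TypeDescription.fromString("struct<id:bigint,name:string>"),
    new Configuration(),
    1000);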
 
Example 6
Source Project: iceberg   Source File: ORCSchemaUtil.java    License: Apache License 2.0
/**
 * Convert an ORC schema to an Iceberg schema. This method preserves the original Iceberg column
 * mapping IDs if they are present in the ORC column attributes; otherwise, ORC column IDs are
 * assigned following ORC's pre-order ID assignment.
 *
 * @return the Iceberg schema
 */
public static Schema convert(TypeDescription orcSchema) {
  List<TypeDescription> children = orcSchema.getChildren();
  List<String> childrenNames = orcSchema.getFieldNames();
  Preconditions.checkState(children.size() == childrenNames.size(),
      "Error in ORC file, children fields and names do not match.");

  List<Types.NestedField> icebergFields = Lists.newArrayListWithExpectedSize(children.size());
  AtomicInteger lastColumnId = new AtomicInteger(getMaxIcebergId(orcSchema));
  for (int i = 0; i < children.size(); i++) {
    icebergFields.add(convertOrcToIceberg(children.get(i), childrenNames.get(i),
        lastColumnId::incrementAndGet));
  }

  return new Schema(icebergFields);
}
 
Example 7
Source Project: iceberg   Source File: OrcSchemaWithTypeVisitor.java    License: Apache License 2.0
public static <T> T visit(Type iType, TypeDescription schema, OrcSchemaWithTypeVisitor<T> visitor) {
  switch (schema.getCategory()) {
    case STRUCT:
      return visitRecord(iType != null ? iType.asStructType() : null, schema, visitor);

    case UNION:
      throw new UnsupportedOperationException("Cannot handle " + schema);

    case LIST:
      Types.ListType list = iType != null ? iType.asListType() : null;
      return visitor.list(
          list, schema,
          visit(list != null ? list.elementType() : null, schema.getChildren().get(0), visitor));

    case MAP:
      Types.MapType map = iType != null ? iType.asMapType() : null;
      return visitor.map(
          map, schema,
          visit(map != null ? map.keyType() : null, schema.getChildren().get(0), visitor),
          visit(map != null ? map.valueType() : null, schema.getChildren().get(1), visitor));

    default:
      return visitor.primitive(iType != null ? iType.asPrimitiveType() : null, schema);
  }
}
 
Example 8
Source Project: iceberg   Source File: TestBuildOrcProjection.java    License: Apache License 2.0
@Test
public void testProjectionPrimitive() {
  Schema originalSchema = new Schema(
      optional(1, "a", Types.IntegerType.get()),
      optional(2, "b", Types.StringType.get())
  );

  // Original mapping (stored in ORC)
  TypeDescription orcSchema = ORCSchemaUtil.convert(originalSchema);

  // Evolve schema
  Schema evolveSchema = new Schema(
      optional(2, "a", Types.StringType.get()),
      optional(3, "c", Types.DateType.get())  // will produce ORC column c_r3 (new)
  );

  TypeDescription newOrcSchema = ORCSchemaUtil.buildOrcProjection(evolveSchema, orcSchema);
  assertEquals(2, newOrcSchema.getChildren().size());
  assertEquals(1, newOrcSchema.findSubtype("b").getId());
  assertEquals(TypeDescription.Category.STRING, newOrcSchema.findSubtype("b").getCategory());
  assertEquals(2, newOrcSchema.findSubtype("c_r3").getId());
  assertEquals(TypeDescription.Category.DATE, newOrcSchema.findSubtype("c_r3").getCategory());
}
 
Example 9
Source Project: iceberg   Source File: TestBuildOrcProjection.java    License: Apache License 2.0
@Test
public void testProjectionNestedNoOp() {
  Types.StructType nestedStructType = Types.StructType.of(
      optional(2, "b", Types.StringType.get()),
      optional(3, "c", Types.DateType.get())
  );
  Schema originalSchema = new Schema(
      optional(1, "a", nestedStructType)
  );

  // Original mapping (stored in ORC)
  TypeDescription orcSchema = ORCSchemaUtil.convert(originalSchema);

  TypeDescription newOrcSchema = ORCSchemaUtil.buildOrcProjection(originalSchema, orcSchema);
  assertEquals(1, newOrcSchema.getChildren().size());
  assertEquals(TypeDescription.Category.STRUCT, newOrcSchema.findSubtype("a").getCategory());
  TypeDescription nestedCol = newOrcSchema.findSubtype("a");
  assertEquals(2, nestedCol.findSubtype("b").getId());
  assertEquals(TypeDescription.Category.STRING, nestedCol.findSubtype("b").getCategory());
  assertEquals(3, nestedCol.findSubtype("c").getId());
  assertEquals(TypeDescription.Category.DATE, nestedCol.findSubtype("c").getCategory());
}
 
Example 10
Source Project: iceberg   Source File: TestBuildOrcProjection.java    License: Apache License 2.0
@Test
public void testEvolutionAddContainerField() {
  Schema baseSchema = new Schema(
      required(1, "a", Types.IntegerType.get())
  );
  TypeDescription baseOrcSchema = ORCSchemaUtil.convert(baseSchema);

  Schema evolvedSchema = new Schema(
      required(1, "a", Types.IntegerType.get()),
      optional(2, "b", Types.StructType.of(
          required(3, "c", Types.LongType.get())
      ))
  );

  TypeDescription newOrcSchema = ORCSchemaUtil.buildOrcProjection(evolvedSchema, baseOrcSchema);
  assertEquals(2, newOrcSchema.getChildren().size());
  assertEquals(TypeDescription.Category.INT, newOrcSchema.findSubtype("a").getCategory());
  assertEquals(2, newOrcSchema.findSubtype("b_r2").getId());
  assertEquals(TypeDescription.Category.STRUCT, newOrcSchema.findSubtype("b_r2").getCategory());
  TypeDescription nestedCol = newOrcSchema.findSubtype("b_r2");
  assertEquals(3, nestedCol.findSubtype("c_r3").getId());
  assertEquals(TypeDescription.Category.LONG, nestedCol.findSubtype("c_r3").getCategory());
}
 
Example 11
Source Project: iceberg   Source File: ORC.java    License: Apache License 2.0
public OrcIterator build() {
  Preconditions.checkNotNull(schema, "Schema is required");
  try {
    Path path = new Path(file.location());
    Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
    ColumnIdMap columnIds = new ColumnIdMap();
    TypeDescription orcSchema = TypeConversion.toOrc(schema, columnIds);
    Reader.Options options = reader.options();
    if (start != null) {
      options.range(start, length);
    }
    options.schema(orcSchema);
    return new OrcIterator(path, orcSchema, reader.rows(options));
  } catch (IOException e) {
    throw new RuntimeException("Can't open " + file.location(), e);
  }
}
 
Example 12
Source Project: iceberg   Source File: TestORCSchemaUtil.java    License: Apache License 2.0
@Test
public void testInvalidTypePromotions() {
  Schema originalSchema = new Schema(
      optional(1, "a", Types.LongType.get())
  );

  TypeDescription orcSchema = ORCSchemaUtil.convert(originalSchema);
  Schema evolveSchema = new Schema(
      optional(1, "a", Types.IntegerType.get())
  );

  assertThrows("Should not allow invalid type promotion",
      IllegalArgumentException.class, "Can not promote", () -> {
        ORCSchemaUtil.buildOrcProjection(evolveSchema, orcSchema);
      });
}
 
Example 13
Source Project: Flink-CEPplus   Source File: OrcBatchReader.java    License: Apache License 2.0
/**
 * Fills an ORC batch into an array of Row.
 *
 * @param rows The batch of rows to be filled.
 * @param schema The schema of the ORC data.
 * @param batch The ORC data.
 * @param selectedFields The list of selected ORC fields.
 * @return The number of rows that were filled.
 */
static int fillRows(Row[] rows, TypeDescription schema, VectorizedRowBatch batch, int[] selectedFields) {

	int rowsToRead = Math.min((int) batch.count(), rows.length);

	List<TypeDescription> fieldTypes = schema.getChildren();
	// read each selected field
	for (int fieldIdx = 0; fieldIdx < selectedFields.length; fieldIdx++) {
		int orcIdx = selectedFields[fieldIdx];
		readField(rows, fieldIdx, fieldTypes.get(orcIdx), batch.cols[orcIdx], rowsToRead);
	}
	return rowsToRead;
}
 
Example 14
Source Project: secor   Source File: VectorColumnFiller.java    License: Apache License 2.0
public UnionColumnConverter(TypeDescription schema) {
    List<TypeDescription> children = schema.getChildren();
    int index = 0;
    for (TypeDescription childType : children) {
        JsonType jsonType = getJsonType(childType.getCategory());
        JsonConverter converter = createConverter(childType);
        // FIXME: Handle cases where childConverters is pre-occupied with the same mask
        childConverters.put(jsonType, new ConverterInfo(index++, converter));
    }
}
 
Example 15
Source Project: Flink-CEPplus   Source File: OrcRowInputFormat.java    License: Apache License 2.0
/**
 * Computes the ORC projection mask of the fields to include from the selected fields.
 *
 * @return The ORC projection mask.
 */
private boolean[] computeProjectionMask() {
	// mask with all fields of the schema
	boolean[] projectionMask = new boolean[schema.getMaximumId() + 1];
	// for each selected field
	for (int inIdx : selectedFields) {
		// set all nested fields of a selected field to true
		TypeDescription fieldSchema = schema.getChildren().get(inIdx);
		for (int i = fieldSchema.getId(); i <= fieldSchema.getMaximumId(); i++) {
			projectionMask[i] = true;
		}
	}
	return projectionMask;
}
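When the input format later opens a split, a mask like this is what ORC's reader API consumes. Below is a hedged sketch of that hand-off using the plain org.apache.orc reader; the file path is illustrative, and computeProjectionMask() stands in for the method above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

// Illustrative: open an ORC file and read only the columns enabled in the mask.
Reader reader = OrcFile.createReader(new Path("/tmp/data.orc"),
    OrcFile.readerOptions(new Configuration()));
boolean[] mask = computeProjectionMask(); // the method above
RecordReader rows = reader.rows(reader.options().include(mask));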
 
Example 16
Source Project: flink   Source File: OrcBatchReader.java    License: Apache License 2.0
private static void readNonNullStructColumn(Object[] vals, int fieldIdx, StructColumnVector structVector, TypeDescription schema, int childCount) {

		List<TypeDescription> childrenTypes = schema.getChildren();

		int numFields = childrenTypes.size();
		// create a batch of Rows to read the structs
		Row[] structs = new Row[childCount];
		// TODO: possible improvement: reuse existing Row objects
		for (int i = 0; i < childCount; i++) {
			structs[i] = new Row(numFields);
		}

		// read struct fields
		// we don't have to handle isRepeating because ORC assumes that it is propagated into the children.
		for (int i = 0; i < numFields; i++) {
			readField(structs, i, childrenTypes.get(i), structVector.fields[i], childCount);
		}

		if (fieldIdx == -1) { // set struct as an object
			System.arraycopy(structs, 0, vals, 0, childCount);
		} else { // set struct as a field of Row
			Row[] rows = (Row[]) vals;
			for (int i = 0; i < childCount; i++) {
				rows[i].setField(fieldIdx, structs[i]);
			}
		}
	}
 
Example 17
Source Project: flink   Source File: OrcFileSystemFormatFactory.java    License: Apache License 2.0
@Override
public Optional<BulkWriter.Factory<RowData>> createBulkWriterFactory(WriterContext context) {
	LogicalType[] orcTypes = Arrays.stream(context.getFormatFieldTypes())
			.map(DataType::getLogicalType)
			.toArray(LogicalType[]::new);

	TypeDescription typeDescription = OrcSplitReaderUtil.logicalTypeToOrcType(
			RowType.of(orcTypes, context.getFormatFieldNames()));

	OrcBulkWriterFactory<RowData> factory = new OrcBulkWriterFactory<>(
			new RowDataVectorizer(typeDescription.toString(), orcTypes),
			getOrcProperties(context.getFormatOptions()),
			new Configuration());
	return Optional.of(factory);
}
 
Example 18
Source Project: tajo   Source File: OrcUtils.java    License: Apache License 2.0
public static TypeDescription convertTypeInfo(TypeDesc desc) {
  switch (desc.getDataType().getType()) {
    case BOOLEAN:
      return TypeDescription.createBoolean();
    case BIT:
      return TypeDescription.createByte();
    case INT2:
      return TypeDescription.createShort();
    case INT4:
      return TypeDescription.createInt();
    case INT8:
      return TypeDescription.createLong();
    case FLOAT4:
      return TypeDescription.createFloat();
    case FLOAT8:
      return TypeDescription.createDouble();
    case TEXT:
      return TypeDescription.createString();
    case DATE:
      return TypeDescription.createDate();
    case TIMESTAMP:
      return TypeDescription.createTimestamp();
    case BLOB:
      return TypeDescription.createBinary();
    case CHAR:
      return TypeDescription.createChar()
          .withMaxLength(desc.getDataType().getLength());
    case RECORD: {
      TypeDescription result = TypeDescription.createStruct();
      for (Column eachColumn : desc.getNestedSchema().getRootColumns()) {
        result.addField(eachColumn.getQualifiedName(),
            convertTypeInfo(eachColumn.getTypeDesc()));
      }
      return result;
    }
    default:
      throw new TajoRuntimeException(new UnsupportedDataTypeException(desc.getDataType().getType().name()));
  }
}
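As a side note on the builder calls above: TypeDescription#toString() renders the same schema-string syntax that TypeDescription.fromString(...) accepts, so round trips are easy to check. For instance (a hypothetical check, not from Tajo):

import org.apache.orc.TypeDescription;

// The CHAR branch above yields a bounded character type:
TypeDescription chr = TypeDescription.createChar().withMaxLength(10);
System.out.println(chr);                                              // char(10)
System.out.println(TypeDescription.fromString("char(10)").equals(chr)); // true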
 
Example 19
Source Project: iceberg   Source File: SparkOrcWriter.java    License: Apache License 2.0
private static Converter[] buildConverters(TypeDescription schema) {
  if (schema.getCategory() != TypeDescription.Category.STRUCT) {
    throw new IllegalArgumentException("Top level must be a struct " + schema);
  }
  List<TypeDescription> children = schema.getChildren();
  Converter[] result = new Converter[children.size()];
  for (int c = 0; c < children.size(); ++c) {
    result[c] = buildConverter(children.get(c));
  }
  return result;
}
 
Example 20
Source Project: incubator-gobblin   Source File: OrcValueMapperTest.java    License: Apache License 2.0
@Test
public void testIsEvolutionValid() {
  TypeDescription schema_1 = TypeDescription.fromString("struct<i:int,j:int,k:int>");
  TypeDescription schema_2 = TypeDescription.fromString("struct<i:int,j:int,k:bigint>");
  TypeDescription schema_3 = TypeDescription.fromString("struct<i:int,j:int,k:tinyint>");
  TypeDescription schema_4 = TypeDescription.fromString("struct<i:int,j:int>");
  Assert.assertTrue(OrcUtils.isEvolutionValid(schema_1, schema_2));
  Assert.assertTrue(OrcUtils.isEvolutionValid(schema_1, schema_3));
  Assert.assertTrue(OrcUtils.isEvolutionValid(schema_1, schema_4));
  Assert.assertTrue(OrcUtils.isEvolutionValid(schema_4, schema_1));
}
 
Example 21
Source Project: flink   Source File: OrcRowInputFormat.java    License: Apache License 2.0
/**
 * Computes the ORC projection mask of the fields to include from the selected fields.
 *
 * @return The ORC projection mask.
 */
private boolean[] computeProjectionMask() {
	// mask with all fields of the schema
	boolean[] projectionMask = new boolean[schema.getMaximumId() + 1];
	// for each selected field
	for (int inIdx : selectedFields) {
		// set all nested fields of a selected field to true
		TypeDescription fieldSchema = schema.getChildren().get(inIdx);
		for (int i = fieldSchema.getId(); i <= fieldSchema.getMaximumId(); i++) {
			projectionMask[i] = true;
		}
	}
	return projectionMask;
}
 
Example 22
Source Project: incubator-gobblin   Source File: OrcUtilsTest.java    License: Apache License 2.0
/**
 * Just a sanity test for column projection; there should be no difference from the other cases when a reader schema is provided.
 */
@Test
public void testOrcStructProjection() throws Exception {
  TypeDescription originalSchema = TypeDescription.fromString("struct<a:struct<a:int,b:int>,b:struct<c:int,d:int>,c:int>");
  OrcStruct originalStruct = (OrcStruct) OrcUtils.createValueRecursively(originalSchema);
  OrcTestUtils.fillOrcStructWithFixedValue(originalStruct, originalSchema, intValue, stringValue, boolValue);

  TypeDescription projectedSchema = TypeDescription.fromString("struct<a:struct<b:int>,b:struct<c:int>>");
  OrcStruct projectedStructExpectedValue = (OrcStruct) OrcUtils.createValueRecursively(projectedSchema);
  OrcTestUtils
      .fillOrcStructWithFixedValue(projectedStructExpectedValue, projectedSchema, intValue, stringValue, boolValue);
  OrcStruct projectColumnStruct = (OrcStruct) OrcUtils.createValueRecursively(projectedSchema);
  OrcUtils.upConvertOrcStruct(originalStruct, projectColumnStruct, projectedSchema);
  Assert.assertEquals(projectColumnStruct, projectedStructExpectedValue);
}
 
Example 23
Source Project: iceberg   Source File: ORCSchemaUtil.java    License: Apache License 2.0
public static TypeDescription convert(Schema schema) {
  final TypeDescription root = TypeDescription.createStruct();
  final Types.StructType schemaRoot = schema.asStruct();
  for (Types.NestedField field : schemaRoot.asStructType().fields()) {
    TypeDescription orcColumnType = convert(field.fieldId(), field.type(), field.isRequired());
    root.addField(field.name(), orcColumnType);
  }
  return root;
}
 
Example 24
Source Project: iceberg   Source File: ORCSchemaUtil.java    License: Apache License 2.0
private static Map<Integer, OrcField> icebergToOrcMapping(String name, TypeDescription orcType) {
  Map<Integer, OrcField> icebergToOrc = Maps.newHashMap();
  switch (orcType.getCategory()) {
    case STRUCT:
      List<String> childrenNames = orcType.getFieldNames();
      List<TypeDescription> children = orcType.getChildren();
      for (int i = 0; i < children.size(); i++) {
        icebergToOrc.putAll(icebergToOrcMapping(childrenNames.get(i), children.get(i)));
      }
      break;
    case LIST:
      icebergToOrc.putAll(icebergToOrcMapping("element", orcType.getChildren().get(0)));
      break;
    case MAP:
      icebergToOrc.putAll(icebergToOrcMapping("key", orcType.getChildren().get(0)));
      icebergToOrc.putAll(icebergToOrcMapping("value", orcType.getChildren().get(1)));
      break;
  }

  if (orcType.getId() > 0) {
    // Only add to non-root types.
    icebergID(orcType)
        .ifPresent(integer -> icebergToOrc.put(integer, new OrcField(name, orcType)));
  }

  return icebergToOrc;
}
 
Example 25
Source Project: iceberg   Source File: ORCSchemaUtil.java    License: Apache License 2.0
private static boolean isSameType(TypeDescription orcType, Type icebergType) {
  if (icebergType.typeId() == Type.TypeID.TIMESTAMP) {
    Types.TimestampType tsType = (Types.TimestampType) icebergType;
    return Objects.equals(
        tsType.shouldAdjustToUTC() ? TypeDescription.Category.TIMESTAMP_INSTANT : TypeDescription.Category.TIMESTAMP,
        orcType.getCategory());
  } else {
    return Objects.equals(TYPE_MAPPING.get(icebergType.typeId()), orcType.getCategory());
  }
}
 
Example 26
Source Project: incubator-gobblin   Source File: OrcUtils.java    License: Apache License 2.0
/**
 * Recursively converts {@param oldStruct} into {@param newStruct}, whose schema is {@param targetSchema}.
 * This serves a similar purpose to Avro's GenericDatumReader, which accepts a reader schema and a writer
 * schema so that users can convert bytes into the reader's schema in a compatible way.
 * Calling this method SHALL NOT cause any side effect on {@param oldStruct}; the value of each field in
 * {@param oldStruct} is copied into {@param newStruct} recursively. Please avoid unnecessary calls, as this
 * can be expensive if the struct schema is complicated or contains container objects like array/map.
 *
 * Note that if {@param newStruct} contains container types like List/Map, the up-conversion does two things:
 * 1. Clears all elements in the original containers.
 * 2. Populates the container elements of {@param oldStruct} into {@param newStruct} using the element type
 * of {@param newStruct}, if compatible.
 *
 * Limitations:
 * 1. Does not support up-conversion of key types in Maps. The underlying reason is that the primary
 * upstream format is Avro, which enforces key types to be string only.
 * 2. Conversion from a field A to a field B only happens if
 * org.apache.gobblin.compaction.mapreduce.orc.OrcValueMapper#isEvolutionValid(A,B) returns true.
 */
@VisibleForTesting
public static void upConvertOrcStruct(OrcStruct oldStruct, OrcStruct newStruct, TypeDescription targetSchema) {

  // If the target schema is not equal to newStruct's schema, it is an illegal state and doesn't make sense to proceed.
  Preconditions.checkArgument(newStruct.getSchema().equals(targetSchema));

  int indexInNewSchema = 0;
  List<String> oldSchemaFieldNames = oldStruct.getSchema().getFieldNames();
  /* Construct a fieldName -> index map for efficient access within the loop below. */
  Map<String, Integer> oldSchemaIndex = IntStream.range(0, oldSchemaFieldNames.size()).boxed()
      .collect(Collectors.toMap(oldSchemaFieldNames::get, Function.identity()));
  List<TypeDescription> oldSchemaTypes = oldStruct.getSchema().getChildren();
  List<TypeDescription> newSchemaTypes = targetSchema.getChildren();

  for (String fieldName : targetSchema.getFieldNames()) {
    if (oldSchemaFieldNames.contains(fieldName) && oldStruct.getFieldValue(fieldName) != null) {
      int fieldIndex = oldSchemaIndex.get(fieldName);

      TypeDescription oldFieldSchema = oldSchemaTypes.get(fieldIndex);
      TypeDescription newFieldSchema = newSchemaTypes.get(indexInNewSchema);

      if (isEvolutionValid(oldFieldSchema, newFieldSchema)) {
        WritableComparable oldField = oldStruct.getFieldValue(fieldName);
        WritableComparable newField = newStruct.getFieldValue(fieldName);
        newField = (newField == null) ? OrcUtils.createValueRecursively(newFieldSchema) : newField;
        newStruct.setFieldValue(fieldName, structConversionHelper(oldField, newField, newFieldSchema));
      } else {
        throw new SchemaEvolution.IllegalEvolutionException(String
            .format("ORC does not support type conversion from file" + " type %s to reader type %s ",
                oldFieldSchema.toString(), newFieldSchema.toString()));
      }
    } else {
      newStruct.setFieldValue(fieldName, null);
    }

    indexInNewSchema++;
  }
}
 
Example 27
Source Project: pentaho-hadoop-shims   Source File: OrcSchemaConverter.java    License: Apache License 2.0
private int determineMetaType( TypeDescription subDescription ) {
  switch ( subDescription.getCategory().getName() ) {
    case "string":
    case "char":
    case "varchar":
      return ValueMetaInterface.TYPE_STRING;
    case "bigint":
    case "tinyint":
    case "smallint":
    case "int":
      return ValueMetaInterface.TYPE_INTEGER;
    case "double":
    case "float":
      return ValueMetaInterface.TYPE_NUMBER;
    case "decimal":
      return ValueMetaInterface.TYPE_BIGNUMBER;
    case "timestamp":
      return ValueMetaInterface.TYPE_TIMESTAMP;
    case "date":
      return ValueMetaInterface.TYPE_DATE;
    case "boolean":
      return ValueMetaInterface.TYPE_BOOLEAN;
    case "binary":
      return ValueMetaInterface.TYPE_BINARY;
  }
  // if none of the cases match, return -1
  return -1;
}
 
Example 28
Source Project: iceberg   Source File: ORCSchemaUtil.java    License: Apache License 2.0
private static boolean isRequired(TypeDescription orcType) {
  String isRequiredStr = orcType.getAttributeValue(ICEBERG_REQUIRED_ATTRIBUTE);
  if (isRequiredStr != null) {
    return Boolean.parseBoolean(isRequiredStr);
  }
  return false;
}
 
Example 29
Source Project: iceberg   Source File: SparkOrcWriter.java    License: Apache License 2.0
private static Converter buildConverter(TypeDescription schema) {
  switch (schema.getCategory()) {
    case BOOLEAN:
      return new BooleanConverter();
    case BYTE:
      return new ByteConverter();
    case SHORT:
      return new ShortConverter();
    case DATE:
    case INT:
      return new IntConverter();
    case LONG:
      return new LongConverter();
    case FLOAT:
      return new FloatConverter();
    case DOUBLE:
      return new DoubleConverter();
    case BINARY:
      return new BytesConverter();
    case STRING:
    case CHAR:
    case VARCHAR:
      return new StringConverter();
    case DECIMAL:
      return schema.getPrecision() <= 18 ?
          new Decimal18Converter(schema) :
          new Decimal38Converter(schema);
    case TIMESTAMP_INSTANT:
      return new TimestampTzConverter();
    case STRUCT:
      return new StructConverter(schema);
    case LIST:
      return new ListConverter(schema);
    case MAP:
      return new MapConverter(schema);
  }
  throw new IllegalArgumentException("Unhandled type " + schema);
}
 
Example 30
Source Project: iceberg   Source File: OrcMetrics.java    License: Apache License 2.0
private Set<TypeDescription> flatten(TypeDescription rootType) {
  if (rootType == null) {
    return ImmutableSet.of();
  }

  final Set<TypeDescription> flatTypes = Sets.newHashSetWithExpectedSize(rootType.getMaximumId());
  final Queue<TypeDescription> queue = Queues.newLinkedBlockingQueue();
  queue.add(rootType);
  while (!queue.isEmpty()) {
    TypeDescription type = queue.remove();
    flatTypes.add(type);
    queue.addAll(Optional.ofNullable(type.getChildren()).orElse(ImmutableList.of()));
  }
  return flatTypes;
}