org.apache.orc.OrcConf Java Examples
The following examples show how to use org.apache.orc.OrcConf. Each snippet is taken from an open-source project; the source file and license are noted above each example.
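Before the project snippets, here is a minimal, self-contained sketch of the access pattern they all share: each OrcConf constant exposes the underlying Hadoop property name via getAttribute() and typed getters/setters against a Configuration. The method names (getAttribute, setString, getString, getBoolean, getLong, getDefaultValue) are the same ones used in the examples below; the property values chosen here are purely illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.orc.OrcConf;

public class OrcConfBasics {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Write-side knobs: pick an ORC compression codec and writer version (illustrative values).
        OrcConf.COMPRESS.setString(conf, "SNAPPY");
        OrcConf.WRITE_FORMAT.setString(conf, "0.12");

        // Read-side knobs resolve to their defaults until something overrides them.
        boolean zeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(conf);
        long stripeSize = OrcConf.STRIPE_SIZE.getLong(conf);

        // getAttribute() returns the raw property key, which several examples below
        // pass to Configuration.set()/get() or to table properties directly.
        System.out.println(OrcConf.COMPRESS.getAttribute() + " = " + OrcConf.COMPRESS.getString(conf));
        System.out.println("zero-copy: " + zeroCopy + ", stripe size: " + stripeSize);
        System.out.println("default compression: " + OrcConf.COMPRESS.getDefaultValue());
    }
}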
Example #1
Source File: TestCachingOrcDataSource.java From presto with Apache License 2.0
private static FileSinkOperator.RecordWriter createOrcRecordWriter(File outputFile, Format format, CompressionKind compression, ObjectInspector columnObjectInspector)
        throws IOException {
    JobConf jobConf = new JobConf();
    OrcConf.WRITE_FORMAT.setString(jobConf, format == ORC_12 ? "0.12" : "0.11");
    OrcConf.COMPRESS.setString(jobConf, compression.name());

    Properties tableProperties = new Properties();
    tableProperties.setProperty(IOConstants.COLUMNS, "test");
    tableProperties.setProperty(IOConstants.COLUMNS_TYPES, columnObjectInspector.getTypeName());
    tableProperties.setProperty(OrcConf.STRIPE_SIZE.getAttribute(), "120000");

    return new OrcOutputFormat().getHiveRecordWriter(
        jobConf,
        new Path(outputFile.toURI()),
        Text.class,
        compression != NONE,
        tableProperties,
        () -> {});
}
Example #2
Source File: OrcKeyComparatorTest.java From incubator-gobblin with Apache License 2.0
@Test
public void testSimpleComparator() throws Exception {
    OrcKeyComparator comparator = new OrcKeyComparator();
    Configuration conf = new Configuration();
    String orcSchema = "struct<i:int,j:int>";
    TypeDescription schema = TypeDescription.fromString(orcSchema);
    conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), orcSchema);
    Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), orcSchema);
    comparator.setConf(conf);

    OrcStruct record0 = createSimpleOrcStruct(schema, 1, 2);
    OrcStruct record1 = createSimpleOrcStruct(schema, 3, 4);
    OrcStruct record2 = createSimpleOrcStruct(schema, 3, 4);

    OrcKey orcKey0 = new OrcKey();
    orcKey0.key = record0;
    OrcKey orcKey1 = new OrcKey();
    orcKey1.key = record1;
    OrcKey orcKey2 = new OrcKey();
    orcKey2.key = record2;

    Assert.assertTrue(comparator.compare(orcKey0, orcKey1) < 0);
    Assert.assertTrue(comparator.compare(orcKey1, orcKey2) == 0);
    Assert.assertTrue(comparator.compare(orcKey1, orcKey0) > 0);
}
Example #3
Source File: ORCAppender.java From tajo with Apache License 2.0
private static CompressionKind getCompressionKind(TableMeta meta) {
    String kindstr = meta.getProperty(OrcConf.COMPRESS.getAttribute(),
        String.valueOf(OrcConf.COMPRESS.getDefaultValue()));

    if (kindstr.equalsIgnoreCase(CompressionKind.ZLIB.name())) {
        return CompressionKind.ZLIB;
    }

    if (kindstr.equalsIgnoreCase(CompressionKind.SNAPPY.name())) {
        return CompressionKind.SNAPPY;
    }

    if (kindstr.equalsIgnoreCase(CompressionKind.LZO.name())) {
        return CompressionKind.LZO;
    }

    return CompressionKind.NONE;
}
Example #4
Source File: ORCAppender.java From tajo with Apache License 2.0
private static OrcFile.WriterOptions buildWriterOptions(Configuration conf, TableMeta meta, Schema schema) {
    return OrcFile.writerOptions(conf)
        .setSchema(OrcUtils.convertSchema(schema))
        .compress(getCompressionKind(meta))
        .stripeSize(Long.parseLong(meta.getProperty(OrcConf.STRIPE_SIZE.getAttribute(),
            String.valueOf(OrcConf.STRIPE_SIZE.getDefaultValue()))))
        .blockSize(Long.parseLong(meta.getProperty(OrcConf.BLOCK_SIZE.getAttribute(),
            String.valueOf(OrcConf.BLOCK_SIZE.getDefaultValue()))))
        .rowIndexStride(Integer.parseInt(meta.getProperty(OrcConf.ROW_INDEX_STRIDE.getAttribute(),
            String.valueOf(OrcConf.ROW_INDEX_STRIDE.getDefaultValue()))))
        .bufferSize(Integer.parseInt(meta.getProperty(OrcConf.BUFFER_SIZE.getAttribute(),
            String.valueOf(OrcConf.BUFFER_SIZE.getDefaultValue()))))
        .blockPadding(Boolean.parseBoolean(meta.getProperty(OrcConf.BLOCK_PADDING.getAttribute(),
            String.valueOf(OrcConf.BLOCK_PADDING.getDefaultValue()))))
        .encodingStrategy(EncodingStrategy.valueOf(meta.getProperty(OrcConf.ENCODING_STRATEGY.getAttribute(),
            String.valueOf(OrcConf.ENCODING_STRATEGY.getDefaultValue()))))
        .bloomFilterFpp(Double.parseDouble(meta.getProperty(OrcConf.BLOOM_FILTER_FPP.getAttribute(),
            String.valueOf(OrcConf.BLOOM_FILTER_FPP.getDefaultValue()))))
        .bloomFilterColumns(meta.getProperty(OrcConf.BLOOM_FILTER_COLUMNS.getAttribute(),
            String.valueOf(OrcConf.BLOOM_FILTER_COLUMNS.getDefaultValue())));
}
Example #5
Source File: CompressionConfigUtil.java From presto with Apache License 2.0
public static void configureCompression(Configuration config, HiveCompressionCodec compressionCodec) {
    boolean compression = compressionCodec != HiveCompressionCodec.NONE;
    config.setBoolean(COMPRESSRESULT.varname, compression);
    config.setBoolean("mapred.output.compress", compression);
    config.setBoolean(FileOutputFormat.COMPRESS, compression);

    // For ORC
    OrcConf.COMPRESS.setString(config, compressionCodec.getOrcCompressionKind().name());

    // For RCFile and Text
    if (compressionCodec.getCodec().isPresent()) {
        config.set("mapred.output.compression.codec", compressionCodec.getCodec().get().getName());
        config.set(FileOutputFormat.COMPRESS_CODEC, compressionCodec.getCodec().get().getName());
    } else {
        config.unset("mapred.output.compression.codec");
        config.unset(FileOutputFormat.COMPRESS_CODEC);
    }

    // For Parquet
    config.set(ParquetOutputFormat.COMPRESSION, compressionCodec.getParquetCompressionCodec().name());

    // For SequenceFile
    config.set(FileOutputFormat.COMPRESS_TYPE, BLOCK.toString());
}
Example #6
Source File: OrcFileWriterFactory.java From presto with Apache License 2.0
private static CompressionKind getCompression(Properties schema, JobConf configuration) {
    String compressionName = OrcConf.COMPRESS.getString(schema, configuration);
    if (compressionName == null) {
        return CompressionKind.ZLIB;
    }

    CompressionKind compression;
    try {
        compression = CompressionKind.valueOf(compressionName.toUpperCase(ENGLISH));
    } catch (IllegalArgumentException e) {
        throw new PrestoException(HIVE_UNSUPPORTED_FORMAT, "Unknown ORC compression type " + compressionName);
    }
    return compression;
}
Example #7
Source File: TestOrcWrite.java From iceberg with Apache License 2.0
@Test
public void testBasicWrite() throws IOException {
    File parent = temp.newFolder("orc");
    File location = new File(parent, "test");
    location.mkdirs();

    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build();
    Table table = tables.create(SCHEMA, spec, location.toString());
    table.updateProperties()
        .defaultFormat(FileFormat.ORC)
        .set(OrcConf.COMPRESS.getAttribute(), CompressionKind.NONE.name())
        .commit();

    List<SimpleRecord> expected = Lists.newArrayList(
        new SimpleRecord(1, "a"),
        new SimpleRecord(2, "b"),
        new SimpleRecord(3, "c")
    );

    Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

    // TODO: incoming columns must be ordered according to the table's schema
    df.select("id", "data").write()
        .format("iceberg")
        .mode("append")
        .save(location.toString());

    table.refresh();

    Dataset<Row> result = spark.read()
        .format("iceberg")
        .load(location.toString());

    List<SimpleRecord> actual = result.orderBy("id").as(
        Encoders.bean(SimpleRecord.class)).collectAsList();

    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
    Assert.assertEquals("Result rows should match", expected, actual);
}
Example #8
Source File: OrcTester.java From presto with Apache License 2.0
static RecordWriter createOrcRecordWriter(File outputFile, Format format, CompressionKind compression, Type type)
        throws IOException {
    JobConf jobConf = new JobConf();
    OrcConf.WRITE_FORMAT.setString(jobConf, format == ORC_12 ? "0.12" : "0.11");
    OrcConf.COMPRESS.setString(jobConf, compression.name());

    return new OrcOutputFormat().getHiveRecordWriter(
        jobConf,
        new Path(outputFile.toURI()),
        Text.class,
        compression != NONE,
        createTableProperties("test", getJavaObjectInspector(type).getTypeName()),
        () -> {});
}
Example #9
Source File: OrcValueMapper.java From incubator-gobblin with Apache License 2.0
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    this.jobConf = new JobConf(context.getConfiguration());
    this.outKey = new OrcKey();
    this.outKey.configure(jobConf);
    this.outValue = new OrcValue();
    this.outValue.configure(jobConf);
    this.mrOutputSchema =
        TypeDescription.fromString(context.getConfiguration().get(OrcConf.MAPRED_INPUT_SCHEMA.getAttribute()));
    this.shuffleKeySchema =
        TypeDescription.fromString(context.getConfiguration().get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()));
}
Example #10
Source File: CompactionOrcJobConfigurator.java From incubator-gobblin with Apache License 2.0
protected void configureSchema(Job job) throws IOException {
    TypeDescription schema = OrcUtils.getNewestSchemaFromSource(job, this.fs);
    job.getConfiguration().set(OrcConf.MAPRED_INPUT_SCHEMA.getAttribute(), schema.toString());
    job.getConfiguration().set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(),
        orcMapperShuffleSchemaString.isEmpty() ? schema.toString() : orcMapperShuffleSchemaString);
    job.getConfiguration().set(OrcConf.MAPRED_SHUFFLE_VALUE_SCHEMA.getAttribute(), schema.toString());
    job.getConfiguration().set(OrcConf.MAPRED_OUTPUT_SCHEMA.getAttribute(), schema.toString());
}
Example #11
Source File: OrcNoHiveShim.java From flink with Apache License 2.0
@Override
public RecordReader createRecordReader(
        Configuration conf,
        TypeDescription schema,
        int[] selectedFields,
        List<OrcSplitReader.Predicate> conjunctPredicates,
        org.apache.flink.core.fs.Path path,
        long splitStart,
        long splitLength) throws IOException {
    // open ORC file and create reader
    org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(path.toUri());

    Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));

    // get offset and length for the stripes that start in the split
    Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(
        splitStart, splitLength, orcReader.getStripes());

    // create ORC row reader configuration
    Reader.Options options = new Reader.Options()
        .schema(schema)
        .range(offsetAndLength.f0, offsetAndLength.f1)
        .useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
        .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
        .tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

    // TODO configure filters

    // configure selected fields
    options.include(computeProjectionMask(schema, selectedFields));

    // create ORC row reader
    RecordReader orcRowsReader = orcReader.rows(options);

    // assign ids
    schema.getId();

    return orcRowsReader;
}
Example #12
Source File: TestCompressionStorages.java From tajo with Apache License 2.0
private void storageCompressionTest(String dataFormat, Class<? extends CompressionCodec> codec) throws IOException {
    Schema schema = SchemaBuilder.builder()
        .add("id", Type.INT4)
        .add("age", Type.FLOAT4)
        .add("name", Type.TEXT)
        .build();

    TableMeta meta = CatalogUtil.newTableMeta(dataFormat, conf);
    meta.putProperty("compression.codec", codec.getCanonicalName());
    meta.putProperty("compression.type", SequenceFile.CompressionType.BLOCK.name());
    meta.putProperty("rcfile.serde", TextSerializerDeserializer.class.getName());
    meta.putProperty("sequencefile.serde", TextSerializerDeserializer.class.getName());

    if (codec.equals(SnappyCodec.class)) {
        meta.putProperty(OrcConf.COMPRESS.getAttribute(), "SNAPPY");
    } else if (codec.equals(Lz4Codec.class)) {
        meta.putProperty(OrcConf.COMPRESS.getAttribute(), "ZLIB");
    } else {
        meta.putProperty(OrcConf.COMPRESS.getAttribute(), "NONE");
    }

    String fileName = "Compression_" + codec.getSimpleName();
    Path tablePath = new Path(testDir, fileName);
    Appender appender = ((FileTablespace) TablespaceManager.getLocalFs()).getAppender(meta, schema, tablePath);
    appender.enableStats();
    appender.init();

    String extension = "";
    if (appender instanceof DelimitedTextFile.DelimitedTextFileAppender) {
        extension = ((DelimitedTextFile.DelimitedTextFileAppender) appender).getExtension();
    }

    int tupleNum = 1000;
    VTuple vTuple;

    for (int i = 0; i < tupleNum; i++) {
        vTuple = new VTuple(3);
        vTuple.put(0, DatumFactory.createInt4(i + 1));
        vTuple.put(1, DatumFactory.createFloat4((float) i));
        vTuple.put(2, DatumFactory.createText(String.valueOf(i)));
        appender.addTuple(vTuple);
    }
    appender.close();

    TableStats stat = appender.getStats();
    assertEquals(tupleNum, stat.getNumRows().longValue());

    tablePath = tablePath.suffix(extension);
    FileStatus status = fs.getFileStatus(tablePath);
    long fileLen = status.getLen();
    FileFragment[] tablets = new FileFragment[1];
    tablets[0] = new FileFragment(fileName, tablePath, 0, fileLen);

    Scanner scanner = TablespaceManager.getLocalFs().getScanner(meta, schema, tablets[0], schema);
    scanner.init();

    if (dataFormat.equalsIgnoreCase(BuiltinStorages.SEQUENCE_FILE)) {
        assertTrue(scanner instanceof SequenceFileScanner);
        Writable key = ((SequenceFileScanner) scanner).getKey();
        assertEquals(key.getClass().getCanonicalName(), LongWritable.class.getCanonicalName());
    }

    int tupleCnt = 0;
    while ((scanner.next()) != null) {
        tupleCnt++;
    }
    scanner.close();

    assertEquals(tupleNum, tupleCnt);
    assertNotSame(appender.getStats().getNumBytes().longValue(),
        scanner.getInputStats().getNumBytes().longValue());
    assertEquals(appender.getStats().getNumRows().longValue(),
        scanner.getInputStats().getNumRows().longValue());
}
Example #13
Source File: NiFiOrcUtils.java From nifi with Apache License 2.0
public static Writer createWriter(
        Path path,
        Configuration conf,
        TypeInfo orcSchema,
        long stripeSize,
        CompressionKind compress,
        int bufferSize) throws IOException {

    int rowIndexStride = (int) OrcConf.ROW_INDEX_STRIDE.getLong(conf);

    boolean addBlockPadding = OrcConf.BLOCK_PADDING.getBoolean(conf);

    String versionName = OrcConf.WRITE_FORMAT.getString(conf);
    OrcFile.Version versionValue = (versionName == null)
        ? OrcFile.Version.CURRENT
        : OrcFile.Version.byName(versionName);

    OrcFile.EncodingStrategy encodingStrategy;
    String enString = OrcConf.ENCODING_STRATEGY.getString(conf);
    if (enString == null) {
        encodingStrategy = OrcFile.EncodingStrategy.SPEED;
    } else {
        encodingStrategy = OrcFile.EncodingStrategy.valueOf(enString);
    }

    final double paddingTolerance = OrcConf.BLOCK_PADDING_TOLERANCE.getDouble(conf);

    long blockSizeValue = OrcConf.BLOCK_SIZE.getLong(conf);

    double bloomFilterFpp = OrcConf.BLOOM_FILTER_FPP.getDouble(conf);

    ObjectInspector inspector = OrcStruct.createObjectInspector(orcSchema);

    OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(conf)
        .rowIndexStride(rowIndexStride)
        .blockPadding(addBlockPadding)
        .version(versionValue)
        .encodingStrategy(encodingStrategy)
        .paddingTolerance(paddingTolerance)
        .blockSize(blockSizeValue)
        .bloomFilterFpp(bloomFilterFpp)
        .memory(getMemoryManager(conf))
        .inspector(inspector)
        .stripeSize(stripeSize)
        .bufferSize(bufferSize)
        .compress(compress);

    return OrcFile.createWriter(path, writerOptions);
}
Example #14
Source File: OrcKeyComparatorTest.java From incubator-gobblin with Apache License 2.0
@Test
public void testComplexRecordUnion() throws Exception {
    OrcKeyComparator comparator = new OrcKeyComparator();
    Configuration conf = new Configuration();

    TypeDescription listSchema = TypeDescription.createList(TypeDescription.createString());
    TypeDescription nestedRecordSchema = TypeDescription.createStruct()
        .addField("x", TypeDescription.createInt())
        .addField("y", TypeDescription.createInt());
    TypeDescription unionSchema = TypeDescription.createUnion()
        .addUnionChild(TypeDescription.createInt())
        .addUnionChild(listSchema)
        .addUnionChild(nestedRecordSchema);
    TypeDescription schema = TypeDescription.createStruct()
        .addField("a", TypeDescription.createInt())
        .addField("b", unionSchema);

    conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), schema.toString());
    Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), schema.toString());
    comparator.setConf(conf);

    // base record
    OrcStruct record0 = (OrcStruct) OrcStruct.createValue(schema);
    record0.setFieldValue("a", new IntWritable(1));
    OrcStruct nestedRecord0 = createSimpleOrcStruct(nestedRecordSchema, 1, 2);
    OrcUnion orcUnion0 = createOrcUnion(unionSchema, nestedRecord0);
    record0.setFieldValue("b", orcUnion0);

    // same content as base record in diff objects.
    OrcStruct record1 = (OrcStruct) OrcStruct.createValue(schema);
    record1.setFieldValue("a", new IntWritable(1));
    OrcStruct nestedRecord1 = createSimpleOrcStruct(nestedRecordSchema, 1, 2);
    OrcUnion orcUnion1 = createOrcUnion(unionSchema, nestedRecord1);
    record1.setFieldValue("b", orcUnion1);

    // diff records inside union, record0 == record1 < 2
    OrcStruct record2 = (OrcStruct) OrcStruct.createValue(schema);
    record2.setFieldValue("a", new IntWritable(1));
    OrcStruct nestedRecord2 = createSimpleOrcStruct(nestedRecordSchema, 2, 2);
    OrcUnion orcUnion2 = createOrcUnion(unionSchema, nestedRecord2);
    record2.setFieldValue("b", orcUnion2);

    // differ in list inside union, record3 < record4 == record5
    OrcStruct record3 = (OrcStruct) OrcStruct.createValue(schema);
    record3.setFieldValue("a", new IntWritable(1));
    OrcList orcList3 = createOrcList(5, listSchema, 2);
    OrcUnion orcUnion3 = createOrcUnion(unionSchema, orcList3);
    record3.setFieldValue("b", orcUnion3);

    OrcStruct record4 = (OrcStruct) OrcStruct.createValue(schema);
    record4.setFieldValue("a", new IntWritable(1));
    OrcList orcList4 = createOrcList(6, listSchema, 2);
    OrcUnion orcUnion4 = createOrcUnion(unionSchema, orcList4);
    record4.setFieldValue("b", orcUnion4);

    OrcStruct record5 = (OrcStruct) OrcStruct.createValue(schema);
    record5.setFieldValue("a", new IntWritable(1));
    OrcList orcList5 = createOrcList(6, listSchema, 2);
    OrcUnion orcUnion5 = createOrcUnion(unionSchema, orcList5);
    record5.setFieldValue("b", orcUnion5);

    OrcKey orcKey0 = new OrcKey();
    orcKey0.key = record0;
    OrcKey orcKey1 = new OrcKey();
    orcKey1.key = record1;
    OrcKey orcKey2 = new OrcKey();
    orcKey2.key = record2;
    OrcKey orcKey3 = new OrcKey();
    orcKey3.key = record3;
    OrcKey orcKey4 = new OrcKey();
    orcKey4.key = record4;
    OrcKey orcKey5 = new OrcKey();
    orcKey5.key = record5;

    Assert.assertEquals(orcUnion0, orcUnion1);
    // Int value in orcKey2 is larger
    Assert.assertTrue(comparator.compare(orcKey0, orcKey2) < 0);
    Assert.assertTrue(comparator.compare(orcKey3, orcKey4) < 0);
    Assert.assertTrue(comparator.compare(orcKey3, orcKey5) < 0);
    Assert.assertTrue(comparator.compare(orcKey4, orcKey5) == 0);
}
Example #15
Source File: OrcKeyComparatorTest.java From incubator-gobblin with Apache License 2.0
@Test
public void testComplexRecordMap() throws Exception {
    OrcKeyComparator comparator = new OrcKeyComparator();
    Configuration conf = new Configuration();
    TypeDescription mapFieldSchema =
        TypeDescription.createMap(TypeDescription.createString(), TypeDescription.createString());
    TypeDescription schema = TypeDescription.createStruct()
        .addField("a", TypeDescription.createInt())
        .addField("b", mapFieldSchema);

    conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), schema.toString());
    Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), schema.toString());
    comparator.setConf(conf);

    // base record
    OrcStruct record0 = (OrcStruct) OrcStruct.createValue(schema);
    record0.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap = createSimpleOrcMap(new Text("key"), new Text("value"), mapFieldSchema);
    record0.setFieldValue("b", orcMap);

    // key value both differ
    OrcStruct record1 = (OrcStruct) OrcStruct.createValue(schema);
    record1.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap1 = createSimpleOrcMap(new Text("key_key"), new Text("value_value"), mapFieldSchema);
    record1.setFieldValue("b", orcMap1);

    // Key same, value differ
    OrcStruct record2 = (OrcStruct) OrcStruct.createValue(schema);
    record2.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap2 = createSimpleOrcMap(new Text("key"), new Text("value_value"), mapFieldSchema);
    record2.setFieldValue("b", orcMap2);

    // Same as base
    OrcStruct record3 = (OrcStruct) OrcStruct.createValue(schema);
    record3.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap3 = createSimpleOrcMap(new Text("key"), new Text("value"), mapFieldSchema);
    record3.setFieldValue("b", orcMap3);

    // Differ in other field.
    OrcStruct record4 = (OrcStruct) OrcStruct.createValue(schema);
    record4.setFieldValue("a", new IntWritable(2));
    record4.setFieldValue("b", orcMap);

    // Record with map containing multiple entries but inserted in different order.
    OrcStruct record6 = (OrcStruct) OrcStruct.createValue(schema);
    record6.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap6 = createSimpleOrcMap(new Text("key"), new Text("value"), mapFieldSchema);
    orcMap6.put(new Text("keyLater"), new Text("valueLater"));
    record6.setFieldValue("b", orcMap6);

    OrcStruct record7 = (OrcStruct) OrcStruct.createValue(schema);
    record7.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap7 = createSimpleOrcMap(new Text("keyLater"), new Text("valueLater"), mapFieldSchema);
    orcMap7.put(new Text("key"), new Text("value"));
    record7.setFieldValue("b", orcMap7);

    OrcKey orcKey0 = new OrcKey();
    orcKey0.key = record0;
    OrcKey orcKey1 = new OrcKey();
    orcKey1.key = record1;
    OrcKey orcKey2 = new OrcKey();
    orcKey2.key = record2;
    OrcKey orcKey3 = new OrcKey();
    orcKey3.key = record3;
    OrcKey orcKey4 = new OrcKey();
    orcKey4.key = record4;
    OrcKey orcKey6 = new OrcKey();
    orcKey6.key = record6;
    OrcKey orcKey7 = new OrcKey();
    orcKey7.key = record7;

    Assert.assertTrue(comparator.compare(orcKey0, orcKey1) < 0);
    Assert.assertTrue(comparator.compare(orcKey1, orcKey2) > 0);
    Assert.assertTrue(comparator.compare(orcKey2, orcKey3) > 0);
    Assert.assertTrue(comparator.compare(orcKey0, orcKey3) == 0);
    Assert.assertTrue(comparator.compare(orcKey0, orcKey4) < 0);
    Assert.assertTrue(comparator.compare(orcKey6, orcKey7) == 0);
}
Example #16
Source File: OrcKeyComparatorTest.java From incubator-gobblin with Apache License 2.0
@Test
public void testComplexRecordArray() throws Exception {
    OrcKeyComparator comparator = new OrcKeyComparator();
    Configuration conf = new Configuration();
    TypeDescription listSchema = TypeDescription.createList(TypeDescription.createString());
    TypeDescription schema = TypeDescription.createStruct()
        .addField("a", TypeDescription.createInt())
        .addField("b", listSchema);

    conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), schema.toString());
    Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), schema.toString());
    comparator.setConf(conf);

    // base record
    OrcStruct record0 = (OrcStruct) OrcStruct.createValue(schema);
    record0.setFieldValue("a", new IntWritable(1));
    OrcList orcList0 = createOrcList(3, listSchema, 3);
    record0.setFieldValue("b", orcList0);

    // the same as base but different object, expecting equal to each other.
    OrcStruct record1 = (OrcStruct) OrcStruct.createValue(schema);
    record1.setFieldValue("a", new IntWritable(1));
    OrcList orcList1 = createOrcList(3, listSchema, 3);
    record1.setFieldValue("b", orcList1);

    // Diff in int field
    OrcStruct record2 = (OrcStruct) OrcStruct.createValue(schema);
    record2.setFieldValue("a", new IntWritable(2));
    OrcList orcList2 = createOrcList(3, listSchema, 3);
    record2.setFieldValue("b", orcList2);

    // Diff in array field: 1
    OrcStruct record3 = (OrcStruct) OrcStruct.createValue(schema);
    record3.setFieldValue("a", new IntWritable(1));
    OrcList orcList3 = createOrcList(3, listSchema, 5);
    record3.setFieldValue("b", orcList3);

    // Diff in array field: 2
    OrcStruct record4 = (OrcStruct) OrcStruct.createValue(schema);
    record4.setFieldValue("a", new IntWritable(1));
    OrcList orcList4 = createOrcList(4, listSchema, 3);
    record4.setFieldValue("b", orcList4);

    OrcKey orcKey0 = new OrcKey();
    orcKey0.key = record0;
    OrcKey orcKey1 = new OrcKey();
    orcKey1.key = record1;
    OrcKey orcKey2 = new OrcKey();
    orcKey2.key = record2;
    OrcKey orcKey3 = new OrcKey();
    orcKey3.key = record3;
    OrcKey orcKey4 = new OrcKey();
    orcKey4.key = record4;

    Assert.assertTrue(comparator.compare(orcKey0, orcKey1) == 0);
    Assert.assertTrue(comparator.compare(orcKey1, orcKey2) < 0);
    Assert.assertTrue(comparator.compare(orcKey1, orcKey3) < 0);
    Assert.assertTrue(comparator.compare(orcKey1, orcKey4) < 0);
}
Example #17
Source File: OrcShimV200.java From flink with Apache License 2.0
protected Reader.Options readOrcConf(Reader.Options options, Configuration conf) {
    return options.useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
        .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf));
}
Example #18
Source File: OrcShimV230.java From flink with Apache License 2.0
@Override
protected Reader.Options readOrcConf(Reader.Options options, Configuration conf) {
    return super.readOrcConf(options, conf)
        .tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));
}
Example #19
Source File: OrcRowInputFormat.java From Flink-CEPplus with Apache License 2.0
@Override
public void open(FileInputSplit fileSplit) throws IOException {
    LOG.debug("Opening ORC file {}", fileSplit.getPath());

    // open ORC file and create reader
    org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(fileSplit.getPath().getPath());
    Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));

    // get offset and length for the stripes that start in the split
    Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(fileSplit, getStripes(orcReader));

    // create ORC row reader configuration
    Reader.Options options = getOptions(orcReader)
        .schema(schema)
        .range(offsetAndLength.f0, offsetAndLength.f1)
        .useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
        .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
        .tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

    // configure filters
    if (!conjunctPredicates.isEmpty()) {
        SearchArgument.Builder b = SearchArgumentFactory.newBuilder();
        b = b.startAnd();
        for (Predicate predicate : conjunctPredicates) {
            predicate.add(b);
        }
        b = b.end();
        options.searchArgument(b.build(), new String[]{});
    }

    // configure selected fields
    options.include(computeProjectionMask());

    // create ORC row reader
    this.orcRowsReader = orcReader.rows(options);

    // assign ids
    this.schema.getId();

    // create row batch
    this.rowBatch = schema.createRowBatch(batchSize);
    rowsInBatch = 0;
    nextRow = 0;
}
Example #20
Source File: HiveConfFactory.java From dremio-oss with Apache License 2.0
/**
 * Fills in a HiveConf instance with any user provided configuration parameters
 *
 * @param hiveConf - the conf to fill in
 * @param config - the user provided parameters
 * @return
 */
protected static void addUserProperties(HiveConf hiveConf, BaseHiveStoragePluginConfig<?,?> config) {
    // Used to capture properties set by user
    final Set<String> userPropertyNames = new HashSet<>();
    if (config.propertyList != null) {
        for (Property prop : config.propertyList) {
            userPropertyNames.add(prop.name);
            setConf(hiveConf, prop.name, prop.value);
            if (logger.isTraceEnabled()) {
                logger.trace("HiveConfig Override {}={}", prop.name, prop.value);
            }
        }
    }

    // Check if zero-copy has been set by user
    boolean zeroCopySetByUser = userPropertyNames.contains(OrcConf.USE_ZEROCOPY.getAttribute());
    // Configure zero-copy for ORC reader
    if (!zeroCopySetByUser) {
        if (VM.isWindowsHost() || VM.isMacOSHost()) {
            logger.debug("MacOS or Windows host detected. Not automatically enabling ORC zero-copy feature");
        } else {
            String fs = hiveConf.get(FileSystem.FS_DEFAULT_NAME_KEY);
            // Equivalent to a case-insensitive startsWith...
            if (fs.regionMatches(true, 0, "maprfs", 0, 6)) {
                // DX-12672: do not enable ORC zero-copy on MapRFS
                logger.debug("MapRFS detected. Not automatically enabling ORC zero-copy feature");
            } else {
                logger.debug("Linux host detected. Enabling ORC zero-copy feature");
                hiveConf.set(OrcConf.USE_ZEROCOPY.getAttribute(), "true");
            }
        }
    } else {
        boolean useZeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(hiveConf);
        if (useZeroCopy) {
            logger.warn("ORC zero-copy feature has been manually enabled. This is not recommended.");
        } else {
            logger.error("ORC zero-copy feature has been manually disabled. This is not recommended and might cause memory issues");
        }
    }

    // Check if fs.s3.impl has been set by user
    boolean fsS3ImplSetByUser = userPropertyNames.contains(FS_S3_IMPL);
    if (fsS3ImplSetByUser) {
        logger.warn(FS_S3_IMPL + " manually set. This is not recommended.");
    } else {
        logger.debug("Setting " + FS_S3_IMPL + " to " + FS_S3_IMPL_DEFAULT);
        setConf(hiveConf, FS_S3_IMPL, FS_S3_IMPL_DEFAULT);
    }

    ADL_PROPS.entrySet().asList().forEach(entry -> setConf(hiveConf, entry.getKey(), entry.getValue()));
    WASB_PROPS.entrySet().asList().forEach(entry -> setConf(hiveConf, entry.getKey(), entry.getValue()));
    ABFS_PROPS.entrySet().asList().forEach(entry -> setConf(hiveConf, entry.getKey(), entry.getValue()));
}
Example #21
Source File: ScanWithHiveReader.java From dremio-oss with Apache License 2.0
private static Class<? extends HiveAbstractReader> getNativeReaderClass(Optional<String> formatName,
    OptionManager options, Configuration configuration, boolean mixedSchema, boolean isTransactional) {
    if (!formatName.isPresent()) {
        return HiveDefaultReader.class;
    }

    Class<? extends HiveAbstractReader> readerClass = readerMap.get(formatName.get());
    if (readerClass == HiveOrcReader.class) {
        // Validate reader
        if (OrcConf.USE_ZEROCOPY.getBoolean(configuration)) {
            if (!NativeCodeLoader.isNativeCodeLoaded()) {
                throw UserException.dataReadError()
                    .message("Hadoop native library is required for Hive ORC data, but is not loaded").build(logger);
            }
            // TODO: find a way to access compression codec information?
            if (!SnappyDecompressor.isNativeCodeLoaded()) {
                throw UserException.dataReadError()
                    .message("Snappy native library is required for Hive ORC data, but is not loaded").build(logger);
            }
            if (!isNativeZlibLoaded) {
                throw UserException
                    .dataReadError()
                    .message("Zlib native library is required for Hive ORC data, but is not loaded")
                    .build(logger);
            }
        }

        if (new HiveSettings(options).vectorizeOrcReaders() && !mixedSchema && !isTransactional) {
            // We don't use vectorized ORC reader if there is a schema change between table and partitions
            // or the table is a transactional Hive table
            return HiveORCVectorizedReader.class;
        }
    }

    if (readerClass == null) {
        return HiveDefaultReader.class;
    }

    return readerClass;
}
Example #22
Source File: HiveConfFactory.java From dremio-oss with Apache License 2.0
/**
 * Fills in a HiveConf instance with any user provided configuration parameters
 *
 * @param hiveConf - the conf to fill in
 * @param config - the user provided parameters
 * @return
 */
protected static void addUserProperties(HiveConf hiveConf, BaseHiveStoragePluginConfig<?,?> config) {
    // Used to capture properties set by user
    final Set<String> userPropertyNames = new HashSet<>();
    if (config.propertyList != null) {
        for (Property prop : config.propertyList) {
            userPropertyNames.add(prop.name);
            setConf(hiveConf, prop.name, prop.value);
            if (logger.isTraceEnabled()) {
                logger.trace("HiveConfig Override {}={}", prop.name, prop.value);
            }
        }
    }

    // Check if zero-copy has been set by user
    boolean zeroCopySetByUser = userPropertyNames.contains(OrcConf.USE_ZEROCOPY.getAttribute())
        || userPropertyNames.contains(HiveConf.ConfVars.HIVE_ORC_ZEROCOPY.varname);
    // Configure zero-copy for ORC reader
    if (!zeroCopySetByUser) {
        if (VM.isWindowsHost() || VM.isMacOSHost()) {
            logger.debug("MacOS or Windows host detected. Not automatically enabling ORC zero-copy feature");
        } else {
            String fs = hiveConf.get(FileSystem.FS_DEFAULT_NAME_KEY);
            // Equivalent to a case-insensitive startsWith...
            if (fs.regionMatches(true, 0, "maprfs", 0, 6)) {
                // DX-12672: do not enable ORC zero-copy on MapRFS
                logger.debug("MapRFS detected. Not automatically enabling ORC zero-copy feature");
            } else {
                logger.debug("Linux host detected. Enabling ORC zero-copy feature");
                setConf(hiveConf, HiveConf.ConfVars.HIVE_ORC_ZEROCOPY, true);
            }
        }
    } else {
        boolean useZeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(hiveConf);
        if (useZeroCopy) {
            logger.warn("ORC zero-copy feature has been manually enabled. This is not recommended.");
        } else {
            logger.error("ORC zero-copy feature has been manually disabled. This is not recommended and might cause memory issues");
        }
    }

    // Check if ORC Footer cache has been configured by user
    boolean orcStripCacheSetByUser = userPropertyNames.contains(HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_SIZE.varname);
    if (orcStripCacheSetByUser) {
        logger.error("ORC stripe details cache has been manually configured. This is not recommended and might cause memory issues");
    } else {
        logger.debug("Disabling ORC stripe details cache.");
        setConf(hiveConf, HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_SIZE, 0);
    }

    // Check if fs.s3.impl has been set by user
    boolean fsS3ImplSetByUser = userPropertyNames.contains(FS_S3_IMPL);
    if (fsS3ImplSetByUser) {
        logger.warn(FS_S3_IMPL + " manually set. This is not recommended.");
    } else {
        logger.debug("Setting " + FS_S3_IMPL + " to " + FS_S3_IMPL_DEFAULT);
        setConf(hiveConf, FS_S3_IMPL, FS_S3_IMPL_DEFAULT);
    }

    ADL_PROPS.entrySet().asList().forEach(entry -> setConf(hiveConf, entry.getKey(), entry.getValue()));
    WASB_PROPS.entrySet().asList().forEach(entry -> setConf(hiveConf, entry.getKey(), entry.getValue()));
    ABFS_PROPS.entrySet().asList().forEach(entry -> setConf(hiveConf, entry.getKey(), entry.getValue()));
}
Example #23
Source File: ORC.java From iceberg with Apache License 2.0
public ReadBuilder caseSensitive(boolean newCaseSensitive) {
    OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.setBoolean(this.conf, newCaseSensitive);
    this.caseSensitive = newCaseSensitive;
    return this;
}
Example #24
Source File: ORC.java From iceberg with Apache License 2.0
public WriteBuilder overwrite(boolean enabled) {
    OrcConf.OVERWRITE_OUTPUT_FILE.setBoolean(conf, enabled);
    return this;
}
Example #25
Source File: OrcRowInputFormat.java From flink with Apache License 2.0
@Override
public void open(FileInputSplit fileSplit) throws IOException {
    LOG.debug("Opening ORC file {}", fileSplit.getPath());

    // open ORC file and create reader
    org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(fileSplit.getPath().getPath());
    Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));

    // get offset and length for the stripes that start in the split
    Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(fileSplit, getStripes(orcReader));

    // create ORC row reader configuration
    Reader.Options options = getOptions(orcReader)
        .schema(schema)
        .range(offsetAndLength.f0, offsetAndLength.f1)
        .useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
        .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
        .tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

    // configure filters
    if (!conjunctPredicates.isEmpty()) {
        SearchArgument.Builder b = SearchArgumentFactory.newBuilder();
        b = b.startAnd();
        for (Predicate predicate : conjunctPredicates) {
            predicate.add(b);
        }
        b = b.end();
        options.searchArgument(b.build(), new String[]{});
    }

    // configure selected fields
    options.include(computeProjectionMask());

    // create ORC row reader
    this.orcRowsReader = orcReader.rows(options);

    // assign ids
    this.schema.getId();

    // create row batch
    this.rowBatch = schema.createRowBatch(batchSize);
    rowsInBatch = 0;
    nextRow = 0;
}