org.apache.orc.OrcConf Java Examples

The following examples show how to use org.apache.orc.OrcConf. Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
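Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; class and variable names are for illustration only) of the two access patterns that recur throughout them: typed getters and setters that read and write a Hadoop Configuration directly, and getAttribute()/getDefaultValue() for cases where the raw property key must be passed along as a plain string, for example into a Properties map as in Example #1.

import org.apache.hadoop.conf.Configuration;
import org.apache.orc.OrcConf;

public class OrcConfBasics {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Typed setters store the value under the ORC attribute key,
        // e.g. OrcConf.COMPRESS writes "orc.compress".
        OrcConf.COMPRESS.setString(conf, "ZLIB");
        OrcConf.STRIPE_SIZE.setLong(conf, 64L * 1024 * 1024);

        // getAttribute() exposes the raw key for use with plain string maps;
        // getDefaultValue() returns the library's built-in default.
        System.out.println(OrcConf.COMPRESS.getAttribute());
        System.out.println(OrcConf.COMPRESS.getDefaultValue());

        // Typed getters fall back to the default when the key is unset.
        boolean zeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(conf);
        long stripeSize = OrcConf.STRIPE_SIZE.getLong(conf);
        System.out.println(zeroCopy + " " + stripeSize);
    }
}

The examples below show the same calls in context: writer configuration (Presto, Tajo, NiFi), MapReduce shuffle schemas (Gobblin), and reader options (Flink, Dremio, Iceberg).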
Example #1
Source File: TestCachingOrcDataSource.java    From presto with Apache License 2.0
private static FileSinkOperator.RecordWriter createOrcRecordWriter(File outputFile, Format format, CompressionKind compression, ObjectInspector columnObjectInspector)
        throws IOException
{
    JobConf jobConf = new JobConf();
    OrcConf.WRITE_FORMAT.setString(jobConf, format == ORC_12 ? "0.12" : "0.11");
    OrcConf.COMPRESS.setString(jobConf, compression.name());

    Properties tableProperties = new Properties();
    tableProperties.setProperty(IOConstants.COLUMNS, "test");
    tableProperties.setProperty(IOConstants.COLUMNS_TYPES, columnObjectInspector.getTypeName());
    tableProperties.setProperty(OrcConf.STRIPE_SIZE.getAttribute(), "120000");

    return new OrcOutputFormat().getHiveRecordWriter(
            jobConf,
            new Path(outputFile.toURI()),
            Text.class,
            compression != NONE,
            tableProperties,
            () -> {});
}
 
Example #2
Source File: OrcKeyComparatorTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testSimpleComparator() throws Exception {
  OrcKeyComparator comparator = new OrcKeyComparator();
  Configuration conf = new Configuration();
  String orcSchema = "struct<i:int,j:int>";
  TypeDescription schema = TypeDescription.fromString(orcSchema);
  conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), orcSchema);
  Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), orcSchema);
  comparator.setConf(conf);

  OrcStruct record0 = createSimpleOrcStruct(schema, 1, 2);
  OrcStruct record1 = createSimpleOrcStruct(schema, 3, 4);
  OrcStruct record2 = createSimpleOrcStruct(schema, 3, 4);

  OrcKey orcKey0 = new OrcKey();
  orcKey0.key = record0;
  OrcKey orcKey1 = new OrcKey();
  orcKey1.key = record1;
  OrcKey orcKey2 = new OrcKey();
  orcKey2.key = record2;

  Assert.assertTrue(comparator.compare(orcKey0, orcKey1) < 0);
  Assert.assertTrue(comparator.compare(orcKey1, orcKey2) == 0);
  Assert.assertTrue(comparator.compare(orcKey1, orcKey0) > 0);
}
 
Example #3
Source File: ORCAppender.java    From tajo with Apache License 2.0
private static CompressionKind getCompressionKind(TableMeta meta) {
  String kindstr = meta.getProperty(OrcConf.COMPRESS.getAttribute(),
      String.valueOf(OrcConf.COMPRESS.getDefaultValue()));

  if (kindstr.equalsIgnoreCase(CompressionKind.ZLIB.name())) {
    return CompressionKind.ZLIB;
  }

  if (kindstr.equalsIgnoreCase(CompressionKind.SNAPPY.name())) {
    return CompressionKind.SNAPPY;
  }

  if (kindstr.equalsIgnoreCase(CompressionKind.LZO.name())) {
    return CompressionKind.LZO;
  }

  return CompressionKind.NONE;
}
 
Example #4
Source File: ORCAppender.java    From tajo with Apache License 2.0
private static OrcFile.WriterOptions buildWriterOptions(Configuration conf, TableMeta meta, Schema schema) {
  return OrcFile.writerOptions(conf)
      .setSchema(OrcUtils.convertSchema(schema))
      .compress(getCompressionKind(meta))
      .stripeSize(Long.parseLong(meta.getProperty(OrcConf.STRIPE_SIZE.getAttribute(),
          String.valueOf(OrcConf.STRIPE_SIZE.getDefaultValue()))))
      .blockSize(Long.parseLong(meta.getProperty(OrcConf.BLOCK_SIZE.getAttribute(),
          String.valueOf(OrcConf.BLOCK_SIZE.getDefaultValue()))))
      .rowIndexStride(Integer.parseInt(meta.getProperty(OrcConf.ROW_INDEX_STRIDE.getAttribute(),
          String.valueOf(OrcConf.ROW_INDEX_STRIDE.getDefaultValue()))))
      .bufferSize(Integer.parseInt(meta.getProperty(OrcConf.BUFFER_SIZE.getAttribute(),
          String.valueOf(OrcConf.BUFFER_SIZE.getDefaultValue()))))
      .blockPadding(Boolean.parseBoolean(meta.getProperty(OrcConf.BLOCK_PADDING.getAttribute(),
          String.valueOf(OrcConf.BLOCK_PADDING.getDefaultValue()))))
      .encodingStrategy(EncodingStrategy.valueOf(meta.getProperty(OrcConf.ENCODING_STRATEGY.getAttribute(),
          String.valueOf(OrcConf.ENCODING_STRATEGY.getDefaultValue()))))
      .bloomFilterFpp(Double.parseDouble(meta.getProperty(OrcConf.BLOOM_FILTER_FPP.getAttribute(),
          String.valueOf(OrcConf.BLOOM_FILTER_FPP.getDefaultValue()))))
      .bloomFilterColumns(meta.getProperty(OrcConf.BLOOM_FILTER_COLUMNS.getAttribute(),
          String.valueOf(OrcConf.BLOOM_FILTER_COLUMNS.getDefaultValue())));
}
 
Example #5
Source File: CompressionConfigUtil.java    From presto with Apache License 2.0
public static void configureCompression(Configuration config, HiveCompressionCodec compressionCodec)
{
    boolean compression = compressionCodec != HiveCompressionCodec.NONE;
    config.setBoolean(COMPRESSRESULT.varname, compression);
    config.setBoolean("mapred.output.compress", compression);
    config.setBoolean(FileOutputFormat.COMPRESS, compression);

    // For ORC
    OrcConf.COMPRESS.setString(config, compressionCodec.getOrcCompressionKind().name());

    // For RCFile and Text
    if (compressionCodec.getCodec().isPresent()) {
        config.set("mapred.output.compression.codec", compressionCodec.getCodec().get().getName());
        config.set(FileOutputFormat.COMPRESS_CODEC, compressionCodec.getCodec().get().getName());
    }
    else {
        config.unset("mapred.output.compression.codec");
        config.unset(FileOutputFormat.COMPRESS_CODEC);
    }

    // For Parquet
    config.set(ParquetOutputFormat.COMPRESSION, compressionCodec.getParquetCompressionCodec().name());

    // For SequenceFile
    config.set(FileOutputFormat.COMPRESS_TYPE, BLOCK.toString());
}
 
Example #6
Source File: OrcFileWriterFactory.java    From presto with Apache License 2.0
private static CompressionKind getCompression(Properties schema, JobConf configuration)
{
    String compressionName = OrcConf.COMPRESS.getString(schema, configuration);
    if (compressionName == null) {
        return CompressionKind.ZLIB;
    }

    CompressionKind compression;
    try {
        compression = CompressionKind.valueOf(compressionName.toUpperCase(ENGLISH));
    }
    catch (IllegalArgumentException e) {
        throw new PrestoException(HIVE_UNSUPPORTED_FORMAT, "Unknown ORC compression type " + compressionName);
    }
    return compression;
}
 
Example #7
Source File: TestOrcWrite.java    From iceberg with Apache License 2.0
@Test
public void testBasicWrite() throws IOException {
  File parent = temp.newFolder("orc");
  File location = new File(parent, "test");
  location.mkdirs();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build();
  Table table = tables.create(SCHEMA, spec, location.toString());
  table.updateProperties()
      .defaultFormat(FileFormat.ORC)
      .set(OrcConf.COMPRESS.getAttribute(), CompressionKind.NONE.name())
      .commit();

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  // TODO: incoming columns must be ordered according to the table's schema
  df.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(
      Encoders.bean(SimpleRecord.class)).collectAsList();

  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
 
Example #8
Source File: OrcTester.java    From presto with Apache License 2.0
static RecordWriter createOrcRecordWriter(File outputFile, Format format, CompressionKind compression, Type type)
        throws IOException
{
    JobConf jobConf = new JobConf();
    OrcConf.WRITE_FORMAT.setString(jobConf, format == ORC_12 ? "0.12" : "0.11");
    OrcConf.COMPRESS.setString(jobConf, compression.name());

    return new OrcOutputFormat().getHiveRecordWriter(
            jobConf,
            new Path(outputFile.toURI()),
            Text.class,
            compression != NONE,
            createTableProperties("test", getJavaObjectInspector(type).getTypeName()),
            () -> {});
}
 
Example #9
Source File: OrcValueMapper.java    From incubator-gobblin with Apache License 2.0
@Override
protected void setup(Context context)
    throws IOException, InterruptedException {
  super.setup(context);
  this.jobConf = new JobConf(context.getConfiguration());
  this.outKey = new OrcKey();
  this.outKey.configure(jobConf);
  this.outValue = new OrcValue();
  this.outValue.configure(jobConf);
  this.mrOutputSchema =
      TypeDescription.fromString(context.getConfiguration().get(OrcConf.MAPRED_INPUT_SCHEMA.getAttribute()));
  this.shuffleKeySchema =
      TypeDescription.fromString(context.getConfiguration().get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()));
}
 
Example #10
Source File: CompactionOrcJobConfigurator.java    From incubator-gobblin with Apache License 2.0
protected void configureSchema(Job job) throws IOException {
  TypeDescription schema = OrcUtils.getNewestSchemaFromSource(job, this.fs);

  job.getConfiguration().set(OrcConf.MAPRED_INPUT_SCHEMA.getAttribute(), schema.toString());
  job.getConfiguration().set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(),
      orcMapperShuffleSchemaString.isEmpty() ? schema.toString() : orcMapperShuffleSchemaString);
  job.getConfiguration().set(OrcConf.MAPRED_SHUFFLE_VALUE_SCHEMA.getAttribute(), schema.toString());
  job.getConfiguration().set(OrcConf.MAPRED_OUTPUT_SCHEMA.getAttribute(), schema.toString());
}
 
Example #11
Source File: OrcNoHiveShim.java    From flink with Apache License 2.0
@Override
public RecordReader createRecordReader(
		Configuration conf,
		TypeDescription schema,
		int[] selectedFields,
		List<OrcSplitReader.Predicate> conjunctPredicates,
		org.apache.flink.core.fs.Path path,
		long splitStart,
		long splitLength) throws IOException {
	// open ORC file and create reader
	org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(path.toUri());

	Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));

	// get offset and length for the stripes that start in the split
	Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(
			splitStart, splitLength, orcReader.getStripes());

	// create ORC row reader configuration
	Reader.Options options = new Reader.Options()
			.schema(schema)
			.range(offsetAndLength.f0, offsetAndLength.f1)
			.useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
			.skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
			.tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

	// TODO configure filters

	// configure selected fields
	options.include(computeProjectionMask(schema, selectedFields));

	// create ORC row reader
	RecordReader orcRowsReader = orcReader.rows(options);

	// assign ids
	schema.getId();

	return orcRowsReader;
}
 
Example #12
Source File: TestCompressionStorages.java    From tajo with Apache License 2.0
private void storageCompressionTest(String dataFormat, Class<? extends CompressionCodec> codec) throws IOException {
  Schema schema = SchemaBuilder.builder()
      .add("id", Type.INT4)
      .add("age", Type.FLOAT4)
      .add("name", Type.TEXT)
      .build();

  TableMeta meta = CatalogUtil.newTableMeta(dataFormat, conf);
  meta.putProperty("compression.codec", codec.getCanonicalName());
  meta.putProperty("compression.type", SequenceFile.CompressionType.BLOCK.name());
  meta.putProperty("rcfile.serde", TextSerializerDeserializer.class.getName());
  meta.putProperty("sequencefile.serde", TextSerializerDeserializer.class.getName());

  if (codec.equals(SnappyCodec.class)) {
    meta.putProperty(OrcConf.COMPRESS.getAttribute(), "SNAPPY");
  } else if (codec.equals(Lz4Codec.class)) {
    meta.putProperty(OrcConf.COMPRESS.getAttribute(), "ZLIB");
  } else {
    meta.putProperty(OrcConf.COMPRESS.getAttribute(), "NONE");
  }

  String fileName = "Compression_" + codec.getSimpleName();
  Path tablePath = new Path(testDir, fileName);
  Appender appender = ((FileTablespace) TablespaceManager.getLocalFs()).getAppender(meta, schema, tablePath);
  appender.enableStats();

  appender.init();

  String extension = "";
  if (appender instanceof DelimitedTextFile.DelimitedTextFileAppender) {
    extension = ((DelimitedTextFile.DelimitedTextFileAppender) appender).getExtension();
  }

  int tupleNum = 1000;
  VTuple vTuple;

  for (int i = 0; i < tupleNum; i++) {
    vTuple = new VTuple(3);
    vTuple.put(0, DatumFactory.createInt4(i + 1));
    vTuple.put(1, DatumFactory.createFloat4((float) i));
    vTuple.put(2, DatumFactory.createText(String.valueOf(i)));
    appender.addTuple(vTuple);
  }
  appender.close();

  TableStats stat = appender.getStats();
  assertEquals(tupleNum, stat.getNumRows().longValue());
  tablePath = tablePath.suffix(extension);
  FileStatus status = fs.getFileStatus(tablePath);
  long fileLen = status.getLen();
  FileFragment[] tablets = new FileFragment[1];
  tablets[0] = new FileFragment(fileName, tablePath, 0, fileLen);

  Scanner scanner = TablespaceManager.getLocalFs().getScanner(meta, schema, tablets[0], schema);
  scanner.init();

  if (dataFormat.equalsIgnoreCase(BuiltinStorages.SEQUENCE_FILE)) {
    assertTrue(scanner instanceof SequenceFileScanner);
    Writable key = ((SequenceFileScanner) scanner).getKey();
    assertEquals(key.getClass().getCanonicalName(), LongWritable.class.getCanonicalName());
  }

  int tupleCnt = 0;
  while ((scanner.next()) != null) {
    tupleCnt++;
  }
  scanner.close();
  assertEquals(tupleNum, tupleCnt);
  assertNotSame(appender.getStats().getNumBytes().longValue(), scanner.getInputStats().getNumBytes().longValue());
  assertEquals(appender.getStats().getNumRows().longValue(), scanner.getInputStats().getNumRows().longValue());
}
 
Example #13
Source File: NiFiOrcUtils.java    From nifi with Apache License 2.0
public static Writer createWriter(
        Path path,
        Configuration conf,
        TypeInfo orcSchema,
        long stripeSize,
        CompressionKind compress,
        int bufferSize) throws IOException {

    int rowIndexStride = (int) OrcConf.ROW_INDEX_STRIDE.getLong(conf);

    boolean addBlockPadding = OrcConf.BLOCK_PADDING.getBoolean(conf);

    String versionName = OrcConf.WRITE_FORMAT.getString(conf);
    OrcFile.Version versionValue = (versionName == null)
            ? OrcFile.Version.CURRENT
            : OrcFile.Version.byName(versionName);

    OrcFile.EncodingStrategy encodingStrategy;
    String enString = OrcConf.ENCODING_STRATEGY.getString(conf);
    if (enString == null) {
        encodingStrategy = OrcFile.EncodingStrategy.SPEED;
    } else {
        encodingStrategy = OrcFile.EncodingStrategy.valueOf(enString);
    }

    final double paddingTolerance = OrcConf.BLOCK_PADDING_TOLERANCE.getDouble(conf);

    long blockSizeValue = OrcConf.BLOCK_SIZE.getLong(conf);

    double bloomFilterFpp = OrcConf.BLOOM_FILTER_FPP.getDouble(conf);

    ObjectInspector inspector = OrcStruct.createObjectInspector(orcSchema);

    OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(conf)
            .rowIndexStride(rowIndexStride)
            .blockPadding(addBlockPadding)
            .version(versionValue)
            .encodingStrategy(encodingStrategy)
            .paddingTolerance(paddingTolerance)
            .blockSize(blockSizeValue)
            .bloomFilterFpp(bloomFilterFpp)
            .memory(getMemoryManager(conf))
            .inspector(inspector)
            .stripeSize(stripeSize)
            .bufferSize(bufferSize)
            .compress(compress);

    return OrcFile.createWriter(path, writerOptions);
}
 
Example #14
Source File: OrcKeyComparatorTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testComplexRecordUnion() throws Exception {
  OrcKeyComparator comparator = new OrcKeyComparator();
  Configuration conf = new Configuration();

  TypeDescription listSchema = TypeDescription.createList(TypeDescription.createString());

  TypeDescription nestedRecordSchema = TypeDescription.createStruct()
      .addField("x", TypeDescription.createInt())
      .addField("y", TypeDescription.createInt());

  TypeDescription unionSchema = TypeDescription.createUnion()
      .addUnionChild(TypeDescription.createInt())
      .addUnionChild(listSchema)
      .addUnionChild(nestedRecordSchema);

  TypeDescription schema =
      TypeDescription.createStruct()
          .addField("a", TypeDescription.createInt())
          .addField("b", unionSchema);

  conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), schema.toString());
  Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), schema.toString());
  comparator.setConf(conf);

  // base record
  OrcStruct record0 = (OrcStruct) OrcStruct.createValue(schema);
  record0.setFieldValue("a", new IntWritable(1));
  OrcStruct nestedRecord0 = createSimpleOrcStruct(nestedRecordSchema, 1, 2);
  OrcUnion orcUnion0 = createOrcUnion(unionSchema, nestedRecord0);
  record0.setFieldValue("b", orcUnion0);

  // same content as base record in diff objects.
  OrcStruct record1 = (OrcStruct) OrcStruct.createValue(schema);
  record1.setFieldValue("a", new IntWritable(1));
  OrcStruct nestedRecord1 = createSimpleOrcStruct(nestedRecordSchema, 1, 2);
  OrcUnion orcUnion1 = createOrcUnion(unionSchema, nestedRecord1);
  record1.setFieldValue("b", orcUnion1);

// diff records inside union, record0 == record1 < record2
  OrcStruct record2 = (OrcStruct) OrcStruct.createValue(schema);
  record2.setFieldValue("a", new IntWritable(1));
  OrcStruct nestedRecord2 = createSimpleOrcStruct(nestedRecordSchema, 2, 2);
  OrcUnion orcUnion2 = createOrcUnion(unionSchema, nestedRecord2);
  record2.setFieldValue("b", orcUnion2);


  // differ in list inside union, record3 < record4 == record5
  OrcStruct record3 = (OrcStruct) OrcStruct.createValue(schema);
  record3.setFieldValue("a", new IntWritable(1));
  OrcList orcList3 = createOrcList(5, listSchema, 2);
  OrcUnion orcUnion3 = createOrcUnion(unionSchema, orcList3);
  record3.setFieldValue("b", orcUnion3);

  OrcStruct record4 = (OrcStruct) OrcStruct.createValue(schema);
  record4.setFieldValue("a", new IntWritable(1));
  OrcList orcList4 = createOrcList(6, listSchema, 2);
  OrcUnion orcUnion4 = createOrcUnion(unionSchema, orcList4);
  record4.setFieldValue("b", orcUnion4);

  OrcStruct record5 = (OrcStruct) OrcStruct.createValue(schema);
  record5.setFieldValue("a", new IntWritable(1));
  OrcList orcList5 = createOrcList(6, listSchema, 2);
  OrcUnion orcUnion5 = createOrcUnion(unionSchema, orcList5);
  record5.setFieldValue("b", orcUnion5);


  OrcKey orcKey0 = new OrcKey();
  orcKey0.key = record0;
  OrcKey orcKey1 = new OrcKey();
  orcKey1.key = record1;
  OrcKey orcKey2 = new OrcKey();
  orcKey2.key = record2;
  OrcKey orcKey3 = new OrcKey();
  orcKey3.key = record3;
  OrcKey orcKey4 = new OrcKey();
  orcKey4.key = record4;
  OrcKey orcKey5 = new OrcKey();
  orcKey5.key = record5;

  Assert.assertEquals(orcUnion0, orcUnion1);
  // Int value in orcKey2 is larger
  Assert.assertTrue(comparator.compare(orcKey0, orcKey2) < 0);
  Assert.assertTrue(comparator.compare(orcKey3, orcKey4) < 0 );
  Assert.assertTrue(comparator.compare(orcKey3, orcKey5) < 0);
  Assert.assertTrue(comparator.compare(orcKey4, orcKey5) == 0);
}
 
Example #15
Source File: OrcKeyComparatorTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testComplexRecordMap() throws Exception {
  OrcKeyComparator comparator = new OrcKeyComparator();
  Configuration conf = new Configuration();
  TypeDescription mapFieldSchema =
      TypeDescription.createMap(TypeDescription.createString(), TypeDescription.createString());
  TypeDescription schema =
      TypeDescription.createStruct().addField("a", TypeDescription.createInt()).addField("b", mapFieldSchema);

  conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), schema.toString());
  Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), schema.toString());
  comparator.setConf(conf);

  // base record
  OrcStruct record0 = (OrcStruct) OrcStruct.createValue(schema);
  record0.setFieldValue("a", new IntWritable(1));
  OrcMap orcMap = createSimpleOrcMap(new Text("key"), new Text("value"), mapFieldSchema);
  record0.setFieldValue("b", orcMap);

  // key value both differ
  OrcStruct record1 = (OrcStruct) OrcStruct.createValue(schema);
  record1.setFieldValue("a", new IntWritable(1));
  OrcMap orcMap1 = createSimpleOrcMap(new Text("key_key"), new Text("value_value"), mapFieldSchema);
  record1.setFieldValue("b", orcMap1);

  // Key same, value differ
  OrcStruct record2 = (OrcStruct) OrcStruct.createValue(schema);
  record2.setFieldValue("a", new IntWritable(1));
  OrcMap orcMap2 = createSimpleOrcMap(new Text("key"), new Text("value_value"), mapFieldSchema);
  record2.setFieldValue("b", orcMap2);

  // Same as base
  OrcStruct record3 = (OrcStruct) OrcStruct.createValue(schema);
  record3.setFieldValue("a", new IntWritable(1));
  OrcMap orcMap3 = createSimpleOrcMap(new Text("key"), new Text("value"), mapFieldSchema);
  record3.setFieldValue("b", orcMap3);

  // Differ in other field.
  OrcStruct record4 = (OrcStruct) OrcStruct.createValue(schema);
  record4.setFieldValue("a", new IntWritable(2));
  record4.setFieldValue("b", orcMap);

  // Record with map containing multiple entries but inserted in different order.
  OrcStruct record6 = (OrcStruct) OrcStruct.createValue(schema);
  record6.setFieldValue("a", new IntWritable(1));
  OrcMap orcMap6 = createSimpleOrcMap(new Text("key"), new Text("value"), mapFieldSchema);
  orcMap6.put(new Text("keyLater"), new Text("valueLater"));
  record6.setFieldValue("b", orcMap6);

  OrcStruct record7 = (OrcStruct) OrcStruct.createValue(schema);
  record7.setFieldValue("a", new IntWritable(1));
  OrcMap orcMap7 = createSimpleOrcMap(new Text("keyLater"), new Text("valueLater"), mapFieldSchema);
  orcMap7.put(new Text("key"), new Text("value"));
  record7.setFieldValue("b", orcMap7);

  OrcKey orcKey0 = new OrcKey();
  orcKey0.key = record0;
  OrcKey orcKey1 = new OrcKey();
  orcKey1.key = record1;
  OrcKey orcKey2 = new OrcKey();
  orcKey2.key = record2;
  OrcKey orcKey3 = new OrcKey();
  orcKey3.key = record3;
  OrcKey orcKey4 = new OrcKey();
  orcKey4.key = record4;

  OrcKey orcKey6 = new OrcKey();
  orcKey6.key = record6;
  OrcKey orcKey7 = new OrcKey();
  orcKey7.key = record7;

  Assert.assertTrue(comparator.compare(orcKey0, orcKey1) < 0);
  Assert.assertTrue(comparator.compare(orcKey1, orcKey2) > 0);
  Assert.assertTrue(comparator.compare(orcKey2, orcKey3) > 0);
  Assert.assertTrue(comparator.compare(orcKey0, orcKey3) == 0);
  Assert.assertTrue(comparator.compare(orcKey0, orcKey4) < 0);
  Assert.assertTrue(comparator.compare(orcKey6, orcKey7) == 0);
}
 
Example #16
Source File: OrcKeyComparatorTest.java    From incubator-gobblin with Apache License 2.0
@Test
public void testComplexRecordArray() throws Exception {
  OrcKeyComparator comparator = new OrcKeyComparator();
  Configuration conf = new Configuration();

  TypeDescription listSchema = TypeDescription.createList(TypeDescription.createString());
  TypeDescription schema =
      TypeDescription.createStruct().addField("a", TypeDescription.createInt()).addField("b", listSchema);

  conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), schema.toString());
  Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), schema.toString());
  comparator.setConf(conf);

  // base record
  OrcStruct record0 = (OrcStruct) OrcStruct.createValue(schema);
  record0.setFieldValue("a", new IntWritable(1));
  OrcList orcList0 = createOrcList(3, listSchema, 3);
  record0.setFieldValue("b", orcList0);

  // same content as the base record but a different object; expected to compare as equal.
  OrcStruct record1 = (OrcStruct) OrcStruct.createValue(schema);
  record1.setFieldValue("a", new IntWritable(1));
  OrcList orcList1 = createOrcList(3, listSchema, 3);
  record1.setFieldValue("b", orcList1);

  // Diff in int field
  OrcStruct record2 = (OrcStruct) OrcStruct.createValue(schema);
  record2.setFieldValue("a", new IntWritable(2));
  OrcList orcList2 = createOrcList(3, listSchema, 3);
  record2.setFieldValue("b", orcList2);

  // Diff in array field: 1
  OrcStruct record3 = (OrcStruct) OrcStruct.createValue(schema);
  record3.setFieldValue("a", new IntWritable(1));
  OrcList orcList3 = createOrcList(3, listSchema, 5);
  record3.setFieldValue("b", orcList3);

  // Diff in array field: 2
  OrcStruct record4 = (OrcStruct) OrcStruct.createValue(schema);
  record4.setFieldValue("a", new IntWritable(1));
  OrcList orcList4 = createOrcList(4, listSchema, 3);
  record4.setFieldValue("b", orcList4);

  OrcKey orcKey0 = new OrcKey();
  orcKey0.key = record0;
  OrcKey orcKey1 = new OrcKey();
  orcKey1.key = record1;
  OrcKey orcKey2 = new OrcKey();
  orcKey2.key = record2;
  OrcKey orcKey3 = new OrcKey();
  orcKey3.key = record3;
  OrcKey orcKey4 = new OrcKey();
  orcKey4.key = record4;

  Assert.assertTrue(comparator.compare(orcKey0, orcKey1) == 0);
  Assert.assertTrue(comparator.compare(orcKey1, orcKey2) < 0);
  Assert.assertTrue(comparator.compare(orcKey1, orcKey3) < 0);
  Assert.assertTrue(comparator.compare(orcKey1, orcKey4) < 0);
}
 
Example #17
Source File: OrcShimV200.java    From flink with Apache License 2.0
protected Reader.Options readOrcConf(Reader.Options options, Configuration conf) {
	return options.useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
			.skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf));
}
 
Example #18
Source File: OrcShimV230.java    From flink with Apache License 2.0
@Override
protected Reader.Options readOrcConf(Reader.Options options, Configuration conf) {
	return super.readOrcConf(options, conf)
			.tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));
}
 
Example #19
Source File: OrcRowInputFormat.java    From Flink-CEPplus with Apache License 2.0
@Override
public void open(FileInputSplit fileSplit) throws IOException {

	LOG.debug("Opening ORC file {}", fileSplit.getPath());

	// open ORC file and create reader
	org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(fileSplit.getPath().getPath());
	Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));

	// get offset and length for the stripes that start in the split
	Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(fileSplit, getStripes(orcReader));

	// create ORC row reader configuration
	Reader.Options options = getOptions(orcReader)
		.schema(schema)
		.range(offsetAndLength.f0, offsetAndLength.f1)
		.useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
		.skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
		.tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

	// configure filters
	if (!conjunctPredicates.isEmpty()) {
		SearchArgument.Builder b = SearchArgumentFactory.newBuilder();
		b = b.startAnd();
		for (Predicate predicate : conjunctPredicates) {
			predicate.add(b);
		}
		b = b.end();
		options.searchArgument(b.build(), new String[]{});
	}

	// configure selected fields
	options.include(computeProjectionMask());

	// create ORC row reader
	this.orcRowsReader = orcReader.rows(options);

	// assign ids
	this.schema.getId();
	// create row batch
	this.rowBatch = schema.createRowBatch(batchSize);
	rowsInBatch = 0;
	nextRow = 0;
}
 
Example #20
Source File: HiveConfFactory.java    From dremio-oss with Apache License 2.0
/**
 * Fills in a HiveConf instance with any user provided configuration parameters
 *
 * @param hiveConf - the conf to fill in
 * @param config - the user provided parameters
 */
protected static void addUserProperties(HiveConf hiveConf, BaseHiveStoragePluginConfig<?,?> config) {
  // Used to capture properties set by user
  final Set<String> userPropertyNames = new HashSet<>();
  if(config.propertyList != null) {
    for(Property prop : config.propertyList) {
      userPropertyNames.add(prop.name);
      setConf(hiveConf, prop.name, prop.value);
      if(logger.isTraceEnabled()){
        logger.trace("HiveConfig Override {}={}", prop.name, prop.value);
      }
    }
  }

  // Check if zero-copy has been set by user
  boolean zeroCopySetByUser = userPropertyNames.contains(OrcConf.USE_ZEROCOPY.getAttribute());
  // Configure zero-copy for ORC reader
  if (!zeroCopySetByUser) {
    if (VM.isWindowsHost() || VM.isMacOSHost()) {
      logger.debug("MacOS or Windows host detected. Not automatically enabling ORC zero-copy feature");
    } else {
      String fs = hiveConf.get(FileSystem.FS_DEFAULT_NAME_KEY);
      // Equivalent to a case-insensitive startsWith...
      if (fs.regionMatches(true, 0, "maprfs", 0, 6)) {
        // DX-12672: do not enable ORC zero-copy on MapRFS
        logger.debug("MapRFS detected. Not automatically enabling ORC zero-copy feature");
      } else {
        logger.debug("Linux host detected. Enabling ORC zero-copy feature");
        hiveConf.set(OrcConf.USE_ZEROCOPY.getAttribute(), "true");
      }
    }
  } else {
    boolean useZeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(hiveConf);
    if (useZeroCopy) {
      logger.warn("ORC zero-copy feature has been manually enabled. This is not recommended.");
    } else {
      logger.error("ORC zero-copy feature has been manually disabled. This is not recommended and might cause memory issues");
    }
  }

  // Check if fs.s3.impl has been set by user
  boolean fsS3ImplSetByUser = userPropertyNames.contains(FS_S3_IMPL);
  if (fsS3ImplSetByUser) {
    logger.warn(FS_S3_IMPL + " manually set. This is not recommended.");
  } else {
    logger.debug("Setting " + FS_S3_IMPL + " to " + FS_S3_IMPL_DEFAULT);
    setConf(hiveConf, FS_S3_IMPL, FS_S3_IMPL_DEFAULT);
  }

  ADL_PROPS.entrySet().asList().forEach(entry->setConf(hiveConf, entry.getKey(), entry.getValue()));
  WASB_PROPS.entrySet().asList().forEach(entry->setConf(hiveConf, entry.getKey(), entry.getValue()));
  ABFS_PROPS.entrySet().asList().forEach(entry->setConf(hiveConf, entry.getKey(), entry.getValue()));
}
 
Example #21
Source File: ScanWithHiveReader.java    From dremio-oss with Apache License 2.0
private static Class<? extends HiveAbstractReader> getNativeReaderClass(Optional<String> formatName,
    OptionManager options, Configuration configuration, boolean mixedSchema, boolean isTransactional) {
  if (!formatName.isPresent()) {
    return HiveDefaultReader.class;
  }

  Class<? extends HiveAbstractReader> readerClass = readerMap.get(formatName.get());
  if (readerClass == HiveOrcReader.class) {
    // Validate reader
    if (OrcConf.USE_ZEROCOPY.getBoolean(configuration)) {
      if (!NativeCodeLoader.isNativeCodeLoaded()) {
        throw UserException.dataReadError()
            .message("Hadoop native library is required for Hive ORC data, but is not loaded").build(logger);
      }
      // TODO: find a way to access compression codec information?
      if (!SnappyDecompressor.isNativeCodeLoaded()) {
        throw UserException.dataReadError()
          .message("Snappy native library is required for Hive ORC data, but is not loaded").build(logger);
      }

      if (!isNativeZlibLoaded) {
        throw UserException
        .dataReadError()
        .message("Zlib native library is required for Hive ORC data, but is not loaded")
        .build(logger);
      }
    }

    if (new HiveSettings(options).vectorizeOrcReaders() && !mixedSchema && !isTransactional) {
      // We don't use vectorized ORC reader if there is a schema change between table and partitions or the table is
      // a transactional Hive table
      return HiveORCVectorizedReader.class;
    }
  }

  if (readerClass == null) {
    return HiveDefaultReader.class;
  }

  return readerClass;
}
 
Example #22
Source File: HiveConfFactory.java    From dremio-oss with Apache License 2.0
/**
 * Fills in a HiveConf instance with any user provided configuration parameters
 *
 * @param hiveConf - the conf to fill in
 * @param config - the user provided parameters
 */
protected static void addUserProperties(HiveConf hiveConf, BaseHiveStoragePluginConfig<?,?> config) {
  // Used to capture properties set by user
  final Set<String> userPropertyNames = new HashSet<>();
  if(config.propertyList != null) {
    for(Property prop : config.propertyList) {
      userPropertyNames.add(prop.name);
      setConf(hiveConf, prop.name, prop.value);
      if(logger.isTraceEnabled()){
        logger.trace("HiveConfig Override {}={}", prop.name, prop.value);
      }
    }
  }

  // Check if zero-copy has been set by user
  boolean zeroCopySetByUser = userPropertyNames.contains(OrcConf.USE_ZEROCOPY.getAttribute())
    || userPropertyNames.contains(HiveConf.ConfVars.HIVE_ORC_ZEROCOPY.varname);
  // Configure zero-copy for ORC reader
  if (!zeroCopySetByUser) {
    if (VM.isWindowsHost() || VM.isMacOSHost()) {
      logger.debug("MacOS or Windows host detected. Not automatically enabling ORC zero-copy feature");
    } else {
      String fs = hiveConf.get(FileSystem.FS_DEFAULT_NAME_KEY);
      // Equivalent to a case-insensitive startsWith...
      if (fs.regionMatches(true, 0, "maprfs", 0, 6)) {
        // DX-12672: do not enable ORC zero-copy on MapRFS
        logger.debug("MapRFS detected. Not automatically enabling ORC zero-copy feature");
      } else {
        logger.debug("Linux host detected. Enabling ORC zero-copy feature");
        setConf(hiveConf, HiveConf.ConfVars.HIVE_ORC_ZEROCOPY, true);
      }
    }
  } else {
    boolean useZeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(hiveConf);
    if (useZeroCopy) {
      logger.warn("ORC zero-copy feature has been manually enabled. This is not recommended.");
    } else {
      logger.error("ORC zero-copy feature has been manually disabled. This is not recommended and might cause memory issues");
    }
  }

  // Check if ORC Footer cache has been configured by user
  boolean orcStripCacheSetByUser = userPropertyNames.contains(HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_SIZE.varname);
  if (orcStripCacheSetByUser) {
    logger.error("ORC stripe details cache has been manually configured. This is not recommended and might cause memory issues");
  } else {
    logger.debug("Disabling ORC stripe details cache.");
    setConf(hiveConf, HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_SIZE, 0);
  }

  // Check if fs.s3.impl has been set by user
  boolean fsS3ImplSetByUser = userPropertyNames.contains(FS_S3_IMPL);
  if (fsS3ImplSetByUser) {
    logger.warn(FS_S3_IMPL + " manually set. This is not recommended.");
  } else {
    logger.debug("Setting " + FS_S3_IMPL + " to " + FS_S3_IMPL_DEFAULT);
    setConf(hiveConf, FS_S3_IMPL, FS_S3_IMPL_DEFAULT);
  }

  ADL_PROPS.entrySet().asList().forEach(entry->setConf(hiveConf, entry.getKey(), entry.getValue()));
  WASB_PROPS.entrySet().asList().forEach(entry->setConf(hiveConf, entry.getKey(), entry.getValue()));
  ABFS_PROPS.entrySet().asList().forEach(entry->setConf(hiveConf, entry.getKey(), entry.getValue()));
}
 
Example #23
Source File: ScanWithHiveReader.java    From dremio-oss with Apache License 2.0
private static Class<? extends HiveAbstractReader> getNativeReaderClass(Optional<String> formatName,
                                                                        OptionManager options, Configuration configuration, boolean mixedSchema, boolean isTransactional) {
  if (!formatName.isPresent()) {
    return HiveDefaultReader.class;
  }

  Class<? extends HiveAbstractReader> readerClass = readerMap.get(formatName.get());
  if (readerClass == HiveOrcReader.class) {
    // Validate reader
    if (OrcConf.USE_ZEROCOPY.getBoolean(configuration)) {
      if (!NativeCodeLoader.isNativeCodeLoaded()) {
        throw UserException.dataReadError()
            .message("Hadoop native library is required for Hive ORC data, but is not loaded").build(logger);
      }
      // TODO: find a way to access compression codec information?
      if (!SnappyDecompressor.isNativeCodeLoaded()) {
        throw UserException.dataReadError()
          .message("Snappy native library is required for Hive ORC data, but is not loaded").build(logger);
      }

      if (!isNativeZlibLoaded) {
        throw UserException
        .dataReadError()
        .message("Zlib native library is required for Hive ORC data, but is not loaded")
        .build(logger);
      }
    }

    if (new HiveSettings(options).vectorizeOrcReaders() && !mixedSchema && !isTransactional) {
      // We don't use vectorized ORC reader if there is a schema change between table and partitions or the table is
      // a transactional Hive table
      return HiveORCVectorizedReader.class;
    }
  }

  if (readerClass == null) {
    return HiveDefaultReader.class;
  }

  return readerClass;
}
 
Example #24
Source File: ORC.java    From iceberg with Apache License 2.0
public ReadBuilder caseSensitive(boolean newCaseSensitive) {
  OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.setBoolean(this.conf, newCaseSensitive);
  this.caseSensitive = newCaseSensitive;
  return this;
}
 
Example #25
Source File: ORC.java    From iceberg with Apache License 2.0
public WriteBuilder overwrite(boolean enabled) {
  OrcConf.OVERWRITE_OUTPUT_FILE.setBoolean(conf, enabled);
  return this;
}
 
Example #26
Source File: OrcRowInputFormat.java    From flink with Apache License 2.0
@Override
public void open(FileInputSplit fileSplit) throws IOException {

	LOG.debug("Opening ORC file {}", fileSplit.getPath());

	// open ORC file and create reader
	org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(fileSplit.getPath().getPath());
	Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));

	// get offset and length for the stripes that start in the split
	Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(fileSplit, getStripes(orcReader));

	// create ORC row reader configuration
	Reader.Options options = getOptions(orcReader)
		.schema(schema)
		.range(offsetAndLength.f0, offsetAndLength.f1)
		.useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
		.skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
		.tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

	// configure filters
	if (!conjunctPredicates.isEmpty()) {
		SearchArgument.Builder b = SearchArgumentFactory.newBuilder();
		b = b.startAnd();
		for (Predicate predicate : conjunctPredicates) {
			predicate.add(b);
		}
		b = b.end();
		options.searchArgument(b.build(), new String[]{});
	}

	// configure selected fields
	options.include(computeProjectionMask());

	// create ORC row reader
	this.orcRowsReader = orcReader.rows(options);

	// assign ids
	this.schema.getId();
	// create row batch
	this.rowBatch = schema.createRowBatch(batchSize);
	rowsInBatch = 0;
	nextRow = 0;
}