org.apache.orc.OrcConf Java Examples
The following examples show how to use org.apache.orc.OrcConf. Each snippet is taken from an open-source project; the source file and license are noted above each example.
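Before the project snippets, here is a minimal, self-contained sketch of the access pattern they all share: each OrcConf constant exposes the underlying Hadoop property name via getAttribute() and typed getters/setters against a Configuration. The method names (getAttribute, setString, getString, getBoolean, getLong, getDefaultValue) are the same ones used in the examples below; the property values chosen here are purely illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.orc.OrcConf;

public class OrcConfBasics {
    public static void main(String[] args) {
        Configuration conf = new Configuration();

        // Write-side knobs: pick an ORC compression codec and writer version (illustrative values).
        OrcConf.COMPRESS.setString(conf, "SNAPPY");
        OrcConf.WRITE_FORMAT.setString(conf, "0.12");

        // Read-side knobs resolve to their defaults until something overrides them.
        boolean zeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(conf);
        long stripeSize = OrcConf.STRIPE_SIZE.getLong(conf);

        // getAttribute() returns the raw property key, which several examples below
        // pass to Configuration.set()/get() or to table properties directly.
        System.out.println(OrcConf.COMPRESS.getAttribute() + " = " + OrcConf.COMPRESS.getString(conf));
        System.out.println("zero-copy: " + zeroCopy + ", stripe size: " + stripeSize);
        System.out.println("default compression: " + OrcConf.COMPRESS.getDefaultValue());
    }
}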
Example #1
Source File: TestCachingOrcDataSource.java From presto with Apache License 2.0
private static FileSinkOperator.RecordWriter createOrcRecordWriter(File outputFile, Format format, CompressionKind compression, ObjectInspector columnObjectInspector)
        throws IOException {
    JobConf jobConf = new JobConf();
    OrcConf.WRITE_FORMAT.setString(jobConf, format == ORC_12 ? "0.12" : "0.11");
    OrcConf.COMPRESS.setString(jobConf, compression.name());

    Properties tableProperties = new Properties();
    tableProperties.setProperty(IOConstants.COLUMNS, "test");
    tableProperties.setProperty(IOConstants.COLUMNS_TYPES, columnObjectInspector.getTypeName());
    tableProperties.setProperty(OrcConf.STRIPE_SIZE.getAttribute(), "120000");

    return new OrcOutputFormat().getHiveRecordWriter(
        jobConf,
        new Path(outputFile.toURI()),
        Text.class,
        compression != NONE,
        tableProperties,
        () -> {});
}
Example #2
Source File: OrcKeyComparatorTest.java From incubator-gobblin with Apache License 2.0
@Test
public void testSimpleComparator() throws Exception {
    OrcKeyComparator comparator = new OrcKeyComparator();
    Configuration conf = new Configuration();
    String orcSchema = "struct<i:int,j:int>";
    TypeDescription schema = TypeDescription.fromString(orcSchema);
    conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), orcSchema);
    Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), orcSchema);
    comparator.setConf(conf);

    OrcStruct record0 = createSimpleOrcStruct(schema, 1, 2);
    OrcStruct record1 = createSimpleOrcStruct(schema, 3, 4);
    OrcStruct record2 = createSimpleOrcStruct(schema, 3, 4);

    OrcKey orcKey0 = new OrcKey();
    orcKey0.key = record0;
    OrcKey orcKey1 = new OrcKey();
    orcKey1.key = record1;
    OrcKey orcKey2 = new OrcKey();
    orcKey2.key = record2;

    Assert.assertTrue(comparator.compare(orcKey0, orcKey1) < 0);
    Assert.assertTrue(comparator.compare(orcKey1, orcKey2) == 0);
    Assert.assertTrue(comparator.compare(orcKey1, orcKey0) > 0);
}
Example #3
Source File: ORCAppender.java From tajo with Apache License 2.0
private static CompressionKind getCompressionKind(TableMeta meta) {
    String kindstr = meta.getProperty(OrcConf.COMPRESS.getAttribute(),
        String.valueOf(OrcConf.COMPRESS.getDefaultValue()));

    if (kindstr.equalsIgnoreCase(CompressionKind.ZLIB.name())) {
        return CompressionKind.ZLIB;
    }

    if (kindstr.equalsIgnoreCase(CompressionKind.SNAPPY.name())) {
        return CompressionKind.SNAPPY;
    }

    if (kindstr.equalsIgnoreCase(CompressionKind.LZO.name())) {
        return CompressionKind.LZO;
    }

    return CompressionKind.NONE;
}
Example #4
Source File: ORCAppender.java From tajo with Apache License 2.0
private static OrcFile.WriterOptions buildWriterOptions(Configuration conf, TableMeta meta, Schema schema) {
    return OrcFile.writerOptions(conf)
        .setSchema(OrcUtils.convertSchema(schema))
        .compress(getCompressionKind(meta))
        .stripeSize(Long.parseLong(meta.getProperty(OrcConf.STRIPE_SIZE.getAttribute(),
            String.valueOf(OrcConf.STRIPE_SIZE.getDefaultValue()))))
        .blockSize(Long.parseLong(meta.getProperty(OrcConf.BLOCK_SIZE.getAttribute(),
            String.valueOf(OrcConf.BLOCK_SIZE.getDefaultValue()))))
        .rowIndexStride(Integer.parseInt(meta.getProperty(OrcConf.ROW_INDEX_STRIDE.getAttribute(),
            String.valueOf(OrcConf.ROW_INDEX_STRIDE.getDefaultValue()))))
        .bufferSize(Integer.parseInt(meta.getProperty(OrcConf.BUFFER_SIZE.getAttribute(),
            String.valueOf(OrcConf.BUFFER_SIZE.getDefaultValue()))))
        .blockPadding(Boolean.parseBoolean(meta.getProperty(OrcConf.BLOCK_PADDING.getAttribute(),
            String.valueOf(OrcConf.BLOCK_PADDING.getDefaultValue()))))
        .encodingStrategy(EncodingStrategy.valueOf(meta.getProperty(OrcConf.ENCODING_STRATEGY.getAttribute(),
            String.valueOf(OrcConf.ENCODING_STRATEGY.getDefaultValue()))))
        .bloomFilterFpp(Double.parseDouble(meta.getProperty(OrcConf.BLOOM_FILTER_FPP.getAttribute(),
            String.valueOf(OrcConf.BLOOM_FILTER_FPP.getDefaultValue()))))
        .bloomFilterColumns(meta.getProperty(OrcConf.BLOOM_FILTER_COLUMNS.getAttribute(),
            String.valueOf(OrcConf.BLOOM_FILTER_COLUMNS.getDefaultValue())));
}
Example #5
Source File: CompressionConfigUtil.java From presto with Apache License 2.0
public static void configureCompression(Configuration config, HiveCompressionCodec compressionCodec) {
    boolean compression = compressionCodec != HiveCompressionCodec.NONE;
    config.setBoolean(COMPRESSRESULT.varname, compression);
    config.setBoolean("mapred.output.compress", compression);
    config.setBoolean(FileOutputFormat.COMPRESS, compression);

    // For ORC
    OrcConf.COMPRESS.setString(config, compressionCodec.getOrcCompressionKind().name());

    // For RCFile and Text
    if (compressionCodec.getCodec().isPresent()) {
        config.set("mapred.output.compression.codec", compressionCodec.getCodec().get().getName());
        config.set(FileOutputFormat.COMPRESS_CODEC, compressionCodec.getCodec().get().getName());
    } else {
        config.unset("mapred.output.compression.codec");
        config.unset(FileOutputFormat.COMPRESS_CODEC);
    }

    // For Parquet
    config.set(ParquetOutputFormat.COMPRESSION, compressionCodec.getParquetCompressionCodec().name());

    // For SequenceFile
    config.set(FileOutputFormat.COMPRESS_TYPE, BLOCK.toString());
}
Example #6
Source File: OrcFileWriterFactory.java From presto with Apache License 2.0
private static CompressionKind getCompression(Properties schema, JobConf configuration) {
    String compressionName = OrcConf.COMPRESS.getString(schema, configuration);
    if (compressionName == null) {
        return CompressionKind.ZLIB;
    }

    CompressionKind compression;
    try {
        compression = CompressionKind.valueOf(compressionName.toUpperCase(ENGLISH));
    } catch (IllegalArgumentException e) {
        throw new PrestoException(HIVE_UNSUPPORTED_FORMAT, "Unknown ORC compression type " + compressionName);
    }
    return compression;
}
Example #7
Source File: TestOrcWrite.java From iceberg with Apache License 2.0
@Test
public void testBasicWrite() throws IOException {
    File parent = temp.newFolder("orc");
    File location = new File(parent, "test");
    location.mkdirs();

    HadoopTables tables = new HadoopTables(CONF);
    PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("data").build();
    Table table = tables.create(SCHEMA, spec, location.toString());
    table.updateProperties()
        .defaultFormat(FileFormat.ORC)
        .set(OrcConf.COMPRESS.getAttribute(), CompressionKind.NONE.name())
        .commit();

    List<SimpleRecord> expected = Lists.newArrayList(
        new SimpleRecord(1, "a"),
        new SimpleRecord(2, "b"),
        new SimpleRecord(3, "c")
    );

    Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

    // TODO: incoming columns must be ordered according to the table's schema
    df.select("id", "data").write()
        .format("iceberg")
        .mode("append")
        .save(location.toString());

    table.refresh();

    Dataset<Row> result = spark.read()
        .format("iceberg")
        .load(location.toString());

    List<SimpleRecord> actual = result.orderBy("id").as(
        Encoders.bean(SimpleRecord.class)).collectAsList();

    Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
    Assert.assertEquals("Result rows should match", expected, actual);
}
Example #8
Source File: OrcTester.java From presto with Apache License 2.0
static RecordWriter createOrcRecordWriter(File outputFile, Format format, CompressionKind compression, Type type)
        throws IOException {
    JobConf jobConf = new JobConf();
    OrcConf.WRITE_FORMAT.setString(jobConf, format == ORC_12 ? "0.12" : "0.11");
    OrcConf.COMPRESS.setString(jobConf, compression.name());

    return new OrcOutputFormat().getHiveRecordWriter(
        jobConf,
        new Path(outputFile.toURI()),
        Text.class,
        compression != NONE,
        createTableProperties("test", getJavaObjectInspector(type).getTypeName()),
        () -> {});
}
Example #9
Source File: OrcValueMapper.java From incubator-gobblin with Apache License 2.0
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    super.setup(context);
    this.jobConf = new JobConf(context.getConfiguration());
    this.outKey = new OrcKey();
    this.outKey.configure(jobConf);
    this.outValue = new OrcValue();
    this.outValue.configure(jobConf);
    this.mrOutputSchema =
        TypeDescription.fromString(context.getConfiguration().get(OrcConf.MAPRED_INPUT_SCHEMA.getAttribute()));
    this.shuffleKeySchema =
        TypeDescription.fromString(context.getConfiguration().get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()));
}
Example #10
Source File: CompactionOrcJobConfigurator.java From incubator-gobblin with Apache License 2.0
protected void configureSchema(Job job) throws IOException {
    TypeDescription schema = OrcUtils.getNewestSchemaFromSource(job, this.fs);
    job.getConfiguration().set(OrcConf.MAPRED_INPUT_SCHEMA.getAttribute(), schema.toString());
    job.getConfiguration().set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(),
        orcMapperShuffleSchemaString.isEmpty() ? schema.toString() : orcMapperShuffleSchemaString);
    job.getConfiguration().set(OrcConf.MAPRED_SHUFFLE_VALUE_SCHEMA.getAttribute(), schema.toString());
    job.getConfiguration().set(OrcConf.MAPRED_OUTPUT_SCHEMA.getAttribute(), schema.toString());
}
Example #11
Source File: OrcNoHiveShim.java From flink with Apache License 2.0
@Override
public RecordReader createRecordReader(
        Configuration conf,
        TypeDescription schema,
        int[] selectedFields,
        List<OrcSplitReader.Predicate> conjunctPredicates,
        org.apache.flink.core.fs.Path path,
        long splitStart,
        long splitLength) throws IOException {
    // open ORC file and create reader
    org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(path.toUri());

    Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));

    // get offset and length for the stripes that start in the split
    Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(
        splitStart, splitLength, orcReader.getStripes());

    // create ORC row reader configuration
    Reader.Options options = new Reader.Options()
        .schema(schema)
        .range(offsetAndLength.f0, offsetAndLength.f1)
        .useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
        .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
        .tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

    // TODO configure filters

    // configure selected fields
    options.include(computeProjectionMask(schema, selectedFields));

    // create ORC row reader
    RecordReader orcRowsReader = orcReader.rows(options);

    // assign ids
    schema.getId();

    return orcRowsReader;
}
Example #12
Source File: TestCompressionStorages.java From tajo with Apache License 2.0
private void storageCompressionTest(String dataFormat, Class<? extends CompressionCodec> codec) throws IOException {
    Schema schema = SchemaBuilder.builder()
        .add("id", Type.INT4)
        .add("age", Type.FLOAT4)
        .add("name", Type.TEXT)
        .build();

    TableMeta meta = CatalogUtil.newTableMeta(dataFormat, conf);
    meta.putProperty("compression.codec", codec.getCanonicalName());
    meta.putProperty("compression.type", SequenceFile.CompressionType.BLOCK.name());
    meta.putProperty("rcfile.serde", TextSerializerDeserializer.class.getName());
    meta.putProperty("sequencefile.serde", TextSerializerDeserializer.class.getName());

    if (codec.equals(SnappyCodec.class)) {
        meta.putProperty(OrcConf.COMPRESS.getAttribute(), "SNAPPY");
    } else if (codec.equals(Lz4Codec.class)) {
        meta.putProperty(OrcConf.COMPRESS.getAttribute(), "ZLIB");
    } else {
        meta.putProperty(OrcConf.COMPRESS.getAttribute(), "NONE");
    }

    String fileName = "Compression_" + codec.getSimpleName();
    Path tablePath = new Path(testDir, fileName);
    Appender appender = ((FileTablespace) TablespaceManager.getLocalFs()).getAppender(meta, schema, tablePath);
    appender.enableStats();
    appender.init();

    String extension = "";
    if (appender instanceof DelimitedTextFile.DelimitedTextFileAppender) {
        extension = ((DelimitedTextFile.DelimitedTextFileAppender) appender).getExtension();
    }

    int tupleNum = 1000;
    VTuple vTuple;

    for (int i = 0; i < tupleNum; i++) {
        vTuple = new VTuple(3);
        vTuple.put(0, DatumFactory.createInt4(i + 1));
        vTuple.put(1, DatumFactory.createFloat4((float) i));
        vTuple.put(2, DatumFactory.createText(String.valueOf(i)));
        appender.addTuple(vTuple);
    }
    appender.close();

    TableStats stat = appender.getStats();
    assertEquals(tupleNum, stat.getNumRows().longValue());

    tablePath = tablePath.suffix(extension);
    FileStatus status = fs.getFileStatus(tablePath);
    long fileLen = status.getLen();
    FileFragment[] tablets = new FileFragment[1];
    tablets[0] = new FileFragment(fileName, tablePath, 0, fileLen);

    Scanner scanner = TablespaceManager.getLocalFs().getScanner(meta, schema, tablets[0], schema);
    scanner.init();

    if (dataFormat.equalsIgnoreCase(BuiltinStorages.SEQUENCE_FILE)) {
        assertTrue(scanner instanceof SequenceFileScanner);
        Writable key = ((SequenceFileScanner) scanner).getKey();
        assertEquals(key.getClass().getCanonicalName(), LongWritable.class.getCanonicalName());
    }

    int tupleCnt = 0;
    while ((scanner.next()) != null) {
        tupleCnt++;
    }
    scanner.close();

    assertEquals(tupleNum, tupleCnt);
    assertNotSame(appender.getStats().getNumBytes().longValue(),
        scanner.getInputStats().getNumBytes().longValue());
    assertEquals(appender.getStats().getNumRows().longValue(),
        scanner.getInputStats().getNumRows().longValue());
}
Example #13
Source File: NiFiOrcUtils.java From nifi with Apache License 2.0
public static Writer createWriter(
        Path path,
        Configuration conf,
        TypeInfo orcSchema,
        long stripeSize,
        CompressionKind compress,
        int bufferSize) throws IOException {

    int rowIndexStride = (int) OrcConf.ROW_INDEX_STRIDE.getLong(conf);

    boolean addBlockPadding = OrcConf.BLOCK_PADDING.getBoolean(conf);

    String versionName = OrcConf.WRITE_FORMAT.getString(conf);
    OrcFile.Version versionValue = (versionName == null)
        ? OrcFile.Version.CURRENT
        : OrcFile.Version.byName(versionName);

    OrcFile.EncodingStrategy encodingStrategy;
    String enString = OrcConf.ENCODING_STRATEGY.getString(conf);
    if (enString == null) {
        encodingStrategy = OrcFile.EncodingStrategy.SPEED;
    } else {
        encodingStrategy = OrcFile.EncodingStrategy.valueOf(enString);
    }

    final double paddingTolerance = OrcConf.BLOCK_PADDING_TOLERANCE.getDouble(conf);

    long blockSizeValue = OrcConf.BLOCK_SIZE.getLong(conf);

    double bloomFilterFpp = OrcConf.BLOOM_FILTER_FPP.getDouble(conf);

    ObjectInspector inspector = OrcStruct.createObjectInspector(orcSchema);

    OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(conf)
        .rowIndexStride(rowIndexStride)
        .blockPadding(addBlockPadding)
        .version(versionValue)
        .encodingStrategy(encodingStrategy)
        .paddingTolerance(paddingTolerance)
        .blockSize(blockSizeValue)
        .bloomFilterFpp(bloomFilterFpp)
        .memory(getMemoryManager(conf))
        .inspector(inspector)
        .stripeSize(stripeSize)
        .bufferSize(bufferSize)
        .compress(compress);

    return OrcFile.createWriter(path, writerOptions);
}
Example #14
Source File: OrcKeyComparatorTest.java From incubator-gobblin with Apache License 2.0
@Test
public void testComplexRecordUnion() throws Exception {
    OrcKeyComparator comparator = new OrcKeyComparator();
    Configuration conf = new Configuration();

    TypeDescription listSchema = TypeDescription.createList(TypeDescription.createString());
    TypeDescription nestedRecordSchema = TypeDescription.createStruct()
        .addField("x", TypeDescription.createInt())
        .addField("y", TypeDescription.createInt());
    TypeDescription unionSchema = TypeDescription.createUnion()
        .addUnionChild(TypeDescription.createInt())
        .addUnionChild(listSchema)
        .addUnionChild(nestedRecordSchema);
    TypeDescription schema = TypeDescription.createStruct()
        .addField("a", TypeDescription.createInt())
        .addField("b", unionSchema);

    conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), schema.toString());
    Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), schema.toString());
    comparator.setConf(conf);

    // base record
    OrcStruct record0 = (OrcStruct) OrcStruct.createValue(schema);
    record0.setFieldValue("a", new IntWritable(1));
    OrcStruct nestedRecord0 = createSimpleOrcStruct(nestedRecordSchema, 1, 2);
    OrcUnion orcUnion0 = createOrcUnion(unionSchema, nestedRecord0);
    record0.setFieldValue("b", orcUnion0);

    // same content as base record in diff objects.
    OrcStruct record1 = (OrcStruct) OrcStruct.createValue(schema);
    record1.setFieldValue("a", new IntWritable(1));
    OrcStruct nestedRecord1 = createSimpleOrcStruct(nestedRecordSchema, 1, 2);
    OrcUnion orcUnion1 = createOrcUnion(unionSchema, nestedRecord1);
    record1.setFieldValue("b", orcUnion1);

    // diff records inside union, record0 == record1 < 2
    OrcStruct record2 = (OrcStruct) OrcStruct.createValue(schema);
    record2.setFieldValue("a", new IntWritable(1));
    OrcStruct nestedRecord2 = createSimpleOrcStruct(nestedRecordSchema, 2, 2);
    OrcUnion orcUnion2 = createOrcUnion(unionSchema, nestedRecord2);
    record2.setFieldValue("b", orcUnion2);

    // differ in list inside union, record3 < record4 == record5
    OrcStruct record3 = (OrcStruct) OrcStruct.createValue(schema);
    record3.setFieldValue("a", new IntWritable(1));
    OrcList orcList3 = createOrcList(5, listSchema, 2);
    OrcUnion orcUnion3 = createOrcUnion(unionSchema, orcList3);
    record3.setFieldValue("b", orcUnion3);

    OrcStruct record4 = (OrcStruct) OrcStruct.createValue(schema);
    record4.setFieldValue("a", new IntWritable(1));
    OrcList orcList4 = createOrcList(6, listSchema, 2);
    OrcUnion orcUnion4 = createOrcUnion(unionSchema, orcList4);
    record4.setFieldValue("b", orcUnion4);

    OrcStruct record5 = (OrcStruct) OrcStruct.createValue(schema);
    record5.setFieldValue("a", new IntWritable(1));
    OrcList orcList5 = createOrcList(6, listSchema, 2);
    OrcUnion orcUnion5 = createOrcUnion(unionSchema, orcList5);
    record5.setFieldValue("b", orcUnion5);

    OrcKey orcKey0 = new OrcKey();
    orcKey0.key = record0;
    OrcKey orcKey1 = new OrcKey();
    orcKey1.key = record1;
    OrcKey orcKey2 = new OrcKey();
    orcKey2.key = record2;
    OrcKey orcKey3 = new OrcKey();
    orcKey3.key = record3;
    OrcKey orcKey4 = new OrcKey();
    orcKey4.key = record4;
    OrcKey orcKey5 = new OrcKey();
    orcKey5.key = record5;

    Assert.assertEquals(orcUnion0, orcUnion1);
    // Int value in orcKey2 is larger
    Assert.assertTrue(comparator.compare(orcKey0, orcKey2) < 0);
    Assert.assertTrue(comparator.compare(orcKey3, orcKey4) < 0);
    Assert.assertTrue(comparator.compare(orcKey3, orcKey5) < 0);
    Assert.assertTrue(comparator.compare(orcKey4, orcKey5) == 0);
}
Example #15
Source File: OrcKeyComparatorTest.java From incubator-gobblin with Apache License 2.0
@Test
public void testComplexRecordMap() throws Exception {
    OrcKeyComparator comparator = new OrcKeyComparator();
    Configuration conf = new Configuration();
    TypeDescription mapFieldSchema =
        TypeDescription.createMap(TypeDescription.createString(), TypeDescription.createString());
    TypeDescription schema = TypeDescription.createStruct()
        .addField("a", TypeDescription.createInt())
        .addField("b", mapFieldSchema);

    conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), schema.toString());
    Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), schema.toString());
    comparator.setConf(conf);

    // base record
    OrcStruct record0 = (OrcStruct) OrcStruct.createValue(schema);
    record0.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap = createSimpleOrcMap(new Text("key"), new Text("value"), mapFieldSchema);
    record0.setFieldValue("b", orcMap);

    // key value both differ
    OrcStruct record1 = (OrcStruct) OrcStruct.createValue(schema);
    record1.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap1 = createSimpleOrcMap(new Text("key_key"), new Text("value_value"), mapFieldSchema);
    record1.setFieldValue("b", orcMap1);

    // Key same, value differ
    OrcStruct record2 = (OrcStruct) OrcStruct.createValue(schema);
    record2.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap2 = createSimpleOrcMap(new Text("key"), new Text("value_value"), mapFieldSchema);
    record2.setFieldValue("b", orcMap2);

    // Same as base
    OrcStruct record3 = (OrcStruct) OrcStruct.createValue(schema);
    record3.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap3 = createSimpleOrcMap(new Text("key"), new Text("value"), mapFieldSchema);
    record3.setFieldValue("b", orcMap3);

    // Differ in other field.
    OrcStruct record4 = (OrcStruct) OrcStruct.createValue(schema);
    record4.setFieldValue("a", new IntWritable(2));
    record4.setFieldValue("b", orcMap);

    // Record with map containing multiple entries but inserted in different order.
    OrcStruct record6 = (OrcStruct) OrcStruct.createValue(schema);
    record6.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap6 = createSimpleOrcMap(new Text("key"), new Text("value"), mapFieldSchema);
    orcMap6.put(new Text("keyLater"), new Text("valueLater"));
    record6.setFieldValue("b", orcMap6);

    OrcStruct record7 = (OrcStruct) OrcStruct.createValue(schema);
    record7.setFieldValue("a", new IntWritable(1));
    OrcMap orcMap7 = createSimpleOrcMap(new Text("keyLater"), new Text("valueLater"), mapFieldSchema);
    orcMap7.put(new Text("key"), new Text("value"));
    record7.setFieldValue("b", orcMap7);

    OrcKey orcKey0 = new OrcKey();
    orcKey0.key = record0;
    OrcKey orcKey1 = new OrcKey();
    orcKey1.key = record1;
    OrcKey orcKey2 = new OrcKey();
    orcKey2.key = record2;
    OrcKey orcKey3 = new OrcKey();
    orcKey3.key = record3;
    OrcKey orcKey4 = new OrcKey();
    orcKey4.key = record4;
    OrcKey orcKey6 = new OrcKey();
    orcKey6.key = record6;
    OrcKey orcKey7 = new OrcKey();
    orcKey7.key = record7;

    Assert.assertTrue(comparator.compare(orcKey0, orcKey1) < 0);
    Assert.assertTrue(comparator.compare(orcKey1, orcKey2) > 0);
    Assert.assertTrue(comparator.compare(orcKey2, orcKey3) > 0);
    Assert.assertTrue(comparator.compare(orcKey0, orcKey3) == 0);
    Assert.assertTrue(comparator.compare(orcKey0, orcKey4) < 0);
    Assert.assertTrue(comparator.compare(orcKey6, orcKey7) == 0);
}
Example #16
Source File: OrcKeyComparatorTest.java From incubator-gobblin with Apache License 2.0
@Test
public void testComplexRecordArray() throws Exception {
    OrcKeyComparator comparator = new OrcKeyComparator();
    Configuration conf = new Configuration();
    TypeDescription listSchema = TypeDescription.createList(TypeDescription.createString());
    TypeDescription schema = TypeDescription.createStruct()
        .addField("a", TypeDescription.createInt())
        .addField("b", listSchema);

    conf.set(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute(), schema.toString());
    Assert.assertEquals(conf.get(OrcConf.MAPRED_SHUFFLE_KEY_SCHEMA.getAttribute()), schema.toString());
    comparator.setConf(conf);

    // base record
    OrcStruct record0 = (OrcStruct) OrcStruct.createValue(schema);
    record0.setFieldValue("a", new IntWritable(1));
    OrcList orcList0 = createOrcList(3, listSchema, 3);
    record0.setFieldValue("b", orcList0);

    // the same as base but different object, expecting equal to each other.
    OrcStruct record1 = (OrcStruct) OrcStruct.createValue(schema);
    record1.setFieldValue("a", new IntWritable(1));
    OrcList orcList1 = createOrcList(3, listSchema, 3);
    record1.setFieldValue("b", orcList1);

    // Diff in int field
    OrcStruct record2 = (OrcStruct) OrcStruct.createValue(schema);
    record2.setFieldValue("a", new IntWritable(2));
    OrcList orcList2 = createOrcList(3, listSchema, 3);
    record2.setFieldValue("b", orcList2);

    // Diff in array field: 1
    OrcStruct record3 = (OrcStruct) OrcStruct.createValue(schema);
    record3.setFieldValue("a", new IntWritable(1));
    OrcList orcList3 = createOrcList(3, listSchema, 5);
    record3.setFieldValue("b", orcList3);

    // Diff in array field: 2
    OrcStruct record4 = (OrcStruct) OrcStruct.createValue(schema);
    record4.setFieldValue("a", new IntWritable(1));
    OrcList orcList4 = createOrcList(4, listSchema, 3);
    record4.setFieldValue("b", orcList4);

    OrcKey orcKey0 = new OrcKey();
    orcKey0.key = record0;
    OrcKey orcKey1 = new OrcKey();
    orcKey1.key = record1;
    OrcKey orcKey2 = new OrcKey();
    orcKey2.key = record2;
    OrcKey orcKey3 = new OrcKey();
    orcKey3.key = record3;
    OrcKey orcKey4 = new OrcKey();
    orcKey4.key = record4;

    Assert.assertTrue(comparator.compare(orcKey0, orcKey1) == 0);
    Assert.assertTrue(comparator.compare(orcKey1, orcKey2) < 0);
    Assert.assertTrue(comparator.compare(orcKey1, orcKey3) < 0);
    Assert.assertTrue(comparator.compare(orcKey1, orcKey4) < 0);
}
Example #17
Source File: OrcShimV200.java From flink with Apache License 2.0
protected Reader.Options readOrcConf(Reader.Options options, Configuration conf) {
    return options.useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
        .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf));
}
Example #18
Source File: OrcShimV230.java From flink with Apache License 2.0
@Override
protected Reader.Options readOrcConf(Reader.Options options, Configuration conf) {
    return super.readOrcConf(options, conf)
        .tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));
}
Example #19
Source File: OrcRowInputFormat.java From Flink-CEPplus with Apache License 2.0
@Override
public void open(FileInputSplit fileSplit) throws IOException {
    LOG.debug("Opening ORC file {}", fileSplit.getPath());

    // open ORC file and create reader
    org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(fileSplit.getPath().getPath());
    Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));

    // get offset and length for the stripes that start in the split
    Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(fileSplit, getStripes(orcReader));

    // create ORC row reader configuration
    Reader.Options options = getOptions(orcReader)
        .schema(schema)
        .range(offsetAndLength.f0, offsetAndLength.f1)
        .useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
        .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
        .tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

    // configure filters
    if (!conjunctPredicates.isEmpty()) {
        SearchArgument.Builder b = SearchArgumentFactory.newBuilder();
        b = b.startAnd();
        for (Predicate predicate : conjunctPredicates) {
            predicate.add(b);
        }
        b = b.end();
        options.searchArgument(b.build(), new String[]{});
    }

    // configure selected fields
    options.include(computeProjectionMask());

    // create ORC row reader
    this.orcRowsReader = orcReader.rows(options);

    // assign ids
    this.schema.getId();

    // create row batch
    this.rowBatch = schema.createRowBatch(batchSize);
    rowsInBatch = 0;
    nextRow = 0;
}
Example #20
Source File: HiveConfFactory.java From dremio-oss with Apache License 2.0
/**
 * Fills in a HiveConf instance with any user provided configuration parameters
 *
 * @param hiveConf - the conf to fill in
 * @param config - the user provided parameters
 * @return
 */
protected static void addUserProperties(HiveConf hiveConf, BaseHiveStoragePluginConfig<?,?> config) {
    // Used to capture properties set by user
    final Set<String> userPropertyNames = new HashSet<>();
    if (config.propertyList != null) {
        for (Property prop : config.propertyList) {
            userPropertyNames.add(prop.name);
            setConf(hiveConf, prop.name, prop.value);
            if (logger.isTraceEnabled()) {
                logger.trace("HiveConfig Override {}={}", prop.name, prop.value);
            }
        }
    }

    // Check if zero-copy has been set by user
    boolean zeroCopySetByUser = userPropertyNames.contains(OrcConf.USE_ZEROCOPY.getAttribute());
    // Configure zero-copy for ORC reader
    if (!zeroCopySetByUser) {
        if (VM.isWindowsHost() || VM.isMacOSHost()) {
            logger.debug("MacOS or Windows host detected. Not automatically enabling ORC zero-copy feature");
        } else {
            String fs = hiveConf.get(FileSystem.FS_DEFAULT_NAME_KEY);
            // Equivalent to a case-insensitive startsWith...
            if (fs.regionMatches(true, 0, "maprfs", 0, 6)) {
                // DX-12672: do not enable ORC zero-copy on MapRFS
                logger.debug("MapRFS detected. Not automatically enabling ORC zero-copy feature");
            } else {
                logger.debug("Linux host detected. Enabling ORC zero-copy feature");
                hiveConf.set(OrcConf.USE_ZEROCOPY.getAttribute(), "true");
            }
        }
    } else {
        boolean useZeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(hiveConf);
        if (useZeroCopy) {
            logger.warn("ORC zero-copy feature has been manually enabled. This is not recommended.");
        } else {
            logger.error("ORC zero-copy feature has been manually disabled. This is not recommended and might cause memory issues");
        }
    }

    // Check if fs.s3.impl has been set by user
    boolean fsS3ImplSetByUser = userPropertyNames.contains(FS_S3_IMPL);
    if (fsS3ImplSetByUser) {
        logger.warn(FS_S3_IMPL + " manually set. This is not recommended.");
    } else {
        logger.debug("Setting " + FS_S3_IMPL + " to " + FS_S3_IMPL_DEFAULT);
        setConf(hiveConf, FS_S3_IMPL, FS_S3_IMPL_DEFAULT);
    }

    ADL_PROPS.entrySet().asList().forEach(entry -> setConf(hiveConf, entry.getKey(), entry.getValue()));
    WASB_PROPS.entrySet().asList().forEach(entry -> setConf(hiveConf, entry.getKey(), entry.getValue()));
    ABFS_PROPS.entrySet().asList().forEach(entry -> setConf(hiveConf, entry.getKey(), entry.getValue()));
}
Example #21
Source File: ScanWithHiveReader.java From dremio-oss with Apache License 2.0
private static Class<? extends HiveAbstractReader> getNativeReaderClass(Optional<String> formatName,
    OptionManager options, Configuration configuration, boolean mixedSchema, boolean isTransactional) {
    if (!formatName.isPresent()) {
        return HiveDefaultReader.class;
    }

    Class<? extends HiveAbstractReader> readerClass = readerMap.get(formatName.get());
    if (readerClass == HiveOrcReader.class) {
        // Validate reader
        if (OrcConf.USE_ZEROCOPY.getBoolean(configuration)) {
            if (!NativeCodeLoader.isNativeCodeLoaded()) {
                throw UserException.dataReadError()
                    .message("Hadoop native library is required for Hive ORC data, but is not loaded").build(logger);
            }
            // TODO: find a way to access compression codec information?
            if (!SnappyDecompressor.isNativeCodeLoaded()) {
                throw UserException.dataReadError()
                    .message("Snappy native library is required for Hive ORC data, but is not loaded").build(logger);
            }
            if (!isNativeZlibLoaded) {
                throw UserException
                    .dataReadError()
                    .message("Zlib native library is required for Hive ORC data, but is not loaded")
                    .build(logger);
            }
        }

        if (new HiveSettings(options).vectorizeOrcReaders() && !mixedSchema && !isTransactional) {
            // We don't use vectorized ORC reader if there is a schema change between table and partitions
            // or the table is a transactional Hive table
            return HiveORCVectorizedReader.class;
        }
    }

    if (readerClass == null) {
        return HiveDefaultReader.class;
    }

    return readerClass;
}
Example #22
Source File: HiveConfFactory.java From dremio-oss with Apache License 2.0
/**
 * Fills in a HiveConf instance with any user provided configuration parameters
 *
 * @param hiveConf - the conf to fill in
 * @param config - the user provided parameters
 * @return
 */
protected static void addUserProperties(HiveConf hiveConf, BaseHiveStoragePluginConfig<?,?> config) {
    // Used to capture properties set by user
    final Set<String> userPropertyNames = new HashSet<>();
    if (config.propertyList != null) {
        for (Property prop : config.propertyList) {
            userPropertyNames.add(prop.name);
            setConf(hiveConf, prop.name, prop.value);
            if (logger.isTraceEnabled()) {
                logger.trace("HiveConfig Override {}={}", prop.name, prop.value);
            }
        }
    }

    // Check if zero-copy has been set by user
    boolean zeroCopySetByUser = userPropertyNames.contains(OrcConf.USE_ZEROCOPY.getAttribute())
        || userPropertyNames.contains(HiveConf.ConfVars.HIVE_ORC_ZEROCOPY.varname);
    // Configure zero-copy for ORC reader
    if (!zeroCopySetByUser) {
        if (VM.isWindowsHost() || VM.isMacOSHost()) {
            logger.debug("MacOS or Windows host detected. Not automatically enabling ORC zero-copy feature");
        } else {
            String fs = hiveConf.get(FileSystem.FS_DEFAULT_NAME_KEY);
            // Equivalent to a case-insensitive startsWith...
            if (fs.regionMatches(true, 0, "maprfs", 0, 6)) {
                // DX-12672: do not enable ORC zero-copy on MapRFS
                logger.debug("MapRFS detected. Not automatically enabling ORC zero-copy feature");
            } else {
                logger.debug("Linux host detected. Enabling ORC zero-copy feature");
                setConf(hiveConf, HiveConf.ConfVars.HIVE_ORC_ZEROCOPY, true);
            }
        }
    } else {
        boolean useZeroCopy = OrcConf.USE_ZEROCOPY.getBoolean(hiveConf);
        if (useZeroCopy) {
            logger.warn("ORC zero-copy feature has been manually enabled. This is not recommended.");
        } else {
            logger.error("ORC zero-copy feature has been manually disabled. This is not recommended and might cause memory issues");
        }
    }

    // Check if ORC Footer cache has been configured by user
    boolean orcStripCacheSetByUser = userPropertyNames.contains(HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_SIZE.varname);
    if (orcStripCacheSetByUser) {
        logger.error("ORC stripe details cache has been manually configured. This is not recommended and might cause memory issues");
    } else {
        logger.debug("Disabling ORC stripe details cache.");
        setConf(hiveConf, HiveConf.ConfVars.HIVE_ORC_CACHE_STRIPE_DETAILS_SIZE, 0);
    }

    // Check if fs.s3.impl has been set by user
    boolean fsS3ImplSetByUser = userPropertyNames.contains(FS_S3_IMPL);
    if (fsS3ImplSetByUser) {
        logger.warn(FS_S3_IMPL + " manually set. This is not recommended.");
    } else {
        logger.debug("Setting " + FS_S3_IMPL + " to " + FS_S3_IMPL_DEFAULT);
        setConf(hiveConf, FS_S3_IMPL, FS_S3_IMPL_DEFAULT);
    }

    ADL_PROPS.entrySet().asList().forEach(entry -> setConf(hiveConf, entry.getKey(), entry.getValue()));
    WASB_PROPS.entrySet().asList().forEach(entry -> setConf(hiveConf, entry.getKey(), entry.getValue()));
    ABFS_PROPS.entrySet().asList().forEach(entry -> setConf(hiveConf, entry.getKey(), entry.getValue()));
}
Example #23
Source File: ORC.java From iceberg with Apache License 2.0
public ReadBuilder caseSensitive(boolean newCaseSensitive) {
    OrcConf.IS_SCHEMA_EVOLUTION_CASE_SENSITIVE.setBoolean(this.conf, newCaseSensitive);
    this.caseSensitive = newCaseSensitive;
    return this;
}
Example #24
Source File: ORC.java From iceberg with Apache License 2.0
public WriteBuilder overwrite(boolean enabled) {
    OrcConf.OVERWRITE_OUTPUT_FILE.setBoolean(conf, enabled);
    return this;
}
Example #25
Source File: OrcRowInputFormat.java From flink with Apache License 2.0
@Override
public void open(FileInputSplit fileSplit) throws IOException {
    LOG.debug("Opening ORC file {}", fileSplit.getPath());

    // open ORC file and create reader
    org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(fileSplit.getPath().getPath());
    Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));

    // get offset and length for the stripes that start in the split
    Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(fileSplit, getStripes(orcReader));

    // create ORC row reader configuration
    Reader.Options options = getOptions(orcReader)
        .schema(schema)
        .range(offsetAndLength.f0, offsetAndLength.f1)
        .useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
        .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
        .tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

    // configure filters
    if (!conjunctPredicates.isEmpty()) {
        SearchArgument.Builder b = SearchArgumentFactory.newBuilder();
        b = b.startAnd();
        for (Predicate predicate : conjunctPredicates) {
            predicate.add(b);
        }
        b = b.end();
        options.searchArgument(b.build(), new String[]{});
    }

    // configure selected fields
    options.include(computeProjectionMask());

    // create ORC row reader
    this.orcRowsReader = orcReader.rows(options);

    // assign ids
    this.schema.getId();

    // create row batch
    this.rowBatch = schema.createRowBatch(batchSize);
    rowsInBatch = 0;
    nextRow = 0;
}