org.apache.orc.Reader Java Examples

The following examples show how to use org.apache.orc.Reader. They are drawn from open-source projects; follow the link above each example to view the original project or source file.
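Before the project-specific examples, here is a minimal sketch of the typical read path, assuming a local ORC file at an illustrative path: open a file-level Reader with OrcFile.createReader, then pull rows in vectorized batches.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

public class OrcReadSketch {
  public static void main(String[] args) throws Exception {
    // Illustrative path; point this at a real ORC file.
    Reader reader = OrcFile.createReader(
        new Path("/tmp/example.orc"), OrcFile.readerOptions(new Configuration()));

    RecordReader rows = reader.rows();
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    while (rows.nextBatch(batch)) {
      // batch.cols holds one ColumnVector per column; batch.size rows are valid.
      System.out.println("read " + batch.size + " rows");
    }
    rows.close();
  }
}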
Example #1
Source File: OrcIterable.java    From iceberg with Apache License 2.0
private static VectorizedRowBatchIterator newOrcIterator(InputFile file,
                                                         TypeDescription readerSchema,
                                                         Long start, Long length,
                                                         Reader orcFileReader, SearchArgument sarg) {
  final Reader.Options options = orcFileReader.options();
  if (start != null) {
    options.range(start, length);
  }
  options.schema(readerSchema);
  options.searchArgument(sarg, new String[]{});

  try {
    return new VectorizedRowBatchIterator(file.location(), readerSchema, orcFileReader.rows(options));
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to get ORC rows for file: %s", file);
  }
}
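Example #1 takes an already-built SearchArgument for predicate pushdown. A hedged sketch of constructing one with the storage-api builder (the column name and literal are illustrative):

import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;

// Matches rows where id = 42; the result can be passed as the `sarg` argument above.
SearchArgument sarg = SearchArgumentFactory.newBuilder()
    .startAnd()
    .equals("id", PredicateLeaf.Type.LONG, 42L)
    .end()
    .build();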
 
Example #2
Source File: OrcShimV200.java    From flink with Apache License 2.0
protected Reader createReader(Path path, Configuration conf) throws IOException {
	try {
		Class<?> orcFileClass = Class.forName("org.apache.hadoop.hive.ql.io.orc.OrcFile");
		Object readerOptions = invokeStaticMethod(orcFileClass, "readerOptions", conf);

		Class<?> readerClass = Class.forName("org.apache.hadoop.hive.ql.io.orc.ReaderImpl");
		//noinspection unchecked
		return (Reader) invokeConstructor(readerClass, path, readerOptions);
	} catch (ClassNotFoundException |
			NoSuchMethodException |
			IllegalAccessException |
			InstantiationException |
			InvocationTargetException e) {
		throw new IOException(e);
	}
}
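The invokeStaticMethod and invokeConstructor helpers are not shown in this snippet. A plausible sketch built on Apache commons-lang3 (an assumption; the actual shim may implement them differently):

import java.lang.reflect.InvocationTargetException;
import org.apache.commons.lang3.reflect.ConstructorUtils;
import org.apache.commons.lang3.reflect.MethodUtils;

// Thin wrappers over commons-lang3 reflection utilities (assumed implementation).
static Object invokeStaticMethod(Class<?> cls, String name, Object... args)
		throws NoSuchMethodException, IllegalAccessException, InvocationTargetException {
	return MethodUtils.invokeStaticMethod(cls, name, args);
}

static Object invokeConstructor(Class<?> cls, Object... args)
		throws NoSuchMethodException, IllegalAccessException,
		InstantiationException, InvocationTargetException {
	return ConstructorUtils.invokeConstructor(cls, args);
}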
 
Example #3
Source File: OrcFileSystemITCase.java    From flink with Apache License 2.0
@Override
public void testNonPartition() {
	super.testNonPartition();

	// test configure success
	File directory = new File(URI.create(resultPath()).getPath());
	File[] files = directory.listFiles((dir, name) ->
			!name.startsWith(".") && !name.startsWith("_"));
	Assert.assertNotNull(files);
	Path path = new Path(URI.create(files[0].getAbsolutePath()));

	try {
		Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(new Configuration()));
		if (configure) {
			Assert.assertEquals("SNAPPY", reader.getCompressionKind().toString());
		} else {
			Assert.assertEquals("ZLIB", reader.getCompressionKind().toString());
		}
	} catch (IOException e) {
		throw new RuntimeException(e);
	}
}
 
Example #4
Source File: OrcRowInputFormatTest.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testSplitStripesGivenSplits() throws IOException {
	rowOrcInputFormat =
		new OrcRowInputFormat(getPath(TEST_FILE_FLAT), TEST_SCHEMA_FLAT, new Configuration());

	OrcRowInputFormat spy = spy(rowOrcInputFormat);

	// mock options to check configuration of ORC reader
	Reader.Options options = spy(new Reader.Options());
	doReturn(options).when(spy).getOptions(any());

	FileInputSplit[] splits = spy.createInputSplits(3);

	spy.openInputFormat();
	spy.open(splits[0]);
	verify(options).range(eq(3L), eq(137005L));
	spy.open(splits[1]);
	verify(options).range(eq(137008L), eq(136182L));
	spy.open(splits[2]);
	verify(options).range(eq(273190L), eq(123633L));
}
 
Example #5
Source File: PentahoOrcRecordReader.java    From pentaho-hadoop-shims with Apache License 2.0
static Reader getReader( String fileName, Configuration conf ) {

    try {
      S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( fileName, conf );
      Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( fileName ) );
      FileSystem fs = FileSystem.get( filePath.toUri(), conf );
      if ( !fs.exists( filePath ) ) {
        throw new NoSuchFileException( fileName );
      }
      if ( fs.getFileStatus( filePath ).isDirectory() ) {
        PathFilter pathFilter = file -> file.getName().endsWith( ".orc" );

        FileStatus[] fileStatuses = fs.listStatus( filePath, pathFilter );
        if ( fileStatuses.length == 0 ) {
          throw new NoSuchFileException( fileName );
        }
        filePath = fileStatuses[ 0 ].getPath();
      }
      return OrcFile.createReader( filePath,
        OrcFile.readerOptions( conf ).filesystem( fs ) );
    } catch ( IOException e ) {
      throw new IllegalArgumentException( "Unable to read data from file " + fileName, e );
    }
  }
 
Example #6
Source File: ORC.java    From iceberg with Apache License 2.0
public OrcIterator build() {
  Preconditions.checkNotNull(schema, "Schema is required");
  try {
    Path path = new Path(file.location());
    Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
    ColumnIdMap columnIds = new ColumnIdMap();
    TypeDescription orcSchema = TypeConversion.toOrc(schema, columnIds);
    Reader.Options options = reader.options();
    if (start != null) {
      options.range(start, length);
    }
    options.schema(orcSchema);
    return new OrcIterator(path, orcSchema, reader.rows(options));
  } catch (IOException e) {
    throw new RuntimeException("Can't open " + file.location(), e);
  }
}
 
Example #7
Source File: OrcBulkWriterTestUtil.java    From flink with Apache License 2.0
public static void validate(File files, List<Record> expected) throws IOException {
	final File[] buckets = files.listFiles();
	assertNotNull(buckets);
	assertEquals(1, buckets.length);

	final File[] partFiles = buckets[0].listFiles();
	assertNotNull(partFiles);

	for (File partFile : partFiles) {
		assertTrue(partFile.length() > 0);

		OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(new Configuration());
		Reader reader = OrcFile.createReader(new org.apache.hadoop.fs.Path(partFile.toURI()), readerOptions);

		assertEquals(3, reader.getNumberOfRows());
		assertEquals(2, reader.getSchema().getFieldNames().size());
		assertSame(reader.getCompressionKind(), CompressionKind.LZ4);
		assertTrue(reader.hasMetadataValue(USER_METADATA_KEY));
		assertTrue(reader.getMetadataKeys().contains(USER_METADATA_KEY));

		List<Record> results = getResults(reader);

		assertEquals(3, results.size());
		assertEquals(expected, results);
	}
}
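A hypothetical call site for the validator above (the directory and records are illustrative; Record comes from the test's own fixtures):

// Assumed usage: `bucketDir` is the output directory produced by the ORC bulk writer.
File bucketDir = new File("/tmp/orc-bulk-output");
List<Record> expected = Arrays.asList(
		new Record("alice", 30),
		new Record("bob", 31),
		new Record("carol", 32));
OrcBulkWriterTestUtil.validate(bucketDir, expected);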
 
Example #8
Source File: OrcCompactionTaskTest.java    From incubator-gobblin with Apache License 2.0
/**
 * Read an output compacted ORC file into memory.
 * This only works if the fields are int values.
 */
public List<OrcStruct> readOrcFile(Path orcFilePath)
    throws IOException, InterruptedException {
  ReaderImpl orcReader = new ReaderImpl(orcFilePath, new OrcFile.ReaderOptions(new Configuration()));

  Reader.Options options = new Reader.Options().schema(orcReader.getSchema());
  OrcMapreduceRecordReader recordReader = new OrcMapreduceRecordReader(orcReader, options);
  List<OrcStruct> result = new ArrayList<>();

  OrcStruct recordContainer;
  while (recordReader.nextKeyValue()) {
    recordContainer = (OrcStruct) OrcUtils.createValueRecursively(orcReader.getSchema());
    OrcUtils.upConvertOrcStruct((OrcStruct) recordReader.getCurrentValue(), recordContainer, orcReader.getSchema());
    result.add(recordContainer);
  }

  return result;
}
 
Example #9
Source File: TestMetricsRowGroupFilterTypes.java    From iceberg with Apache License 2.0
public void createOrcInputFile(List<Record> records) throws IOException {
  if (ORC_FILE.exists()) {
    Assert.assertTrue(ORC_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(ORC_FILE);
  try (FileAppender<Record> appender = ORC.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    appender.addAll(records);
  }

  InputFile inFile = Files.localInput(ORC_FILE);
  try (Reader reader = OrcFile.createReader(new Path(inFile.location()),
      OrcFile.readerOptions(new Configuration()))) {
    Assert.assertEquals("Should create only one stripe", 1, reader.getStripes().size());
  }

  ORC_FILE.deleteOnExit();
}
 
Example #10
Source File: OrcBulkWriterTestUtil.java    From flink with Apache License 2.0
private static List<Record> getResults(Reader reader) throws IOException {
	List<Record> results = new ArrayList<>();

	RecordReader recordReader = reader.rows();
	VectorizedRowBatch batch = reader.getSchema().createRowBatch();

	while (recordReader.nextBatch(batch)) {
		BytesColumnVector stringVector = (BytesColumnVector) batch.cols[0];
		LongColumnVector intVector = (LongColumnVector) batch.cols[1];
		for (int r = 0; r < batch.size; r++) {
			String name = new String(stringVector.vector[r], stringVector.start[r], stringVector.length[r]);
			int age = (int) intVector.vector[r];

			results.add(new Record(name, age));
		}
	}
	recordReader.close();

	return results;
}
 
Example #11
Source File: OrcRowInputFormatTest.java    From flink with Apache License 2.0
@Test
public void testSplitStripesGivenSplits() throws IOException {
	rowOrcInputFormat =
		new OrcRowInputFormat(getPath(TEST_FILE_FLAT), TEST_SCHEMA_FLAT, new Configuration());

	OrcRowInputFormat spy = spy(rowOrcInputFormat);

	// mock options to check configuration of ORC reader
	Reader.Options options = spy(new Reader.Options());
	doReturn(options).when(spy).getOptions(any());

	FileInputSplit[] splits = spy.createInputSplits(3);

	spy.openInputFormat();
	spy.open(splits[0]);
	verify(options).range(eq(3L), eq(137005L));
	spy.open(splits[1]);
	verify(options).range(eq(137008L), eq(136182L));
	spy.open(splits[2]);
	verify(options).range(eq(273190L), eq(123633L));
}
 
Example #12
Source File: OrcMetadataStat.java    From rainbow with Apache License 2.0
/**
 * Get the total uncompressed size of the ORC files.
 *
 * @return the total size in bytes, excluding each file's 3-byte "ORC" header
 */
@Override
public long getTotalSize()
{
    long size = 0;
    for (Reader reader : this.fileReaders)
    {
        // contentLength includes the header ('ORC') length which is 3 bytes.
        size += reader.getContentLength() - 3;
    }
    return size;
}
 
Example #13
Source File: ORC.java    From iceberg with Apache License 2.0
static Reader newFileReader(String location, ReaderOptions readerOptions) {
  try {
    return OrcFile.createReader(new Path(location), readerOptions);
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to open file: %s", location);
  }
}
 
Example #14
Source File: ORC.java    From iceberg with Apache License 2.0
static Reader newFileReader(InputFile file, Configuration config) {
  ReaderOptions readerOptions = OrcFile.readerOptions(config).useUTCTimestamp(true);
  if (file instanceof HadoopInputFile) {
    readerOptions.filesystem(((HadoopInputFile) file).getFileSystem());
  }
  return newFileReader(file.location(), readerOptions);
}
 
Example #15
Source File: TestMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
public void createOrcInputFile() throws IOException {
  if (orcFile.exists()) {
    Assert.assertTrue(orcFile.delete());
  }

  OutputFile outFile = Files.localOutput(orcFile);
  try (FileAppender<GenericRecord> appender = ORC.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    GenericRecord record = GenericRecord.create(FILE_SCHEMA);
    // create 50 records
    for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) {
      record.setField("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0
      record.setField("_no_stats_parquet", TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats
                                                                        // in Parquet, but will produce stats for ORC
      record.setField("_required", "req"); // required, always non-null
      record.setField("_all_nulls", null); // never non-null
      record.setField("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
      record.setField("_no_nulls", ""); // optional, but always non-null
      record.setField("_str", i + "str" + i);

      GenericRecord structNotNull = GenericRecord.create(_structFieldType);
      structNotNull.setField("_int_field", INT_MIN_VALUE + i);
      record.setField("_struct_not_null", structNotNull); // struct with int

      appender.add(record);
    }
  }

  InputFile inFile = Files.localInput(orcFile);
  try (Reader reader = OrcFile.createReader(new Path(inFile.location()),
      OrcFile.readerOptions(new Configuration()))) {
    Assert.assertEquals("Should create only one stripe", 1, reader.getStripes().size());
  }

  orcFile.deleteOnExit();
}
 
Example #16
Source File: OrcRowInputFormatTest.java    From flink with Apache License 2.0
@Test
public void testDecimalPredicate() throws Exception {
	rowOrcInputFormat =
		new OrcRowInputFormat(getPath(TEST_FILE_DECIMAL), TEST_SCHEMA_DECIMAL, new Configuration());

	rowOrcInputFormat.addPredicate(
		new OrcRowInputFormat.Not(
			// decimal pred
			new OrcRowInputFormat.Equals("_col0", PredicateLeaf.Type.DECIMAL, BigDecimal.valueOf(-1000.5))));

	FileInputSplit[] splits = rowOrcInputFormat.createInputSplits(1);
	rowOrcInputFormat.openInputFormat();

	// mock options to check configuration of ORC reader
	OrcRowInputFormat spy = spy(rowOrcInputFormat);
	Reader.Options options = new Reader.Options();
	doReturn(options).when(spy).getOptions(any());

	spy.openInputFormat();
	spy.open(splits[0]);

	// verify predicate configuration
	SearchArgument sarg = options.getSearchArgument();
	assertNotNull(sarg);
	assertEquals("(not leaf-0)", sarg.getExpression().toString());
	assertEquals(1, sarg.getLeaves().size());
	List<PredicateLeaf> leaves = sarg.getLeaves();
	assertEquals("(EQUALS _col0 -1000.5)", leaves.get(0).toString());
}
 
Example #17
Source File: TestOrcMetadata.java    From rainbow with Apache License 2.0
@Test
public void test () throws IOException, Descriptors.DescriptorValidationException
{
    Configuration conf = new Configuration();
    System.setProperty("hadoop.home.dir", "/");
    FileSystem fileSystem = FileSystem.get(URI.create("hdfs://presto00:9000"), conf);
    Path hdfsDirPath = new Path("/rainbow2/orc_new_compress");
    System.out.println(fileSystem.isFile(hdfsDirPath));
    FileStatus[] fileStatuses = fileSystem.listStatus(hdfsDirPath);
    System.out.println(fileStatuses.length);
    for (FileStatus status : fileStatuses)
    {
        status.getPath();
        System.out.println(status.getPath() + ", " + status.getLen());
    }

    Reader reader = OrcFile.createReader(fileStatuses[0].getPath(),
            OrcFile.readerOptions(conf));
    System.out.println("file length:" + reader.getFileTail().getFileLength());
    List<String> columnNames = new ArrayList<>();
    columnNames.add("samplepercent");
    System.out.println(reader.getRawDataSizeOfColumns(columnNames));
    System.out.println(reader.getFileTail().getFooter().getTypes(0).getFieldNames(0));
    System.out.println(reader.getTypes().get(0).getSerializedSize());

    List<Reader> readers = new ArrayList<>();
    for (FileStatus fileStatus : fileStatuses)
    {
        Reader reader1 = OrcFile.createReader(fileStatus.getPath(),
                OrcFile.readerOptions(conf));
        readers.add(reader1);
        System.out.println("content size: " + reader1.getContentLength() + ", raw size: "
        + reader1.getRawDataSize());
    }

    for (String columnName : reader.getSchema().getFieldNames())
    {
        System.out.println(columnName);
    }
}
 
Example #18
Source File: OrcNoHiveShim.java    From flink with Apache License 2.0
@Override
public RecordReader createRecordReader(
		Configuration conf,
		TypeDescription schema,
		int[] selectedFields,
		List<OrcSplitReader.Predicate> conjunctPredicates,
		org.apache.flink.core.fs.Path path,
		long splitStart,
		long splitLength) throws IOException {
	// open ORC file and create reader
	org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(path.toUri());

	Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));

	// get offset and length for the stripes that start in the split
	Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(
			splitStart, splitLength, orcReader.getStripes());

	// create ORC row reader configuration
	Reader.Options options = new Reader.Options()
			.schema(schema)
			.range(offsetAndLength.f0, offsetAndLength.f1)
			.useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
			.skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
			.tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

	// TODO configure filters

	// configure selected fields
	options.include(computeProjectionMask(schema, selectedFields));

	// create ORC row reader
	RecordReader orcRowsReader = orcReader.rows(options);

	// ensure column ids are assigned (TypeDescription assigns them lazily)
	schema.getId();

	return orcRowsReader;
}
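The computeProjectionMask helper is referenced above but not shown. One plausible implementation, assuming the mask is a boolean array indexed by ORC column id in which the root struct stays false and each selected top-level field is marked along with all of its nested columns (this matches the include arrays verified in Examples #23 and #28):

// Plausible sketch; the actual Flink implementation may differ.
private static boolean[] computeProjectionMask(TypeDescription schema, int[] selectedFields) {
	// One flag per column id; the root struct (id 0) stays false.
	boolean[] include = new boolean[schema.getMaximumId() + 1];
	List<TypeDescription> fields = schema.getChildren();
	for (int field : selectedFields) {
		TypeDescription type = fields.get(field);
		for (int id = type.getId(); id <= type.getMaximumId(); id++) {
			include[id] = true; // the field and all of its nested columns
		}
	}
	return include;
}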
 
Example #19
Source File: OrcShimV200.java    From flink with Apache License 2.0
protected RecordReader createRecordReader(Reader reader, Reader.Options options) throws IOException {
	try {
		return (RecordReader) invokeExactMethod(reader, "rowsOptions", options);
	} catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) {
		throw new IOException(e);
	}
}
 
Example #20
Source File: OrcFileAppender.java    From iceberg with Apache License 2.0
@Override
public List<Long> splitOffsets() {
  Preconditions.checkState(isClosed, "File is not yet closed");
  try (Reader reader = ORC.newFileReader(file.toInputFile(), conf)) {
    List<StripeInformation> stripes = reader.getStripes();
    return Collections.unmodifiableList(Lists.transform(stripes, StripeInformation::getOffset));
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Can't close ORC reader %s", file.location());
  }
}
 
Example #21
Source File: OrcMetrics.java    From iceberg with Apache License 2.0
static Metrics fromInputFile(InputFile file, Configuration config) {
  try (Reader orcReader = ORC.newFileReader(file, config)) {
    return buildOrcMetrics(orcReader.getNumberOfRows(), orcReader.getSchema(), orcReader.getStatistics());
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to open file: %s", file.location());
  }
}
 
Example #22
Source File: OrcRowInputFormatTest.java    From flink with Apache License 2.0
@Test
public void testTimePredicates() throws Exception {
	rowOrcInputFormat =
		new OrcRowInputFormat(getPath(TEST_FILE_TIMETYPES), TEST_SCHEMA_TIMETYPES, new Configuration());

	rowOrcInputFormat.addPredicate(
		// OR
		new OrcRowInputFormat.Or(
			// timestamp pred
			new OrcRowInputFormat.Equals("time", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("1900-05-05 12:34:56.100")),
			// date pred
			new OrcRowInputFormat.Equals("date", PredicateLeaf.Type.DATE, Date.valueOf("1900-12-25")))
		);

	FileInputSplit[] splits = rowOrcInputFormat.createInputSplits(1);
	rowOrcInputFormat.openInputFormat();

	// mock options to check configuration of ORC reader
	OrcRowInputFormat spy = spy(rowOrcInputFormat);
	Reader.Options options = new Reader.Options();
	doReturn(options).when(spy).getOptions(any());

	spy.openInputFormat();
	spy.open(splits[0]);

	// verify predicate configuration
	SearchArgument sarg = options.getSearchArgument();
	assertNotNull(sarg);
	assertEquals("(or leaf-0 leaf-1)", sarg.getExpression().toString());
	assertEquals(2, sarg.getLeaves().size());
	List<PredicateLeaf> leaves = sarg.getLeaves();
	assertEquals("(EQUALS time 1900-05-05 12:34:56.1)", leaves.get(0).toString());
	assertEquals("(EQUALS date 1900-12-25)", leaves.get(1).toString());
}
 
Example #23
Source File: OrcRowInputFormatTest.java    From flink with Apache License 2.0
@Test
public void testProjectionMaskNested() throws IOException {
	rowOrcInputFormat =
		new OrcRowInputFormat(getPath(TEST_FILE_NESTED), TEST_SCHEMA_NESTED, new Configuration());

	OrcRowInputFormat spy = spy(rowOrcInputFormat);

	// mock options to check configuration of ORC reader
	Reader.Options options = new Reader.Options();
	doReturn(options).when(spy).getOptions(any());

	spy.selectFields(9, 11, 2);
	spy.openInputFormat();
	FileInputSplit[] splits = spy.createInputSplits(1);
	spy.open(splits[0]);

	// top-level struct is false
	boolean[] expected = new boolean[]{
		false, // top level
		false, false, // flat fields 0, 1 are out
		true, // flat field 2 is in
		false, false, false, false, false, false, // flat fields 3, 4, 5, 6, 7, 8 are out
		true, true, true, true, true, // nested field 9 is in
		false, false, false, false, // nested field 10 is out
		true, true, true, true, true}; // nested field 11 is in
	assertArrayEquals(expected, options.getInclude());
}
 
Example #24
Source File: JsonORCFileReaderWriterFactory.java    From secor with Apache License 2.0
@SuppressWarnings("deprecation")
public JsonORCFileReader(LogFilePath logFilePath, CompressionCodec codec)
        throws IOException {
    schema = schemaProvider.getSchema(logFilePath.getTopic(),
            logFilePath);
    Path path = new Path(logFilePath.getLogFilePath());
    Reader reader = OrcFile.createReader(path,
            OrcFile.readerOptions(new Configuration(true)));
    offset = logFilePath.getOffset();
    rows = reader.rows();
    batch = reader.getSchema().createRowBatch();
    rows.nextBatch(batch);
}
 
Example #25
Source File: OrcRowInputFormatTest.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testDecimalPredicate() throws Exception {
	rowOrcInputFormat =
		new OrcRowInputFormat(getPath(TEST_FILE_DECIMAL), TEST_SCHEMA_DECIMAL, new Configuration());

	rowOrcInputFormat.addPredicate(
		new OrcRowInputFormat.Not(
			// decimal pred
			new OrcRowInputFormat.Equals("_col0", PredicateLeaf.Type.DECIMAL, BigDecimal.valueOf(-1000.5))));

	FileInputSplit[] splits = rowOrcInputFormat.createInputSplits(1);
	rowOrcInputFormat.openInputFormat();

	// mock options to check configuration of ORC reader
	OrcRowInputFormat spy = spy(rowOrcInputFormat);
	Reader.Options options = new Reader.Options();
	doReturn(options).when(spy).getOptions(any());

	spy.openInputFormat();
	spy.open(splits[0]);

	// verify predicate configuration
	SearchArgument sarg = options.getSearchArgument();
	assertNotNull(sarg);
	assertEquals("(not leaf-0)", sarg.getExpression().toString());
	assertEquals(1, sarg.getLeaves().size());
	List<PredicateLeaf> leaves = sarg.getLeaves();
	assertEquals("(EQUALS _col0 -1000.5)", leaves.get(0).toString());
}
 
Example #26
Source File: OrcRowInputFormatTest.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testTimePredicates() throws Exception {
	rowOrcInputFormat =
		new OrcRowInputFormat(getPath(TEST_FILE_TIMETYPES), TEST_SCHEMA_TIMETYPES, new Configuration());

	rowOrcInputFormat.addPredicate(
		// OR
		new OrcRowInputFormat.Or(
			// timestamp pred
			new OrcRowInputFormat.Equals("time", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("1900-05-05 12:34:56.100")),
			// date pred
			new OrcRowInputFormat.Equals("date", PredicateLeaf.Type.DATE, Date.valueOf("1900-12-25")))
		);

	FileInputSplit[] splits = rowOrcInputFormat.createInputSplits(1);
	rowOrcInputFormat.openInputFormat();

	// mock options to check configuration of ORC reader
	OrcRowInputFormat spy = spy(rowOrcInputFormat);
	Reader.Options options = new Reader.Options();
	doReturn(options).when(spy).getOptions(any());

	spy.openInputFormat();
	spy.open(splits[0]);

	// verify predicate configuration
	SearchArgument sarg = options.getSearchArgument();
	assertNotNull(sarg);
	assertEquals("(or leaf-0 leaf-1)", sarg.getExpression().toString());
	assertEquals(2, sarg.getLeaves().size());
	List<PredicateLeaf> leaves = sarg.getLeaves();
	assertEquals("(EQUALS time 1900-05-05 12:34:56.1)", leaves.get(0).toString());
	assertEquals("(EQUALS date 1900-12-25)", leaves.get(1).toString());
}
 
Example #27
Source File: PentahoOrcInputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
private List<IOrcInputField> readSchema( Reader orcReader ) {
  OrcSchemaConverter orcSchemaConverter = new OrcSchemaConverter();
  List<IOrcInputField> orcInputFields = orcSchemaConverter.buildInputFields( readTypeDescription( orcReader ) );
  IOrcMetaData.Reader orcMetaDataReader = new OrcMetaDataReader( orcReader );
  orcMetaDataReader.read( orcInputFields );
  return orcInputFields;
}
 
Example #28
Source File: OrcRowInputFormatTest.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testProjectionMaskNested() throws IOException {
	rowOrcInputFormat =
		new OrcRowInputFormat(getPath(TEST_FILE_NESTED), TEST_SCHEMA_NESTED, new Configuration());

	OrcRowInputFormat spy = spy(rowOrcInputFormat);

	// mock options to check configuration of ORC reader
	Reader.Options options = new Reader.Options();
	doReturn(options).when(spy).getOptions(any());

	spy.selectFields(9, 11, 2);
	spy.openInputFormat();
	FileInputSplit[] splits = spy.createInputSplits(1);
	spy.open(splits[0]);

	// top-level struct is false
	boolean[] expected = new boolean[]{
		false, // top level
		false, false, // flat fields 0, 1 are out
		true, // flat field 2 is in
		false, false, false, false, false, false, // flat fields 3, 4, 5, 6, 7, 8 are out
		true, true, true, true, true, // nested field 9 is in
		false, false, false, false, // nested field 10 is out
		true, true, true, true, true}; // nested field 11 is in
	assertArrayEquals(expected, options.getInclude());
}
 
Example #29
Source File: OrcMetaDataReader.java    From pentaho-hadoop-shims with Apache License 2.0
public OrcMetaDataReader( Reader reader ) {
  this.reader = reader;
}
 
Example #30
Source File: OrcUtils.java    From incubator-gobblin with Apache License 2.0
public static Reader getRecordReaderFromFile(Configuration conf, Path orcFilePath)
    throws IOException {
  return OrcFile.createReader(orcFilePath, new OrcFile.ReaderOptions(conf));
}
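A hedged usage sketch for the helper above (the path is illustrative; note that despite its name, the method returns a file-level Reader rather than a RecordReader):

// Assumed usage: open the file-level reader, then iterate rows in vectorized batches.
Reader reader = OrcUtils.getRecordReaderFromFile(new Configuration(), new Path("/tmp/data.orc"));
RecordReader rows = reader.rows();
VectorizedRowBatch batch = reader.getSchema().createRowBatch();
while (rows.nextBatch(batch)) {
  // process batch.size rows here
}
rows.close();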