org.apache.orc.OrcFile Java Examples

The following examples show how to use org.apache.orc.OrcFile. Each example is taken from an open-source project; the source file and project are noted above it.
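
Before the project-specific examples, here is a minimal, self-contained sketch of the basic write-then-read cycle with OrcFile. It is not taken from any of the projects below: the class name OrcFileSketch, the /tmp/example.orc path, and the single-column struct<x:bigint> schema are illustrative assumptions, and the snippet presumes orc-core and the Hadoop client libraries are on the classpath.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class OrcFileSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/example.orc");                                 // illustrative path; assumes the file does not already exist
    TypeDescription schema = TypeDescription.fromString("struct<x:bigint>");  // illustrative schema

    // Write a few rows through a VectorizedRowBatch.
    Writer writer = OrcFile.createWriter(path,
        OrcFile.writerOptions(conf).setSchema(schema));
    VectorizedRowBatch batch = schema.createRowBatch();
    LongColumnVector x = (LongColumnVector) batch.cols[0];
    for (long i = 0; i < 10; i++) {
      x.vector[batch.size++] = i;
      if (batch.size == batch.getMaxSize()) {   // flush full batches
        writer.addRowBatch(batch);
        batch.reset();
      }
    }
    if (batch.size != 0) {                      // flush the final partial batch
      writer.addRowBatch(batch);
      batch.reset();
    }
    writer.close();

    // Read the rows back.
    Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
    RecordReader rows = reader.rows();
    VectorizedRowBatch readBatch = reader.getSchema().createRowBatch();
    while (rows.nextBatch(readBatch)) {
      LongColumnVector col = (LongColumnVector) readBatch.cols[0];
      for (int r = 0; r < readBatch.size; r++) {
        System.out.println(col.vector[r]);
      }
    }
    rows.close();
  }
}
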
Example #1
Source File: PentahoOrcRecordWriter.java    From pentaho-hadoop-shims with Apache License 2.0
public PentahoOrcRecordWriter( List<? extends IOrcOutputField> fields, TypeDescription schema, String filePath,
                               Configuration conf ) {
  this.fields = fields;
  this.schema = schema;
  final AtomicInteger fieldNumber = new AtomicInteger();  //Mutable field count
  fields.forEach( field -> setOutputMeta( fieldNumber, field ) );
  outputRowMetaAndData = new RowMetaAndData( outputRowMeta, new Object[ fieldNumber.get() ] );

  try {
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( filePath, conf );
    Path outputFile = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( filePath ) );
    writer = OrcFile.createWriter( outputFile,
      OrcFile.writerOptions( conf )
        .setSchema( schema ) );
    batch = schema.createRowBatch();
  } catch ( IOException e ) {
    logger.error( e );
  }

  // Write the additional metadata for the fields
  // new OrcMetaDataWriter( writer ).write( fields );
}
 
Example #2
Source File: PhysicalWriterImpl.java    From flink with Apache License 2.0
public PhysicalWriterImpl(FSDataOutputStream out, OrcFile.WriterOptions opts) throws IOException {
	if (opts.isEnforceBufferSize()) {
		this.bufferSize = opts.getBufferSize();
	} else {
		this.bufferSize = getEstimatedBufferSize(
			opts.getStripeSize(), opts.getSchema().getMaximumId() + 1, opts.getBufferSize());
	}

	this.out = out;
	this.blockOffset = 0;
	this.blockSize = opts.getBlockSize();
	this.maxPadding = (int) (opts.getPaddingTolerance() * (double) opts.getBufferSize());
	this.compress = opts.getCompress();
	this.codec = OrcCodecPool.getCodec(this.compress);
	this.streams  = new TreeMap<>();
	this.writer = new OutStream("metadata", this.bufferSize, this.codec, new DirectStream(this.out));
	this.shims = opts.getHadoopShims();
	this.addBlockPadding = opts.getBlockPadding();
	this.protobufWriter = CodedOutputStream.newInstance(this.writer);
	this.writeVariableLengthBlocks = opts.getWriteVariableLengthBlocks();
}
 
Example #3
Source File: OrcFileSystemITCase.java    From flink with Apache License 2.0
@Override
public void testNonPartition() {
	super.testNonPartition();

	// verify that the configured compression was applied
	File directory = new File(URI.create(resultPath()).getPath());
	File[] files = directory.listFiles((dir, name) ->
			!name.startsWith(".") && !name.startsWith("_"));
	Assert.assertNotNull(files);
	Path path = new Path(URI.create(files[0].getAbsolutePath()));

	try {
		Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(new Configuration()));
		if (configure) {
			Assert.assertEquals("SNAPPY", reader.getCompressionKind().toString());
		} else {
			Assert.assertEquals("ZLIB", reader.getCompressionKind().toString());
		}
	} catch (IOException e) {
		throw new RuntimeException(e);
	}
}
 
Example #4
Source File: OrcBulkWriterTestUtil.java    From flink with Apache License 2.0
public static void validate(File files, List<Record> expected) throws IOException {
	final File[] buckets = files.listFiles();
	assertNotNull(buckets);
	assertEquals(1, buckets.length);

	final File[] partFiles = buckets[0].listFiles();
	assertNotNull(partFiles);

	for (File partFile : partFiles) {
		assertTrue(partFile.length() > 0);

		OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(new Configuration());
		Reader reader = OrcFile.createReader(new org.apache.hadoop.fs.Path(partFile.toURI()), readerOptions);

		assertEquals(3, reader.getNumberOfRows());
		assertEquals(2, reader.getSchema().getFieldNames().size());
		assertSame(CompressionKind.LZ4, reader.getCompressionKind());
		assertTrue(reader.hasMetadataValue(USER_METADATA_KEY));
		assertTrue(reader.getMetadataKeys().contains(USER_METADATA_KEY));

		List<Record> results = getResults(reader);

		assertEquals(3, results.size());
		assertEquals(expected, results);
	}
}
 
Example #5
Source File: ORC.java    From iceberg with Apache License 2.0
public OrcIterator build() {
  Preconditions.checkNotNull(schema, "Schema is required");
  try {
    Path path = new Path(file.location());
    Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
    ColumnIdMap columnIds = new ColumnIdMap();
    TypeDescription orcSchema = TypeConversion.toOrc(schema, columnIds);
    Reader.Options options = reader.options();
    if (start != null) {
      options.range(start, length);
    }
    options.schema(orcSchema);
    return new OrcIterator(path, orcSchema, reader.rows(options));
  } catch (IOException e) {
    throw new RuntimeException("Can't open " + file.location(), e);
  }
}
 
Example #6
Source File: OrcFileAppender.java    From iceberg with Apache License 2.0
OrcFileAppender(Schema schema,
                OutputFile file,
                OrcFile.WriterOptions options,
                Map<String,byte[]> metadata) {
  orcSchema = TypeConversion.toOrc(schema, columnIds);
  options.setSchema(orcSchema);
  path = new Path(file.location());
  try {
    writer = OrcFile.createWriter(path, options);
  } catch (IOException e) {
    throw new RuntimeException("Can't create file " + path, e);
  }
  writer.addUserMetadata(COLUMN_NUMBERS_ATTRIBUTE, columnIds.serialize());
  metadata.forEach(
      (key,value) -> writer.addUserMetadata(key, ByteBuffer.wrap(value)));
}
 
Example #7
Source File: OrcCompactionTaskTest.java    From incubator-gobblin with Apache License 2.0
/**
 * Read an output ORC compacted file into memory.
 * This only works if all fields are int values.
 */
public List<OrcStruct> readOrcFile(Path orcFilePath)
    throws IOException, InterruptedException {
  ReaderImpl orcReader = new ReaderImpl(orcFilePath, new OrcFile.ReaderOptions(new Configuration()));

  Reader.Options options = new Reader.Options().schema(orcReader.getSchema());
  OrcMapreduceRecordReader recordReader = new OrcMapreduceRecordReader(orcReader, options);
  List<OrcStruct> result = new ArrayList<>();

  OrcStruct recordContainer;
  while (recordReader.nextKeyValue()) {
    recordContainer = (OrcStruct) OrcUtils.createValueRecursively(orcReader.getSchema());
    OrcUtils.upConvertOrcStruct((OrcStruct) recordReader.getCurrentValue(), recordContainer, orcReader.getSchema());
    result.add(recordContainer);
  }

  return result;
}
 
Example #8
Source File: TestMetricsRowGroupFilterTypes.java    From iceberg with Apache License 2.0
public void createOrcInputFile(List<Record> records) throws IOException {
  if (ORC_FILE.exists()) {
    Assert.assertTrue(ORC_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(ORC_FILE);
  try (FileAppender<Record> appender = ORC.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    appender.addAll(records);
  }

  InputFile inFile = Files.localInput(ORC_FILE);
  try (Reader reader = OrcFile.createReader(new Path(inFile.location()),
      OrcFile.readerOptions(new Configuration()))) {
    Assert.assertEquals("Should create only one stripe", 1, reader.getStripes().size());
  }

  ORC_FILE.deleteOnExit();
}
 
Example #9
Source File: JsonORCFileReaderWriterFactory.java    From secor with Apache License 2.0
public JsonORCFileWriter(LogFilePath logFilePath, CompressionCodec codec)
        throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path(logFilePath.getLogFilePath());
    schema = schemaProvider.getSchema(logFilePath.getTopic(),
            logFilePath);
    if (schema == null) {
        String topic = logFilePath.getTopic();
        throw new IllegalArgumentException(
            String.format("No schema is provided for topic '%s'", topic));
    }
    List<TypeDescription> fieldTypes = schema.getChildren();
    converters = new JsonConverter[fieldTypes.size()];
    for (int c = 0; c < converters.length; ++c) {
        converters[c] = VectorColumnFiller.createConverter(fieldTypes
                .get(c));
    }

    writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf)
            .compress(resolveCompression(codec)).setSchema(schema));
    batch = schema.createRowBatch();
}
 
Example #10
Source File: PentahoOrcRecordReader.java    From pentaho-hadoop-shims with Apache License 2.0
static Reader getReader( String fileName, Configuration conf ) {
  try {
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( fileName, conf );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( fileName ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), conf );
    if ( !fs.exists( filePath ) ) {
      throw new NoSuchFileException( fileName );
    }
    if ( fs.getFileStatus( filePath ).isDirectory() ) {
      PathFilter pathFilter = file -> file.getName().endsWith( ".orc" );

      FileStatus[] fileStatuses = fs.listStatus( filePath, pathFilter );
      if ( fileStatuses.length == 0 ) {
        throw new NoSuchFileException( fileName );
      }
      filePath = fileStatuses[ 0 ].getPath();
    }
    return OrcFile.createReader( filePath,
      OrcFile.readerOptions( conf ).filesystem( fs ) );
  } catch ( IOException e ) {
    throw new IllegalArgumentException( "Unable to read data from file " + fileName, e );
  }
}
 
Example #11
Source File: OrcFileAppender.java    From iceberg with Apache License 2.0
OrcFileAppender(Schema schema, OutputFile file,
                Function<TypeDescription, OrcValueWriter<?>> createWriterFunc,
                Configuration conf, Map<String, byte[]> metadata,
                int batchSize) {
  this.conf = conf;
  this.file = file;
  this.batchSize = batchSize;
  this.schema = schema;

  TypeDescription orcSchema = ORCSchemaUtil.convert(this.schema);
  this.batch = orcSchema.createRowBatch(this.batchSize);

  OrcFile.WriterOptions options = OrcFile.writerOptions(conf).useUTCTimestamp(true);
  if (file instanceof HadoopOutputFile) {
    options.fileSystem(((HadoopOutputFile) file).getFileSystem());
  }
  options.setSchema(orcSchema);
  this.writer = newOrcWriter(file, options, metadata);
  this.valueWriter = newOrcValueWriter(orcSchema, createWriterFunc);
}
 
Example #12
Source File: OrcCompactionTaskTest.java    From incubator-gobblin with Apache License 2.0
public void writeOrcRecordsInFile(Path path, TypeDescription schema, List<OrcStruct> orcStructs) throws Exception {
  Configuration configuration = new Configuration();
  OrcFile.WriterOptions options = OrcFile.writerOptions(configuration).setSchema(schema);

  Writer writer = OrcFile.createWriter(path, options);
  OrcMapreduceRecordWriter recordWriter = new OrcMapreduceRecordWriter(writer);
  for (OrcStruct orcRecord : orcStructs) {
    recordWriter.write(NullWritable.get(), orcRecord);
  }
  recordWriter.close(new TaskAttemptContextImpl(configuration, new TaskAttemptID()));
}
 
Example #13
Source File: JsonORCFileReaderWriterFactory.java    From secor with Apache License 2.0
@SuppressWarnings("deprecation")
public JsonORCFileReader(LogFilePath logFilePath, CompressionCodec codec)
        throws IOException {
    schema = schemaProvider.getSchema(logFilePath.getTopic(),
            logFilePath);
    Path path = new Path(logFilePath.getLogFilePath());
    Reader reader = OrcFile.createReader(path,
            OrcFile.readerOptions(new Configuration(true)));
    offset = logFilePath.getOffset();
    rows = reader.rows();
    batch = reader.getSchema().createRowBatch();
    rows.nextBatch(batch);
}
 
Example #14
Source File: OrcKeyCompactorOutputFormat.java    From incubator-gobblin with Apache License 2.0
/**
 * Overridden because the super method hard-codes the file extension as ".orc". To keep the
 * extension name flexible, it is driven by configuration.
 * @param taskAttemptContext The source of configuration that determines the file extension
 * @return The {@link RecordWriter} that writes out ORC objects.
 * @throws IOException
 */
@Override
public RecordWriter getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException {
  Configuration conf = taskAttemptContext.getConfiguration();
  String extension = "." + conf.get(COMPACTION_OUTPUT_EXTENSION, "orc" );

  Path filename = getDefaultWorkFile(taskAttemptContext, extension);
  Writer writer = OrcFile.createWriter(filename,
      org.apache.orc.mapred.OrcOutputFormat.buildOptions(conf));
  return new OrcMapreduceRecordWriter(writer);
}
 
Example #15
Source File: OrcNoHiveShim.java    From flink with Apache License 2.0
@Override
public RecordReader createRecordReader(
		Configuration conf,
		TypeDescription schema,
		int[] selectedFields,
		List<OrcSplitReader.Predicate> conjunctPredicates,
		org.apache.flink.core.fs.Path path,
		long splitStart,
		long splitLength) throws IOException {
	// open ORC file and create reader
	org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(path.toUri());

	Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));

	// get offset and length for the stripes that start in the split
	Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(
			splitStart, splitLength, orcReader.getStripes());

	// create ORC row reader configuration
	Reader.Options options = new Reader.Options()
			.schema(schema)
			.range(offsetAndLength.f0, offsetAndLength.f1)
			.useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
			.skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
			.tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

	// TODO configure filters

	// configure selected fields
	options.include(computeProjectionMask(schema, selectedFields));

	// create ORC row reader
	RecordReader orcRowsReader = orcReader.rows(options);

	// assign ids
	schema.getId();

	return orcRowsReader;
}
 
Example #16
Source File: OrcBulkWriterFactory.java    From flink with Apache License 2.0
private OrcFile.WriterOptions getWriterOptions() {
	if (null == writerOptions) {
		Configuration conf = new Configuration();
		for (Map.Entry<String, String> entry : confMap.entrySet()) {
			conf.set(entry.getKey(), entry.getValue());
		}

		writerOptions = OrcFile.writerOptions(writerProperties, conf);
		writerOptions.setSchema(this.vectorizer.getSchema());
	}

	return writerOptions;
}
 
Example #17
Source File: OrcBulkWriterFactory.java    From flink with Apache License 2.0
@Override
public BulkWriter<T> create(FSDataOutputStream out) throws IOException {
	OrcFile.WriterOptions opts = getWriterOptions();
	opts.physicalWriter(new PhysicalWriterImpl(out, opts));

	return new OrcBulkWriter<>(vectorizer, new WriterImpl(null, FIXED_PATH, opts));
}
 
Example #18
Source File: OrcFileAppender.java    From iceberg with Apache License 2.0
private static Writer newOrcWriter(OutputFile file,
                                   OrcFile.WriterOptions options, Map<String, byte[]> metadata) {
  final Path locPath = new Path(file.location());
  final Writer writer;

  try {
    writer = OrcFile.createWriter(locPath, options);
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Can't create file " + locPath);
  }

  metadata.forEach((key, value) -> writer.addUserMetadata(key, ByteBuffer.wrap(value)));

  return writer;
}
 
Example #19
Source File: AvroToOrcRecordConverter.java    From datacollector with Apache License 2.0
public static Writer createOrcWriter(Properties orcWriterProperties, Configuration configuration, Path orcOutputFile, TypeDescription orcSchema) throws IOException {
  if (LOG.isDebugEnabled()) {
    LOG.debug("Creating ORC writer at: {}", orcOutputFile.toString());
  }
  return OrcFile.createWriter(
      orcOutputFile,
      OrcFile.writerOptions(orcWriterProperties, configuration).setSchema(orcSchema)
  );
}
 
Example #20
Source File: OrcToSdcRecordConverter.java    From datacollector with Apache License 2.0
public OrcToSdcRecordConverter(Path orcFilePath) throws IOException {
  final Configuration readerConf = new Configuration();
  final OrcFile.ReaderOptions fileReaderOptions = OrcFile.readerOptions(readerConf);
  this.orcFilePath = orcFilePath;
  reader = OrcFile.createReader(this.orcFilePath, fileReaderOptions);

  // TODO: support various parameters to Reader.Options via options passed into constructor?
  // final Reader.Options rowReaderOptions = new Reader.Options();

  // for now, just use default options
  rows = reader.rows();
  readerBatch = reader.getSchema().createRowBatch();
}
 
Example #21
Source File: SqlInterpreterTest.java    From zeppelin with Apache License 2.0
public File createORCFile(int[] values) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".orc");
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();
  conf.set("orc.compress", "snappy");
  TypeDescription schema = TypeDescription.fromString("struct<msg:int>");
  Writer writer = OrcFile.createWriter(path,
          OrcFile.writerOptions(conf)
                  .setSchema(schema));
  VectorizedRowBatch batch = schema.createRowBatch();
  LongColumnVector x = (LongColumnVector) batch.cols[0];
  for (int i = 0; i < values.length; ++i) {
    int row = batch.size++;
    x.vector[row] = values[i];
    // If the batch is full, write it out and start over.
    if (batch.size == batch.getMaxSize()) {
      writer.addRowBatch(batch);
      batch.reset();
    }
  }
  if (batch.size != 0) {
    writer.addRowBatch(batch);
    batch.reset();
  }
  writer.close();
  return file;
}
 
Example #22
Source File: OrcWriter.java    From osm2orc with ISC License
@Override
public void initialize(Map<String, Object> metaData) {
    try {
        Configuration conf = new Configuration();
        // conf.set(OrcConf.BLOOM_FILTER_COLUMNS.getAttribute(), "tags");
        processor = new OrcEntityProcessor(OrcFile.createWriter(new Path(filename),
                OrcFile.writerOptions(conf).setSchema(SCHEMA)), SCHEMA.createRowBatch());
    } catch (IOException e) {
        throw new OsmosisRuntimeException(e);
    }
}
 
Example #23
Source File: DremioORCRecordUtils.java    From dremio-oss with Apache License 2.0
static boolean hadBadBloomFilters(TypeDescription.Category category,
                                  OrcFile.WriterVersion version) {
  switch(category) {
    case STRING:
    case CHAR:
    case VARCHAR:
      return !version.includes(OrcFile.WriterVersion.HIVE_12055);
    case DECIMAL:
      return true;
    case TIMESTAMP:
      return !version.includes(OrcFile.WriterVersion.ORC_135);
    default:
      return false;
  }
}
 
Example #24
Source File: OrcNoHiveBulkWriterFactory.java    From flink with Apache License 2.0
@Override
public BulkWriter<RowData> create(FSDataOutputStream out) throws IOException {
	OrcFile.WriterOptions opts = OrcFile.writerOptions(new Properties(), conf);
	TypeDescription description = TypeDescription.fromString(schema);
	opts.setSchema(description);
	opts.physicalWriter(new PhysicalWriterImpl(out, opts));
	WriterImpl writer = new WriterImpl(null, new Path("."), opts);

	VectorizedRowBatch rowBatch = description.createRowBatch();
	return new BulkWriter<RowData>() {
		@Override
		public void addElement(RowData row) throws IOException {
			int rowId = rowBatch.size++;
			for (int i = 0; i < row.getArity(); ++i) {
				setColumn(rowId, rowBatch.cols[i], fieldTypes[i], row, i);
			}
			if (rowBatch.size == rowBatch.getMaxSize()) {
				writer.addRowBatch(rowBatch);
				rowBatch.reset();
			}
		}

		@Override
		public void flush() throws IOException {
			if (rowBatch.size != 0) {
				writer.addRowBatch(rowBatch);
				rowBatch.reset();
			}
		}

		@Override
		public void finish() throws IOException {
			flush();
			writer.close();
		}
	};
}
 
Example #25
Source File: ORC.java    From iceberg with Apache License 2.0
static Reader newFileReader(String location, ReaderOptions readerOptions) {
  try {
    return OrcFile.createReader(new Path(location), readerOptions);
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to open file: %s", location);
  }
}
 
Example #26
Source File: ORC.java    From iceberg with Apache License 2.0
static Reader newFileReader(InputFile file, Configuration config) {
  ReaderOptions readerOptions = OrcFile.readerOptions(config).useUTCTimestamp(true);
  if (file instanceof HadoopInputFile) {
    readerOptions.filesystem(((HadoopInputFile) file).getFileSystem());
  }
  return newFileReader(file.location(), readerOptions);
}
 
Example #27
Source File: TestOrcMetadata.java    From rainbow with Apache License 2.0
@Test
public void test () throws IOException, Descriptors.DescriptorValidationException
{
    Configuration conf = new Configuration();
    System.setProperty("hadoop.home.dir", "/");
    FileSystem fileSystem = FileSystem.get(URI.create("hdfs://presto00:9000"), conf);
    Path hdfsDirPath = new Path("/rainbow2/orc_new_compress");
    System.out.println(fileSystem.isFile(hdfsDirPath));
    FileStatus[] fileStatuses = fileSystem.listStatus(hdfsDirPath);
    System.out.println(fileStatuses.length);
    for (FileStatus status : fileStatuses)
    {
        status.getPath();
        System.out.println(status.getPath() + ", " + status.getLen());
    }

    Reader reader = OrcFile.createReader(fileStatuses[0].getPath(),
            OrcFile.readerOptions(conf));
    System.out.println("file length:" + reader.getFileTail().getFileLength());
    List<String> columnNames = new ArrayList<>();
    columnNames.add("samplepercent");
    System.out.println(reader.getRawDataSizeOfColumns(columnNames));
    System.out.println(reader.getFileTail().getFooter().getTypes(0).getFieldNames(0));
    System.out.println(reader.getTypes().get(0).getSerializedSize());

    List<Reader> readers = new ArrayList<>();
    for (FileStatus fileStatus : fileStatuses)
    {
        Reader reader1 = OrcFile.createReader(fileStatus.getPath(),
                OrcFile.readerOptions(conf));
        readers.add(reader1);
        System.out.println("content size: " + reader1.getContentLength() + ", raw size: "
        + reader1.getRawDataSize());
    }

    for (String columnName : reader.getSchema().getFieldNames())
    {
        System.out.println(columnName);
    }
}
 
Example #28
Source File: TestMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
public void createOrcInputFile() throws IOException {
  if (orcFile.exists()) {
    Assert.assertTrue(orcFile.delete());
  }

  OutputFile outFile = Files.localOutput(orcFile);
  try (FileAppender<GenericRecord> appender = ORC.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    GenericRecord record = GenericRecord.create(FILE_SCHEMA);
    // create 50 records
    for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) {
      record.setField("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0
      record.setField("_no_stats_parquet", TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats
                                                                        // in Parquet, but will produce stats for ORC
      record.setField("_required", "req"); // required, always non-null
      record.setField("_all_nulls", null); // never non-null
      record.setField("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
      record.setField("_no_nulls", ""); // optional, but always non-null
      record.setField("_str", i + "str" + i);

      GenericRecord structNotNull = GenericRecord.create(_structFieldType);
      structNotNull.setField("_int_field", INT_MIN_VALUE + i);
      record.setField("_struct_not_null", structNotNull); // struct with int

      appender.add(record);
    }
  }

  InputFile inFile = Files.localInput(orcFile);
  try (Reader reader = OrcFile.createReader(new Path(inFile.location()),
      OrcFile.readerOptions(new Configuration()))) {
    Assert.assertEquals("Should create only one stripe", 1, reader.getStripes().size());
  }

  orcFile.deleteOnExit();
}
 
Example #29
Source File: OrcColumnarRowSplitReaderTest.java    From flink with Apache License 2.0
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
	// NOTE: ORC stores field names, so the names here must match the ORC schema
	TypeDescription schema =
			TypeDescription.fromString(
					"struct<" +
							"f0:float," +
							"f1:double," +
							"f2:timestamp," +
							"f3:tinyint," +
							"f4:smallint" +
							">");

	org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(file);
	Configuration conf = new Configuration();

	Writer writer =
			OrcFile.createWriter(filePath,
					OrcFile.writerOptions(conf).setSchema(schema));

	VectorizedRowBatch batch = schema.createRowBatch(rowSize);
	DoubleColumnVector col0 = (DoubleColumnVector) batch.cols[0];
	DoubleColumnVector col1 = (DoubleColumnVector) batch.cols[1];
	TimestampColumnVector col2 = (TimestampColumnVector) batch.cols[2];
	LongColumnVector col3 = (LongColumnVector) batch.cols[3];
	LongColumnVector col4 = (LongColumnVector) batch.cols[4];

	col0.noNulls = false;
	col1.noNulls = false;
	col2.noNulls = false;
	col3.noNulls = false;
	col4.noNulls = false;
	for (int i = 0; i < rowSize - 1; i++) {
		col0.vector[i] = i;
		col1.vector[i] = i;

		Timestamp timestamp = toTimestamp(i);
		col2.time[i] = timestamp.getTime();
		col2.nanos[i] = timestamp.getNanos();

		col3.vector[i] = i;
		col4.vector[i] = i;
	}

	col0.isNull[rowSize - 1] = true;
	col1.isNull[rowSize - 1] = true;
	col2.isNull[rowSize - 1] = true;
	col3.isNull[rowSize - 1] = true;
	col4.isNull[rowSize - 1] = true;

	batch.size = rowSize;
	writer.addRowBatch(batch);
	batch.reset();
	writer.close();
}
 
Example #30
Source File: OrcRowInputFormat.java    From flink with Apache License 2.0
@Override
public void open(FileInputSplit fileSplit) throws IOException {

	LOG.debug("Opening ORC file {}", fileSplit.getPath());

	// open ORC file and create reader
	org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(fileSplit.getPath().getPath());
	Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));

	// get offset and length for the stripes that start in the split
	Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(fileSplit, getStripes(orcReader));

	// create ORC row reader configuration
	Reader.Options options = getOptions(orcReader)
		.schema(schema)
		.range(offsetAndLength.f0, offsetAndLength.f1)
		.useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
		.skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
		.tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));

	// configure filters
	if (!conjunctPredicates.isEmpty()) {
		SearchArgument.Builder b = SearchArgumentFactory.newBuilder();
		b = b.startAnd();
		for (Predicate predicate : conjunctPredicates) {
			predicate.add(b);
		}
		b = b.end();
		options.searchArgument(b.build(), new String[]{});
	}

	// configure selected fields
	options.include(computeProjectionMask());

	// create ORC row reader
	this.orcRowsReader = orcReader.rows(options);

	// assign ids
	this.schema.getId();
	// create row batch
	this.rowBatch = schema.createRowBatch(batchSize);
	rowsInBatch = 0;
	nextRow = 0;
}