Java Code Examples for org.apache.iceberg.io.InputFile

The following examples show how to use org.apache.iceberg.io.InputFile. These examples are extracted from open source projects.
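Before diving into the examples, a minimal sketch of the interface itself may help. InputFile is Iceberg's read-side file abstraction: it reports a location and a length, and it opens seekable streams, which the columnar readers below depend on. The path in this sketch is a placeholder:

import java.io.File;
import java.io.IOException;

import org.apache.iceberg.Files;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.io.SeekableInputStream;

public class InputFileSketch {
  public static void main(String[] args) throws IOException {
    // placeholder path, for illustration only
    InputFile in = Files.localInput(new File("/tmp/data.avro"));

    System.out.println(in.location());   // the file's path or URI
    System.out.println(in.getLength());  // length in bytes

    try (SeekableInputStream stream = in.newStream()) {
      stream.seek(0);  // streams are seekable, as columnar readers require
      int firstByte = stream.read();
    }
  }
}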
Example 1
Source Project: iceberg   Source File: OrcIterable.java    License: Apache License 2.0
private static VectorizedRowBatchIterator newOrcIterator(InputFile file,
                                                         TypeDescription readerSchema,
                                                         Long start, Long length,
                                                         Reader orcFileReader, SearchArgument sarg) {
  final Reader.Options options = orcFileReader.options();
  if (start != null) {
    options.range(start, length);
  }
  options.schema(readerSchema);
  options.searchArgument(sarg, new String[]{});

  try {
    return new VectorizedRowBatchIterator(file.location(), readerSchema, orcFileReader.rows(options));
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to get ORC rows for file: %s", file);
  }
}
 
Example 2
Source Project: iceberg   Source File: TestManifestListVersions.java    License: Apache License 2.0
@Test
public void testV1ForwardCompatibility() throws IOException {
  InputFile manifestList = writeManifestList(TEST_MANIFEST, 1);
  GenericData.Record generic = readGeneric(manifestList, V1Metadata.MANIFEST_LIST_SCHEMA);

  // v1 metadata should match even though order changed
  Assert.assertEquals("Path", PATH, generic.get("manifest_path").toString());
  Assert.assertEquals("Length", LENGTH, generic.get("manifest_length"));
  Assert.assertEquals("Spec id", SPEC_ID, generic.get("partition_spec_id"));
  Assert.assertEquals("Snapshot id", SNAPSHOT_ID, (long) generic.get("added_snapshot_id"));
  Assert.assertEquals("Added files count", ADDED_FILES, (int) generic.get("added_data_files_count"));
  Assert.assertEquals("Existing files count", EXISTING_FILES, (int) generic.get("existing_data_files_count"));
  Assert.assertEquals("Deleted files count", DELETED_FILES, (int) generic.get("deleted_data_files_count"));
  Assert.assertEquals("Added rows count", ADDED_ROWS, (long) generic.get("added_rows_count"));
  Assert.assertEquals("Existing rows count", EXISTING_ROWS, (long) generic.get("existing_rows_count"));
  Assert.assertEquals("Deleted rows count", DELETED_ROWS, (long) generic.get("deleted_rows_count"));
  Assert.assertNull("Content", generic.get(ManifestFile.MANIFEST_CONTENT.name()));
  Assert.assertNull("Sequence number", generic.get(ManifestFile.SEQUENCE_NUMBER.name()));
  Assert.assertNull("Min sequence number", generic.get(ManifestFile.MIN_SEQUENCE_NUMBER.name()));
}
 
Example 3
Source Project: iceberg   Source File: TestMetrics.java    License: Apache License 2.0
@Test
public void testMetricsForNullColumns() throws IOException {
  Schema schema = new Schema(
      optional(1, "intCol", IntegerType.get())
  );
  Record firstRecord = GenericRecord.create(schema);
  firstRecord.setField("intCol", null);
  Record secondRecord = GenericRecord.create(schema);
  secondRecord.setField("intCol", null);

  InputFile recordsFile = writeRecords(schema, firstRecord, secondRecord);

  Metrics metrics = getMetrics(recordsFile);
  Assert.assertEquals(2L, (long) metrics.recordCount());
  assertCounts(1, 2L, 2L, metrics);
  assertBounds(1, IntegerType.get(), null, null, metrics);
}
 
Example 4
Source Project: iceberg   Source File: TestManifestListVersions.java    License: Apache License 2.0
@Test
public void testV2ForwardCompatibility() throws IOException {
  // v2 manifest list files can be read by v1 readers, but the sequence numbers and content will be ignored.
  InputFile manifestList = writeManifestList(TEST_MANIFEST, 2);
  GenericData.Record generic = readGeneric(manifestList, V1Metadata.MANIFEST_LIST_SCHEMA);

  // v1 metadata should match even though order changed
  Assert.assertEquals("Path", PATH, generic.get("manifest_path").toString());
  Assert.assertEquals("Length", LENGTH, generic.get("manifest_length"));
  Assert.assertEquals("Spec id", SPEC_ID, generic.get("partition_spec_id"));
  Assert.assertEquals("Snapshot id", SNAPSHOT_ID, (long) generic.get("added_snapshot_id"));
  Assert.assertEquals("Added files count", ADDED_FILES, (int) generic.get("added_data_files_count"));
  Assert.assertEquals("Existing files count", EXISTING_FILES, (int) generic.get("existing_data_files_count"));
  Assert.assertEquals("Deleted files count", DELETED_FILES, (int) generic.get("deleted_data_files_count"));
  Assert.assertEquals("Added rows count", ADDED_ROWS, (long) generic.get("added_rows_count"));
  Assert.assertEquals("Existing rows count", EXISTING_ROWS, (long) generic.get("existing_rows_count"));
  Assert.assertEquals("Deleted rows count", DELETED_ROWS, (long) generic.get("deleted_rows_count"));
  Assert.assertNull("Content", generic.get(ManifestFile.MANIFEST_CONTENT.name()));
  Assert.assertNull("Sequence number", generic.get(ManifestFile.SEQUENCE_NUMBER.name()));
  Assert.assertNull("Min sequence number", generic.get(ManifestFile.MIN_SEQUENCE_NUMBER.name()));
}
 
Example 5
Source Project: iceberg   Source File: ManifestLists.java    License: Apache License 2.0
static List<ManifestFile> read(InputFile manifestList) {
  try (CloseableIterable<ManifestFile> files = Avro.read(manifestList)
      .rename("manifest_file", GenericManifestFile.class.getName())
      .rename("partitions", GenericPartitionFieldSummary.class.getName())
      .rename("r508", GenericPartitionFieldSummary.class.getName())
      .classLoader(GenericManifestFile.class.getClassLoader())
      .project(ManifestFile.schema())
      .reuseContainers(false)
      .build()) {

    return Lists.newLinkedList(files);

  } catch (IOException e) {
    throw new RuntimeIOException(e, "Cannot read manifest list file: %s", manifestList.location());
  }
}
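ManifestLists itself is package-private, so callers inside Iceberg reach it through a snapshot's manifest list location. A hedged sketch of such a call site, assuming code living in the org.apache.iceberg package (table.io() and Snapshot.manifestListLocation() are the public entry points used here):

// hypothetical helper; works only from within the org.apache.iceberg package
static List<ManifestFile> manifestsOf(Table table, Snapshot snapshot) {
  InputFile manifestList = table.io().newInputFile(snapshot.manifestListLocation());
  return ManifestLists.read(manifestList);
}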
 
Example 6
Source Project: iceberg   Source File: TestMetricsRowGroupFilterTypes.java    License: Apache License 2.0
public void createOrcInputFile(List<Record> records) throws IOException {
  if (ORC_FILE.exists()) {
    Assert.assertTrue(ORC_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(ORC_FILE);
  try (FileAppender<Record> appender = ORC.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    appender.addAll(records);
  }

  InputFile inFile = Files.localInput(ORC_FILE);
  try (Reader reader = OrcFile.createReader(new Path(inFile.location()),
      OrcFile.readerOptions(new Configuration()))) {
    Assert.assertEquals("Should create only one stripe", 1, reader.getStripes().size());
  }

  ORC_FILE.deleteOnExit();
}
 
Example 7
Source Project: iceberg   Source File: TestMetricsRowGroupFilterTypes.java    License: Apache License 2.0
public void createParquetInputFile(List<Record> records) throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericParquetWriter::buildWriter)
      .build()) {
    appender.addAll(records);
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);
  try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) {
    Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
    rowGroupMetadata = reader.getRowGroups().get(0);
    parquetSchema = reader.getFileMetaData().getSchema();
  }

  PARQUET_FILE.deleteOnExit();
}
 
Example 8
Source Project: iceberg   Source File: TestMetricsRowGroupFilterTypes.java    License: Apache License 2.0
private org.apache.parquet.io.InputFile parquetInputFile(InputFile inFile) {
  return new org.apache.parquet.io.InputFile() {
    @Override
    public long getLength() throws IOException {
      return inFile.getLength();
    }

    @Override
    public org.apache.parquet.io.SeekableInputStream newStream() throws IOException {
      SeekableInputStream stream = inFile.newStream();
      return new DelegatingSeekableInputStream(stream) {
        @Override
        public long getPos() throws IOException {
          return stream.getPos();
        }

        @Override
        public void seek(long newPos) throws IOException {
          stream.seek(newPos);
        }
      };
    }
  };
}
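This adapter bridges Iceberg's InputFile to Parquet's own org.apache.parquet.io.InputFile. Iceberg's SeekableInputStream already exposes getPos and seek, so the anonymous subclass only forwards those two calls and lets Parquet's DelegatingSeekableInputStream supply the remaining stream behavior; Example 7 shows the result being handed to ParquetFileReader.open.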
 
Example 9
Source Project: iceberg   Source File: IcebergInputFormat.java    License: Apache License 2.0
private CloseableIterable<T> open(FileScanTask currentTask, Schema readSchema) {
  DataFile file = currentTask.file();
  // TODO we should make use of FileIO to create inputFile
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());
  CloseableIterable<T> iterable;
  switch (file.format()) {
    case AVRO:
      iterable = newAvroIterable(inputFile, currentTask, readSchema);
      break;
    case ORC:
      iterable = newOrcIterable(inputFile, currentTask, readSchema);
      break;
    case PARQUET:
      iterable = newParquetIterable(inputFile, currentTask, readSchema);
      break;
    default:
      throw new UnsupportedOperationException(
          String.format("Cannot read %s file: %s", file.format().name(), file.path()));
  }

  return iterable;
}
 
Example 10
Source Project: iceberg   Source File: IcebergInputFormat.java    License: Apache License 2.0
private CloseableIterable<T> newAvroIterable(
    InputFile inputFile, FileScanTask task, Schema readSchema) {
  Avro.ReadBuilder avroReadBuilder = Avro.read(inputFile)
      .project(readSchema)
      .split(task.start(), task.length());
  if (reuseContainers) {
    avroReadBuilder.reuseContainers();
  }

  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      //TODO implement value readers for Pig and Hive
      throw new UnsupportedOperationException("Avro support not yet supported for Pig and Hive");
    case GENERIC:
      avroReadBuilder.createReaderFunc(
          (expIcebergSchema, expAvroSchema) ->
              DataReader.create(expIcebergSchema, expAvroSchema,
                  constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(avroReadBuilder.build(), task.residual(), readSchema);
}
 
Example 11
Source Project: iceberg   Source File: IcebergInputFormat.java    License: Apache License 2.0
private CloseableIterable<T> newParquetIterable(InputFile inputFile, FileScanTask task, Schema readSchema) {
  Parquet.ReadBuilder parquetReadBuilder = Parquet.read(inputFile)
      .project(readSchema)
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .split(task.start(), task.length());
  if (reuseContainers) {
    parquetReadBuilder.reuseContainers();
  }

  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      //TODO implement value readers for Pig and Hive
      throw new UnsupportedOperationException("Parquet support not yet supported for Pig and Hive");
    case GENERIC:
      parquetReadBuilder.createReaderFunc(
          fileSchema -> GenericParquetReaders.buildReader(
              readSchema, fileSchema, constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(parquetReadBuilder.build(), task.residual(), readSchema);
}
 
Example 12
Source Project: iceberg   Source File: IcebergInputFormat.java    License: Apache License 2.0
private CloseableIterable<T> newOrcIterable(InputFile inputFile, FileScanTask task, Schema readSchema) {
  ORC.ReadBuilder orcReadBuilder = ORC.read(inputFile)
      .project(readSchema)
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .split(task.start(), task.length());
  // ORC does not support reusing containers yet
  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      //TODO: implement value readers for Pig and Hive
      throw new UnsupportedOperationException("ORC support not yet supported for Pig and Hive");
    case GENERIC:
      orcReadBuilder.createReaderFunc(
          fileSchema -> GenericOrcReader.buildReader(
              readSchema, fileSchema, constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }

  return applyResidualFiltering(orcReadBuilder.build(), task.residual(), readSchema);
}
 
Example 13
Source Project: iceberg   Source File: RowDataReader.java    License: Apache License 2.0
private CloseableIterable<InternalRow> newAvroIterable(
    InputFile location,
    FileScanTask task,
    Schema projection,
    Map<Integer, ?> idToConstant) {
  Avro.ReadBuilder builder = Avro.read(location)
      .reuseContainers()
      .project(projection)
      .split(task.start(), task.length())
      .createReaderFunc(readSchema -> new SparkAvroReader(projection, readSchema, idToConstant));

  if (nameMapping != null) {
    builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
  }

  return builder.build();
}
 
Example 14
Source Project: iceberg   Source File: RowDataReader.java    License: Apache License 2.0
private CloseableIterable<InternalRow> newParquetIterable(
    InputFile location,
    FileScanTask task,
    Schema readSchema,
    Map<Integer, ?> idToConstant) {
  Parquet.ReadBuilder builder = Parquet.read(location)
      .split(task.start(), task.length())
      .project(readSchema)
      .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant))
      .filter(task.residual())
      .caseSensitive(caseSensitive);

  if (nameMapping != null) {
    builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
  }

  return builder.build();
}
 
Example 15
Source Project: iceberg   Source File: GenericManifestFile.java    License: Apache License 2.0
GenericManifestFile(InputFile file, int specId) {
  this.avroSchema = AVRO_SCHEMA;
  this.file = file;
  this.manifestPath = file.location();
  this.length = null; // lazily loaded from file
  this.specId = specId;
  this.sequenceNumber = 0;
  this.minSequenceNumber = 0;
  this.snapshotId = null;
  this.addedFilesCount = null;
  this.addedRowsCount = null;
  this.existingFilesCount = null;
  this.existingRowsCount = null;
  this.deletedFilesCount = null;
  this.deletedRowsCount = null;
  this.partitions = null;
  this.fromProjectionPos = null;
}
 
Example 16
Source Project: iceberg   Source File: TableMetadata.java    License: Apache License 2.0
private List<MetadataLogEntry> addPreviousFile(InputFile previousFile, long timestampMillis,
                                               Map<String, String> updatedProperties) {
  if (previousFile == null) {
    return previousFiles;
  }

  int maxSize = Math.max(1, PropertyUtil.propertyAsInt(updatedProperties,
          TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, TableProperties.METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT));

  List<MetadataLogEntry> newMetadataLog;
  if (previousFiles.size() >= maxSize) {
    int removeIndex = previousFiles.size() - maxSize + 1;
    newMetadataLog = Lists.newArrayList(previousFiles.subList(removeIndex, previousFiles.size()));
  } else {
    newMetadataLog = Lists.newArrayList(previousFiles);
  }
  newMetadataLog.add(new MetadataLogEntry(timestampMillis, previousFile.location()));

  return newMetadataLog;
}
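To make the trimming arithmetic concrete: with a maxSize of 5 and five entries already in previousFiles, removeIndex is 5 - 5 + 1 = 1, so the sublist drops the single oldest entry and appending the new MetadataLogEntry brings the log back to exactly five entries.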
 
Example 17
Source Project: iceberg   Source File: OrcMetrics.java    License: Apache License 2.0
static Metrics fromInputFile(InputFile file, Configuration config) {
  try (Reader orcReader = ORC.newFileReader(file, config)) {
    return buildOrcMetrics(orcReader.getNumberOfRows(), orcReader.getSchema(), orcReader.getStatistics());
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to open file: %s", file.location());
  }
}
 
Example 18
Source Project: iceberg   Source File: OrcIterable.java    License: Apache License 2.0
OrcIterable(InputFile file, Configuration config, Schema schema,
            Long start, Long length,
            Function<TypeDescription, OrcRowReader<?>> readerFunction, boolean caseSensitive, Expression filter) {
  this.schema = schema;
  this.readerFunction = readerFunction;
  this.file = file;
  this.start = start;
  this.length = length;
  this.config = config;
  this.caseSensitive = caseSensitive;
  this.filter = (filter == Expressions.alwaysTrue()) ? null : filter;
}
 
Example 19
Source Project: iceberg   Source File: ORC.java    License: Apache License 2.0
private ReadBuilder(InputFile file) {
  Preconditions.checkNotNull(file, "Input file cannot be null");
  this.file = file;
  if (file instanceof HadoopInputFile) {
    this.conf = new Configuration(((HadoopInputFile) file).getConf());
  } else {
    this.conf = new Configuration();
  }
}
 
Example 20
Source Project: iceberg   Source File: ORC.java    License: Apache License 2.0
static Reader newFileReader(InputFile file, Configuration config) {
  ReaderOptions readerOptions = OrcFile.readerOptions(config).useUTCTimestamp(true);
  if (file instanceof HadoopInputFile) {
    readerOptions.filesystem(((HadoopInputFile) file).getFileSystem());
  }
  return newFileReader(file.location(), readerOptions);
}
 
Example 21
Source Project: iceberg   Source File: TestMetrics.java    License: Apache License 2.0
@Test
public void testMetricsForNestedStructFieldsWithMultipleRowGroup() throws IOException {
  Assume.assumeTrue("Skip test for formats that do not support small row groups", supportsSmallRowGroups());

  int recordCount = 201;
  List<Record> records = Lists.newArrayListWithExpectedSize(recordCount);

  for (int i = 0; i < recordCount; i++) {
    Record newLeafStruct = GenericRecord.create(LEAF_STRUCT_TYPE);
    newLeafStruct.setField("leafLongCol", i + 1L);
    newLeafStruct.setField("leafBinaryCol", ByteBuffer.wrap("A".getBytes()));
    Record newNestedStruct = GenericRecord.create(NESTED_STRUCT_TYPE);
    newNestedStruct.setField("longCol", i + 1L);
    newNestedStruct.setField("leafStructCol", newLeafStruct);
    Record newRecord = GenericRecord.create(NESTED_SCHEMA);
    newRecord.setField("intCol", i + 1);
    newRecord.setField("nestedStructCol", newNestedStruct);
    records.add(newRecord);
  }

  // create a file with multiple row groups by using a small row-group size
  InputFile recordsFile = writeRecordsWithSmallRowGroups(NESTED_SCHEMA, records.toArray(new Record[0]));

  Assert.assertNotNull(recordsFile);
  // the file should be split into multiple row groups
  Assert.assertEquals(3, splitCount(recordsFile));

  Metrics metrics = getMetrics(recordsFile);
  Assert.assertEquals(201L, (long) metrics.recordCount());
  assertCounts(1, 201L, 0L, metrics);
  assertBounds(1, IntegerType.get(), 1, 201, metrics);
  assertCounts(3, 201L, 0L, metrics);
  assertBounds(3, LongType.get(), 1L, 201L, metrics);
  assertCounts(5, 201L, 0L, metrics);
  assertBounds(5, LongType.get(), 1L, 201L, metrics);
  assertCounts(6, 201L, 0L, metrics);
  assertBounds(6, BinaryType.get(),
      ByteBuffer.wrap("A".getBytes()), ByteBuffer.wrap("A".getBytes()), metrics);
}
 
Example 22
Source Project: iceberg   Source File: TestManifestWriterVersions.java    License: Apache License 2.0
private InputFile writeManifestList(ManifestFile manifest, int formatVersion) throws IOException {
  OutputFile manifestList = Files.localOutput(temp.newFile());
  try (FileAppender<ManifestFile> writer = ManifestLists.write(
      formatVersion, manifestList, SNAPSHOT_ID, SNAPSHOT_ID - 1, formatVersion > 1 ? SEQUENCE_NUMBER : 0)) {
    writer.add(manifest);
  }
  return manifestList.toInputFile();
}
 
Example 23
Source Project: iceberg   Source File: TestMetrics.java    License: Apache License 2.0
@Test
public void testMetricsForNestedStructFields() throws IOException {

  Record leafStruct = GenericRecord.create(LEAF_STRUCT_TYPE);
  leafStruct.setField("leafLongCol", 20L);
  leafStruct.setField("leafBinaryCol", ByteBuffer.wrap("A".getBytes()));
  Record nestedStruct = GenericRecord.create(NESTED_STRUCT_TYPE);
  nestedStruct.setField("longCol", 100L);
  nestedStruct.setField("leafStructCol", leafStruct);
  Record record = GenericRecord.create(NESTED_SCHEMA);
  record.setField("intCol", Integer.MAX_VALUE);
  record.setField("nestedStructCol", nestedStruct);

  InputFile recordsFile = writeRecords(NESTED_SCHEMA, record);

  Metrics metrics = getMetrics(recordsFile);
  Assert.assertEquals(1L, (long) metrics.recordCount());
  assertCounts(1, 1L, 0L, metrics);
  assertBounds(1, IntegerType.get(), Integer.MAX_VALUE, Integer.MAX_VALUE, metrics);
  assertCounts(3, 1L, 0L, metrics);
  assertBounds(3, LongType.get(), 100L, 100L, metrics);
  assertCounts(5, 1L, 0L, metrics);
  assertBounds(5, LongType.get(), 20L, 20L, metrics);
  assertCounts(6, 1L, 0L, metrics);
  assertBounds(6, BinaryType.get(),
      ByteBuffer.wrap("A".getBytes()), ByteBuffer.wrap("A".getBytes()), metrics);
}
 
Example 24
Source Project: iceberg   Source File: TestOrcMetrics.java    License: Apache License 2.0
private InputFile writeRecords(Schema schema, Map<String, String> properties, Record... records) throws IOException {
  File tmpFolder = temp.newFolder("orc");
  String filename = UUID.randomUUID().toString();
  OutputFile file = Files.localOutput(new File(tmpFolder, FileFormat.ORC.addExtension(filename)));
  try (FileAppender<Record> writer = ORC.write(file)
      .schema(schema)
      .setAll(properties)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    writer.addAll(Lists.newArrayList(records));
  }
  return file.toInputFile();
}
 
Example 25
Source Project: iceberg   Source File: PlaintextEncryptionManager.java    License: Apache License 2.0
@Override
public InputFile decrypt(EncryptedInputFile encrypted) {
  if (encrypted.keyMetadata().buffer() != null) {
    LOG.warn("File encryption key metadata is present, but currently using PlaintextEncryptionManager.");
  }
  return encrypted.encryptedInputFile();
}
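Since plaintext files carry no real key material, decrypt is a pass-through. A minimal sketch of a call site, assuming the EncryptedFiles factory for pairing a raw InputFile with its key metadata (rawInputFile and keyMetadataBuffer are placeholders):

// minimal sketch; keyMetadataBuffer may be empty for plaintext files
EncryptionManager em = new PlaintextEncryptionManager();
EncryptedInputFile encrypted = EncryptedFiles.encryptedInput(rawInputFile, keyMetadataBuffer);
InputFile readable = em.decrypt(encrypted);  // returns the wrapped file unchanged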
 
Example 26
Source Project: iceberg   Source File: TestMetricsRowGroupFilter.java    License: Apache License 2.0
public void createOrcInputFile() throws IOException {
  if (orcFile.exists()) {
    Assert.assertTrue(orcFile.delete());
  }

  OutputFile outFile = Files.localOutput(orcFile);
  try (FileAppender<GenericRecord> appender = ORC.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    GenericRecord record = GenericRecord.create(FILE_SCHEMA);
    // create 50 records
    for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) {
      record.setField("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0
      record.setField("_no_stats_parquet", TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats
                                                                        // in Parquet, but will produce stats for ORC
      record.setField("_required", "req"); // required, always non-null
      record.setField("_all_nulls", null); // never non-null
      record.setField("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
      record.setField("_no_nulls", ""); // optional, but always non-null
      record.setField("_str", i + "str" + i);

      GenericRecord structNotNull = GenericRecord.create(_structFieldType);
      structNotNull.setField("_int_field", INT_MIN_VALUE + i);
      record.setField("_struct_not_null", structNotNull); // struct with int

      appender.add(record);
    }
  }

  InputFile inFile = Files.localInput(orcFile);
  try (Reader reader = OrcFile.createReader(new Path(inFile.location()),
      OrcFile.readerOptions(new Configuration()))) {
    Assert.assertEquals("Should create only one stripe", 1, reader.getStripes().size());
  }

  orcFile.deleteOnExit();
}
 
Example 27
Source Project: iceberg   Source File: DataFiles.java    License: Apache License 2.0
public Builder withInputFile(InputFile file) {
  if (file instanceof HadoopInputFile) {
    return withStatus(((HadoopInputFile) file).getStat());
  }

  this.filePath = file.location();
  this.fileSizeInBytes = file.getLength();
  return this;
}
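As a usage sketch, the builder can turn an InputFile into a DataFile entry; this assumes an unpartitioned spec, and the format and record count are illustrative values rather than anything read from the file:

// hypothetical helper; assumes spec is unpartitioned
static DataFile toDataFile(PartitionSpec spec, InputFile inputFile) {
  return DataFiles.builder(spec)
      .withInputFile(inputFile)  // copies location and length (or the full status for HadoopInputFile)
      .withFormat(FileFormat.PARQUET)
      .withRecordCount(100L)     // illustrative
      .build();
}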
 
Example 28
Source Project: iceberg   Source File: ManifestFiles.java    License: Apache License 2.0
static ManifestFile copyRewriteManifest(int formatVersion,
                                        InputFile toCopy, Map<Integer, PartitionSpec> specsById,
                                        OutputFile outputFile, long snapshotId,
                                        SnapshotSummary.Builder summaryBuilder) {
  // for a rewritten manifest, all snapshot ids should be set; use empty metadata to throw an exception if any is missing
  InheritableMetadata inheritableMetadata = InheritableMetadataFactory.empty();
  try (ManifestReader<DataFile> reader =
           new ManifestReader<>(toCopy, specsById, inheritableMetadata, FileType.DATA_FILES)) {
    return copyManifestInternal(
        formatVersion, reader, outputFile, snapshotId, summaryBuilder, ManifestEntry.Status.EXISTING);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to close manifest: %s", toCopy.location());
  }
}
 
Example 29
Source Project: iceberg   Source File: RowDataReader.java    License: Apache License 2.0
private CloseableIterable<InternalRow> open(FileScanTask task, Schema readSchema, Map<Integer, ?> idToConstant) {
  CloseableIterable<InternalRow> iter;
  if (task.isDataTask()) {
    iter = newDataIterable(task.asDataTask(), readSchema);
  } else {
    InputFile location = getInputFile(task);
    Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");

    switch (task.file().format()) {
      case PARQUET:
        iter = newParquetIterable(location, task, readSchema, idToConstant);
        break;

      case AVRO:
        iter = newAvroIterable(location, task, readSchema, idToConstant);
        break;

      case ORC:
        iter = newOrcIterable(location, task, readSchema, idToConstant);
        break;

      default:
        throw new UnsupportedOperationException(
            "Cannot read unknown format: " + task.file().format());
    }
  }

  return iter;
}
 
Example 30
Source Project: iceberg   Source File: RowDataReader.java    License: Apache License 2.0
private CloseableIterable<InternalRow> newOrcIterable(
    InputFile location,
    FileScanTask task,
    Schema readSchema,
    Map<Integer, ?> idToConstant) {
  return ORC.read(location)
      .project(readSchema)
      .split(task.start(), task.length())
      .createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant))
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .build();
}