org.apache.iceberg.io.InputFile Java Examples

The following examples show how to use org.apache.iceberg.io.InputFile. All of them are taken from the Apache Iceberg project, which is licensed under the Apache License 2.0; each example's header names the source file it comes from.
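
Before the examples, here is a minimal sketch of the InputFile surface itself. The local path is made up for illustration; org.apache.iceberg.Files.localInput returns an InputFile backed by the local filesystem, and FileIO implementations (such as HadoopFileIO) expose the same interface for other storage systems.

import java.io.IOException;

import org.apache.iceberg.Files;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.io.SeekableInputStream;

public class InputFileDemo {
  public static void main(String[] args) throws IOException {
    // Hypothetical local path; any readable file works for this sketch.
    InputFile in = Files.localInput("/tmp/demo/00000-0-data.parquet");

    System.out.println("location: " + in.location());  // path or URI of the file
    System.out.println("length: " + in.getLength());   // size in bytes

    // newStream() returns a SeekableInputStream, which is what makes
    // random-access formats such as Parquet and ORC readable through InputFile.
    try (SeekableInputStream stream = in.newStream()) {
      stream.seek(4);
      System.out.println("position after seek: " + stream.getPos());
    }
  }
}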
Example #1
Source File: RowDataReader.java    From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> newAvroIterable(
    InputFile location,
    FileScanTask task,
    Schema projection,
    Map<Integer, ?> idToConstant) {
  Avro.ReadBuilder builder = Avro.read(location)
      .reuseContainers()
      .project(projection)
      .split(task.start(), task.length())
      .createReaderFunc(readSchema -> new SparkAvroReader(projection, readSchema, idToConstant));

  if (nameMapping != null) {
    builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
  }

  return builder.build();
}
 
Example #2
Source File: TestMetricsRowGroupFilterTypes.java    From iceberg with Apache License 2.0
private org.apache.parquet.io.InputFile parquetInputFile(InputFile inFile) {
  return new org.apache.parquet.io.InputFile() {
    @Override
    public long getLength() throws IOException {
      return inFile.getLength();
    }

    @Override
    public org.apache.parquet.io.SeekableInputStream newStream() throws IOException {
      SeekableInputStream stream = inFile.newStream();
      return new DelegatingSeekableInputStream(stream) {
        @Override
        public long getPos() throws IOException {
          return stream.getPos();
        }

        @Override
        public void seek(long newPos) throws IOException {
          stream.seek(newPos);
        }
      };
    }
  };
}
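
This adapter is how Parquet's own reader operates on an Iceberg InputFile: Example #16 below passes the result of parquetInputFile(...) directly to ParquetFileReader.open.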
 
Example #3
Source File: TableMetadata.java    From iceberg with Apache License 2.0
private List<MetadataLogEntry> addPreviousFile(InputFile previousFile, long timestampMillis,
                                               Map<String, String> updatedProperties) {
  if (previousFile == null) {
    return previousFiles;
  }

  int maxSize = Math.max(1, PropertyUtil.propertyAsInt(updatedProperties,
          TableProperties.METADATA_PREVIOUS_VERSIONS_MAX, TableProperties.METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT));

  List<MetadataLogEntry> newMetadataLog;
  if (previousFiles.size() >= maxSize) {
    int removeIndex = previousFiles.size() - maxSize + 1;
    newMetadataLog = Lists.newArrayList(previousFiles.subList(removeIndex, previousFiles.size()));
  } else {
    newMetadataLog = Lists.newArrayList(previousFiles);
  }
  newMetadataLog.add(new MetadataLogEntry(timestampMillis, previousFile.location()));

  return newMetadataLog;
}
 
Example #4
Source File: GenericManifestFile.java    From iceberg with Apache License 2.0
GenericManifestFile(InputFile file, int specId) {
  this.avroSchema = AVRO_SCHEMA;
  this.file = file;
  this.manifestPath = file.location();
  this.length = null; // lazily loaded from file
  this.specId = specId;
  this.sequenceNumber = 0;
  this.minSequenceNumber = 0;
  this.snapshotId = null;
  this.addedFilesCount = null;
  this.addedRowsCount = null;
  this.existingFilesCount = null;
  this.existingRowsCount = null;
  this.deletedFilesCount = null;
  this.deletedRowsCount = null;
  this.partitions = null;
  this.fromProjectionPos = null;
}
 
Example #5
Source File: OrcIterable.java    From iceberg with Apache License 2.0
private static VectorizedRowBatchIterator newOrcIterator(InputFile file,
                                                         TypeDescription readerSchema,
                                                         Long start, Long length,
                                                         Reader orcFileReader, SearchArgument sarg) {
  final Reader.Options options = orcFileReader.options();
  if (start != null) {
    options.range(start, length);
  }
  options.schema(readerSchema);
  options.searchArgument(sarg, new String[]{});

  try {
    return new VectorizedRowBatchIterator(file.location(), readerSchema, orcFileReader.rows(options));
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to get ORC rows for file: %s", file);
  }
}
 
Example #6
Source File: TestMetrics.java    From iceberg with Apache License 2.0
@Test
public void testMetricsForNullColumns() throws IOException {
  Schema schema = new Schema(
      optional(1, "intCol", IntegerType.get())
  );
  Record firstRecord = GenericRecord.create(schema);
  firstRecord.setField("intCol", null);
  Record secondRecord = GenericRecord.create(schema);
  secondRecord.setField("intCol", null);

  InputFile recordsFile = writeRecords(schema, firstRecord, secondRecord);

  Metrics metrics = getMetrics(recordsFile);
  Assert.assertEquals(2L, (long) metrics.recordCount());
  assertCounts(1, 2L, 2L, metrics);
  assertBounds(1, IntegerType.get(), null, null, metrics);
}
 
Example #7
Source File: TestManifestListVersions.java    From iceberg with Apache License 2.0
@Test
public void testV1ForwardCompatibility() throws IOException {
  InputFile manifestList = writeManifestList(TEST_MANIFEST, 1);
  GenericData.Record generic = readGeneric(manifestList, V1Metadata.MANIFEST_LIST_SCHEMA);

  // v1 metadata should match even though order changed
  Assert.assertEquals("Path", PATH, generic.get("manifest_path").toString());
  Assert.assertEquals("Length", LENGTH, generic.get("manifest_length"));
  Assert.assertEquals("Spec id", SPEC_ID, generic.get("partition_spec_id"));
  Assert.assertEquals("Snapshot id", SNAPSHOT_ID, (long) generic.get("added_snapshot_id"));
  Assert.assertEquals("Added files count", ADDED_FILES, (int) generic.get("added_data_files_count"));
  Assert.assertEquals("Existing files count", EXISTING_FILES, (int) generic.get("existing_data_files_count"));
  Assert.assertEquals("Deleted files count", DELETED_FILES, (int) generic.get("deleted_data_files_count"));
  Assert.assertEquals("Added rows count", ADDED_ROWS, (long) generic.get("added_rows_count"));
  Assert.assertEquals("Existing rows count", EXISTING_ROWS, (long) generic.get("existing_rows_count"));
  Assert.assertEquals("Deleted rows count", DELETED_ROWS, (long) generic.get("deleted_rows_count"));
  Assert.assertNull("Content", generic.get(ManifestFile.MANIFEST_CONTENT.name()));
  Assert.assertNull("Sequence number", generic.get(ManifestFile.SEQUENCE_NUMBER.name()));
  Assert.assertNull("Min sequence number", generic.get(ManifestFile.MIN_SEQUENCE_NUMBER.name()));
}
 
Example #8
Source File: RowDataReader.java    From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> newParquetIterable(
    InputFile location,
    FileScanTask task,
    Schema readSchema,
    Map<Integer, ?> idToConstant) {
  Parquet.ReadBuilder builder = Parquet.read(location)
      .split(task.start(), task.length())
      .project(readSchema)
      .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant))
      .filter(task.residual())
      .caseSensitive(caseSensitive);

  if (nameMapping != null) {
    builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
  }

  return builder.build();
}
 
Example #9
Source File: TestManifestListVersions.java    From iceberg with Apache License 2.0
@Test
public void testV2ForwardCompatibility() throws IOException {
  // v2 manifest list files can be read by v1 readers, but the sequence numbers and content will be ignored.
  InputFile manifestList = writeManifestList(TEST_MANIFEST, 2);
  GenericData.Record generic = readGeneric(manifestList, V1Metadata.MANIFEST_LIST_SCHEMA);

  // v1 metadata should match even though order changed
  Assert.assertEquals("Path", PATH, generic.get("manifest_path").toString());
  Assert.assertEquals("Length", LENGTH, generic.get("manifest_length"));
  Assert.assertEquals("Spec id", SPEC_ID, generic.get("partition_spec_id"));
  Assert.assertEquals("Snapshot id", SNAPSHOT_ID, (long) generic.get("added_snapshot_id"));
  Assert.assertEquals("Added files count", ADDED_FILES, (int) generic.get("added_data_files_count"));
  Assert.assertEquals("Existing files count", EXISTING_FILES, (int) generic.get("existing_data_files_count"));
  Assert.assertEquals("Deleted files count", DELETED_FILES, (int) generic.get("deleted_data_files_count"));
  Assert.assertEquals("Added rows count", ADDED_ROWS, (long) generic.get("added_rows_count"));
  Assert.assertEquals("Existing rows count", EXISTING_ROWS, (long) generic.get("existing_rows_count"));
  Assert.assertEquals("Deleted rows count", DELETED_ROWS, (long) generic.get("deleted_rows_count"));
  Assert.assertNull("Content", generic.get(ManifestFile.MANIFEST_CONTENT.name()));
  Assert.assertNull("Sequence number", generic.get(ManifestFile.SEQUENCE_NUMBER.name()));
  Assert.assertNull("Min sequence number", generic.get(ManifestFile.MIN_SEQUENCE_NUMBER.name()));
}
 
Example #10
Source File: ManifestLists.java    From iceberg with Apache License 2.0
static List<ManifestFile> read(InputFile manifestList) {
  try (CloseableIterable<ManifestFile> files = Avro.read(manifestList)
      .rename("manifest_file", GenericManifestFile.class.getName())
      .rename("partitions", GenericPartitionFieldSummary.class.getName())
      .rename("r508", GenericPartitionFieldSummary.class.getName())
      .classLoader(GenericManifestFile.class.getClassLoader())
      .project(ManifestFile.schema())
      .reuseContainers(false)
      .build()) {

    return Lists.newLinkedList(files);

  } catch (IOException e) {
    throw new RuntimeIOException(e, "Cannot read manifest list file: %s", manifestList.location());
  }
}
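
Note that the rename(...) calls map the Avro record names stored in the manifest list file ("manifest_file", "partitions", and the generated record name "r508" used for the partition field summary struct) onto the concrete Java classes Avro should instantiate when reading.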
 
Example #11
Source File: IcebergInputFormat.java    From iceberg with Apache License 2.0
private CloseableIterable<T> newOrcIterable(InputFile inputFile, FileScanTask task, Schema readSchema) {
  ORC.ReadBuilder orcReadBuilder = ORC.read(inputFile)
      .project(readSchema)
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .split(task.start(), task.length());
  // ORC does not support reuse containers yet
  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      //TODO: implement value readers for Pig and Hive
      throw new UnsupportedOperationException("ORC support not yet supported for Pig and Hive");
    case GENERIC:
      orcReadBuilder.createReaderFunc(
          fileSchema -> GenericOrcReader.buildReader(
              readSchema, fileSchema, constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }

  return applyResidualFiltering(orcReadBuilder.build(), task.residual(), readSchema);
}
 
Example #12
Source File: IcebergInputFormat.java    From iceberg with Apache License 2.0
private CloseableIterable<T> newParquetIterable(InputFile inputFile, FileScanTask task, Schema readSchema) {
  Parquet.ReadBuilder parquetReadBuilder = Parquet.read(inputFile)
      .project(readSchema)
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .split(task.start(), task.length());
  if (reuseContainers) {
    parquetReadBuilder.reuseContainers();
  }

  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      //TODO implement value readers for Pig and Hive
      throw new UnsupportedOperationException("Parquet support not yet supported for Pig and Hive");
    case GENERIC:
      parquetReadBuilder.createReaderFunc(
          fileSchema -> GenericParquetReaders.buildReader(
              readSchema, fileSchema, constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(parquetReadBuilder.build(), task.residual(), readSchema);
}
 
Example #13
Source File: IcebergInputFormat.java    From iceberg with Apache License 2.0
private CloseableIterable<T> newAvroIterable(
    InputFile inputFile, FileScanTask task, Schema readSchema) {
  Avro.ReadBuilder avroReadBuilder = Avro.read(inputFile)
      .project(readSchema)
      .split(task.start(), task.length());
  if (reuseContainers) {
    avroReadBuilder.reuseContainers();
  }

  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      //TODO implement value readers for Pig and Hive
      throw new UnsupportedOperationException("Avro support not yet supported for Pig and Hive");
    case GENERIC:
      avroReadBuilder.createReaderFunc(
          (expIcebergSchema, expAvroSchema) ->
              DataReader.create(expIcebergSchema, expAvroSchema,
                  constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(avroReadBuilder.build(), task.residual(), readSchema);
}
 
Example #14
Source File: IcebergInputFormat.java    From iceberg with Apache License 2.0
private CloseableIterable<T> open(FileScanTask currentTask, Schema readSchema) {
  DataFile file = currentTask.file();
  // TODO we should make use of FileIO to create inputFile
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());
  CloseableIterable<T> iterable;
  switch (file.format()) {
    case AVRO:
      iterable = newAvroIterable(inputFile, currentTask, readSchema);
      break;
    case ORC:
      iterable = newOrcIterable(inputFile, currentTask, readSchema);
      break;
    case PARQUET:
      iterable = newParquetIterable(inputFile, currentTask, readSchema);
      break;
    default:
      throw new UnsupportedOperationException(
          String.format("Cannot read %s file: %s", file.format().name(), file.path()));
  }

  return iterable;
}
 
Example #15
Source File: TestMetricsRowGroupFilterTypes.java    From iceberg with Apache License 2.0
public void createOrcInputFile(List<Record> records) throws IOException {
  if (ORC_FILE.exists()) {
    Assert.assertTrue(ORC_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(ORC_FILE);
  try (FileAppender<Record> appender = ORC.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    appender.addAll(records);
  }

  InputFile inFile = Files.localInput(ORC_FILE);
  try (Reader reader = OrcFile.createReader(new Path(inFile.location()),
      OrcFile.readerOptions(new Configuration()))) {
    Assert.assertEquals("Should create only one stripe", 1, reader.getStripes().size());
  }

  ORC_FILE.deleteOnExit();
}
 
Example #16
Source File: TestMetricsRowGroupFilterTypes.java    From iceberg with Apache License 2.0
public void createParquetInputFile(List<Record> records) throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericParquetWriter::buildWriter)
      .build()) {
    appender.addAll(records);
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);
  try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) {
    Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
    rowGroupMetadata = reader.getRowGroups().get(0);
    parquetSchema = reader.getFileMetaData().getSchema();
  }

  PARQUET_FILE.deleteOnExit();
}
 
Example #17
Source File: RowDataReader.java    From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> open(FileScanTask task, Schema readSchema, Map<Integer, ?> idToConstant) {
  CloseableIterable<InternalRow> iter;
  if (task.isDataTask()) {
    iter = newDataIterable(task.asDataTask(), readSchema);
  } else {
    InputFile location = getInputFile(task);
    Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");

    switch (task.file().format()) {
      case PARQUET:
        iter = newParquetIterable(location, task, readSchema, idToConstant);
        break;

      case AVRO:
        iter = newAvroIterable(location, task, readSchema, idToConstant);
        break;

      case ORC:
        iter = newOrcIterable(location, task, readSchema, idToConstant);
        break;

      default:
        throw new UnsupportedOperationException(
            "Cannot read unknown format: " + task.file().format());
    }
  }

  return iter;
}
 
Example #18
Source File: Spark3Util.java    From iceberg with Apache License 2.0
public static boolean isLocalityEnabled(FileIO io, String location, CaseInsensitiveStringMap readOptions) {
  InputFile in = io.newInputFile(location);
  if (in instanceof HadoopInputFile) {
    String scheme = ((HadoopInputFile) in).getFileSystem().getScheme();
    return readOptions.getBoolean("locality", LOCALITY_WHITELIST_FS.contains(scheme));
  }
  return false;
}
 
Example #19
Source File: DataFiles.java    From iceberg with Apache License 2.0
public static DataFile fromInputFile(InputFile file, PartitionData partition, long rowCount) {
  if (file instanceof HadoopInputFile) {
    return fromStat(((HadoopInputFile) file).getStat(), partition, rowCount);
  }

  String location = file.location();
  FileFormat format = FileFormat.fromFileName(location);
  return new GenericDataFile(
      location, format, partition, rowCount, file.getLength());
}
 
Example #20
Source File: ReadConf.java    From iceberg with Apache License 2.0
private static ParquetFileReader newReader(InputFile file, ParquetReadOptions options) {
  try {
    return ParquetFileReader.open(ParquetIO.file(file), options);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to open Parquet file: %s", file.location());
  }
}
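
ParquetIO.file(...) performs the same InputFile-to-Parquet adaptation that Example #2 implements inline.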
 
Example #21
Source File: ParquetReader.java    From iceberg with Apache License 2.0
public ParquetReader(InputFile input, Schema expectedSchema, ParquetReadOptions options,
                     Function<MessageType, ParquetValueReader<?>> readerFunc, NameMapping nameMapping,
                     Expression filter, boolean reuseContainers, boolean caseSensitive) {
  this.input = input;
  this.expectedSchema = expectedSchema;
  this.options = options;
  this.readerFunc = readerFunc;
  // replace alwaysTrue with null to avoid extra work evaluating a trivial filter
  this.filter = filter == Expressions.alwaysTrue() ? null : filter;
  this.reuseContainers = reuseContainers;
  this.caseSensitive = caseSensitive;
  this.nameMapping = nameMapping;
}
 
Example #22
Source File: BaseRewriteManifests.java    From iceberg with Apache License 2.0
private ManifestFile copyManifest(ManifestFile manifest) {
  TableMetadata current = ops.current();
  InputFile toCopy = ops.io().newInputFile(manifest.path());
  OutputFile newFile = newManifestOutput();
  return ManifestFiles.copyRewriteManifest(
      current.formatVersion(), toCopy, specsById, newFile, snapshotId(), summaryBuilder);
}
 
Example #23
Source File: RowDataReader.java    From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> newOrcIterable(
    InputFile location,
    FileScanTask task,
    Schema readSchema,
    Map<Integer, ?> idToConstant) {
  return ORC.read(location)
      .project(readSchema)
      .split(task.start(), task.length())
      .createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant))
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .build();
}
 
Example #24
Source File: DataFiles.java    From iceberg with Apache License 2.0
public static DataFile fromInputFile(InputFile file, long rowCount) {
  if (file instanceof HadoopInputFile) {
    return fromStat(((HadoopInputFile) file).getStat(), rowCount);
  }

  String location = file.location();
  FileFormat format = FileFormat.fromFileName(location);
  return new GenericDataFile(location, format, rowCount, file.getLength());
}
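
A short usage sketch for this overload; the path and row count below are made up for illustration:

// Hypothetical usage: describe an existing local Parquet file as a DataFile.
// The format is inferred from the ".parquet" suffix, and the length is read
// from the file itself.
InputFile file = Files.localInput("/tmp/demo/00000-0-data.parquet");
DataFile dataFile = DataFiles.fromInputFile(file, 100L);  // 100 rows, assumed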
 
Example #25
Source File: TestManifestListVersions.java    From iceberg with Apache License 2.0
private InputFile writeManifestList(ManifestFile manifest, int formatVersion) throws IOException {
  OutputFile manifestList = Files.localOutput(temp.newFile());
  try (FileAppender<ManifestFile> writer = ManifestLists.write(
      formatVersion, manifestList, SNAPSHOT_ID, SNAPSHOT_ID - 1, formatVersion > 1 ? SEQ_NUM : 0)) {
    writer.add(manifest);
  }
  return manifestList.toInputFile();
}
 
Example #26
Source File: ManifestFiles.java    From iceberg with Apache License 2.0
static ManifestFile copyRewriteManifest(int formatVersion,
                                        InputFile toCopy, Map<Integer, PartitionSpec> specsById,
                                        OutputFile outputFile, long snapshotId,
                                        SnapshotSummary.Builder summaryBuilder) {
  // for a rewritten manifest all snapshot ids should be set. use empty metadata to throw an exception if it is not
  InheritableMetadata inheritableMetadata = InheritableMetadataFactory.empty();
  try (ManifestReader<DataFile> reader =
           new ManifestReader<>(toCopy, specsById, inheritableMetadata, FileType.DATA_FILES)) {
    return copyManifestInternal(
        formatVersion, reader, outputFile, snapshotId, summaryBuilder, ManifestEntry.Status.EXISTING);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to close manifest: %s", toCopy.location());
  }
}
 
Example #27
Source File: AvroIterable.java    From iceberg with Apache License 2.0
AvroIterable(InputFile file, DatumReader<D> reader,
             Long start, Long length, boolean reuseContainers) {
  this.file = file;
  this.reader = reader;
  this.start = start;
  this.end = start != null ? start + length : null;
  this.reuseContainers = reuseContainers;
}
 
Example #28
Source File: TableMetadataParser.java    From iceberg with Apache License 2.0
public static TableMetadata read(FileIO io, InputFile file) {
  Codec codec = Codec.fromFileName(file.location());
  try (InputStream is = codec == Codec.GZIP ? new GZIPInputStream(file.newStream()) : file.newStream()) {
    return fromJson(io, file, JsonUtil.mapper().readValue(is, JsonNode.class));
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to read file: %s", file);
  }
}
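
A sketch of how this entry point might be called, assuming the Hadoop classes are on the classpath; the metadata location is hypothetical:

// Hypothetical usage: read a table's metadata file through HadoopFileIO.
FileIO io = new HadoopFileIO(new Configuration());
InputFile metadataFile = io.newInputFile("/warehouse/db/tbl/metadata/v3.metadata.json");
TableMetadata metadata = TableMetadataParser.read(io, metadataFile);
System.out.println(metadata.location());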
 
Example #29
Source File: ManifestFiles.java    From iceberg with Apache License 2.0
static ManifestFile copyAppendManifest(int formatVersion,
                                       InputFile toCopy, Map<Integer, PartitionSpec> specsById,
                                       OutputFile outputFile, long snapshotId,
                                       SnapshotSummary.Builder summaryBuilder) {
  // use metadata that will add the current snapshot's ID for the rewrite
  InheritableMetadata inheritableMetadata = InheritableMetadataFactory.forCopy(snapshotId);
  try (ManifestReader<DataFile> reader =
           new ManifestReader<>(toCopy, specsById, inheritableMetadata, FileType.DATA_FILES)) {
    return copyManifestInternal(
        formatVersion, reader, outputFile, snapshotId, summaryBuilder, ManifestEntry.Status.ADDED);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to close manifest: %s", toCopy.location());
  }
}
 
Example #30
Source File: StaticDataTask.java    From iceberg with Apache License 2.0
private StaticDataTask(InputFile metadata, StructLike[] rows) {
  this.metadataFile = DataFiles.builder()
      .withInputFile(metadata)
      .withRecordCount(rows.length)
      .withFormat(FileFormat.METADATA)
      .build();
  this.rows = rows;
}