org.apache.iceberg.Metrics Java Examples

The following examples show how to use org.apache.iceberg.Metrics. They are drawn from open-source projects; the source file, originating project, and license are noted above each example.
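
For context, a Metrics instance bundles per-file column statistics, with every map keyed by Iceberg field ID. Below is a minimal construction sketch using the six-argument constructor from Example #3; the field IDs, counts, and bound bytes are made up for illustration (the four-argument form seen in Examples #6 and #16 omits the bounds maps):

import java.nio.ByteBuffer;
import com.google.common.collect.ImmutableMap;
import org.apache.iceberg.Metrics;

Metrics metrics = new Metrics(
    100L,                                                 // recordCount
    ImmutableMap.of(1, 2048L),                            // columnSizes (bytes on disk)
    ImmutableMap.of(1, 100L),                             // valueCounts
    ImmutableMap.of(1, 0L),                               // nullValueCounts
    ImmutableMap.of(1, ByteBuffer.wrap(new byte[] {0})),  // lowerBounds
    ImmutableMap.of(1, ByteBuffer.wrap(new byte[] {9}))); // upperBounds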
Example #1
Source File: ParquetRecordWriter.java    From dremio-oss with Apache License 2.0
private byte[] getIcebergMetaData() throws IOException {
  if (!this.isIcebergWriter) {
    return null;
  }

  final long fileSize = parquetFileWriter.getPos();
  DataFiles.Builder dataFileBuilder =
    DataFiles.builder(IcebergCatalog.getIcebergPartitionSpec(this.batchSchema, this.partitionColumns))
      .withPath(path.toString())
      .withFileSizeInBytes(fileSize)
      .withRecordCount(recordCount)
      .withFormat(FileFormat.PARQUET);

  // add partition info
  if (partitionColumns != null) {
    dataFileBuilder = dataFileBuilder.withPartition(partition.getIcebergPartitionData());
  }

  // add column level metrics
  Metrics metrics = footerMetricsToIcebergMetrics(parquetFileWriter.getFooter(), batchSchema);
  dataFileBuilder = dataFileBuilder.withMetrics(metrics);
  return IcebergSerDe.serializeDataFile(dataFileBuilder.build());
}
 
Example #2
Source File: BaseWriter.java    From iceberg with Apache License 2.0
protected void closeCurrent() throws IOException {
  if (currentAppender != null) {
    currentAppender.close();
    // metrics are only valid after the appender is closed
    Metrics metrics = currentAppender.metrics();
    long fileSizeInBytes = currentAppender.length();
    List<Long> splitOffsets = currentAppender.splitOffsets();
    this.currentAppender = null;

    if (metrics.recordCount() == 0L) {
      io.deleteFile(currentFile.encryptingOutputFile());
    } else {
      DataFile dataFile = DataFiles.builder(spec)
          .withEncryptionKeyMetadata(currentFile.keyMetadata())
          .withPath(currentFile.encryptingOutputFile().location())
          .withFileSizeInBytes(fileSizeInBytes)
          .withPartition(spec.fields().size() == 0 ? null : currentKey) // set null if unpartitioned
          .withMetrics(metrics)
          .withSplitOffsets(splitOffsets)
          .build();
      completedFiles.add(dataFile);
    }

    this.currentFile = null;
  }
}
 
Example #3
Source File: MetricsWrapper.java    From presto with Apache License 2.0
@JsonCreator
public MetricsWrapper(
        @JsonProperty("recordCount") Long recordCount,
        @JsonProperty("columnSizes") Map<Integer, Long> columnSizes,
        @JsonProperty("valueCounts") Map<Integer, Long> valueCounts,
        @JsonProperty("nullValueCounts") Map<Integer, Long> nullValueCounts,
        @JsonProperty("lowerBounds") Map<Integer, ByteBuffer> lowerBounds,
        @JsonProperty("upperBounds") Map<Integer, ByteBuffer> upperBounds)
{
    this(new Metrics(
            recordCount,
            columnSizes,
            valueCounts,
            nullValueCounts,
            lowerBounds,
            upperBounds));
}
 
Example #4
Source File: TestMetricsWrapper.java    From presto with Apache License 2.0
@Test
public void testRoundTrip()
{
    Long recordCount = 123L;
    Map<Integer, Long> columnSizes = ImmutableMap.of(3, 321L, 5, 543L);
    Map<Integer, Long> valueCounts = ImmutableMap.of(7, 765L, 9, 987L);
    Map<Integer, Long> nullValueCounts = ImmutableMap.of(2, 234L, 4, 456L);
    Map<Integer, ByteBuffer> lowerBounds = ImmutableMap.of(13, ByteBuffer.wrap(new byte[] {0, 8, 9}));
    Map<Integer, ByteBuffer> upperBounds = ImmutableMap.of(17, ByteBuffer.wrap(new byte[] {5, 4, 0}));

    Metrics expected = new Metrics(recordCount, columnSizes, valueCounts, nullValueCounts, lowerBounds, upperBounds);

    Metrics actual = CODEC.fromJson(CODEC.toJson(new MetricsWrapper(expected))).metrics();

    assertEquals(actual.recordCount(), recordCount);
    assertEquals(actual.columnSizes(), columnSizes);
    assertEquals(actual.valueCounts(), valueCounts);
    assertEquals(actual.nullValueCounts(), nullValueCounts);
    assertEquals(actual.lowerBounds(), lowerBounds);
    assertEquals(actual.upperBounds(), upperBounds);
}
 
Example #5
Source File: TestOrcMetrics.java    From iceberg with Apache License 2.0
@Override
protected <T> void assertBounds(int fieldId, Type type, T lowerBound, T upperBound, Metrics metrics) {
  if (isBinaryType(type)) {
    Assert.assertFalse("ORC binary field should not have lower bounds.",
        metrics.lowerBounds().containsKey(fieldId));
    Assert.assertFalse("ORC binary field should not have upper bounds.",
        metrics.upperBounds().containsKey(fieldId));
    return;
  }
  super.assertBounds(fieldId, type, lowerBound, upperBound, metrics);
}
 
Example #6
Source File: SparkTableUtil.java    From iceberg with Apache License 2.0
private static List<DataFile> listAvroPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);
    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
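          // No footer statistics are read for Avro files here: the record count is
          // set to -1 and all column-level stats are left null.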
          Metrics metrics = new Metrics(-1L, null, null, null);
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("avro")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();

        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}
 
Example #7
Source File: SparkTableUtil.java    From iceberg with Apache License 2.0
private static List<DataFile> listOrcPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);

    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
          Metrics metrics = OrcMetrics.fromInputFile(HadoopInputFile.fromPath(stat.getPath(), conf));
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("orc")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();

        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}
 
Example #8
Source File: OrcMetrics.java    From iceberg with Apache License 2.0
static Metrics fromWriter(Writer writer) {
  try {
    return buildOrcMetrics(writer.getNumberOfRows(), writer.getSchema(), writer.getStatistics());
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to get statistics from writer");
  }
}
 
Example #9
Source File: OrcMetrics.java    From iceberg with Apache License 2.0
static Metrics fromInputFile(InputFile file, Configuration config) {
  try (Reader orcReader = ORC.newFileReader(file, config)) {
    return buildOrcMetrics(orcReader.getNumberOfRows(), orcReader.getSchema(), orcReader.getStatistics());
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to open file: %s", file.location());
  }
}
 
Example #10
Source File: TestMetricsWrapper.java    From presto with Apache License 2.0
@Test
public void testAllPropertiesHandled()
{
    Set<String> properties = getJsonProperties(MetricsWrapper.class);
    for (Method method : Metrics.class.getMethods()) {
        if (method.getDeclaringClass().equals(Metrics.class)) {
            assertTrue(properties.contains(method.getName()), "Metrics method not in wrapper: " + method);
        }
    }
}
 
Example #11
Source File: ParquetUtil.java    From iceberg with Apache License 2.0
public static Metrics fileMetrics(InputFile file, MetricsConfig metricsConfig) {
  try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(file))) {
    return footerMetrics(reader.getFooter(), metricsConfig);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to read footer of file: %s", file);
  }
}
 
Example #12
Source File: IcebergPageSink.java    From presto with Apache License 2.0
private Metrics getMetrics(WriteContext writeContext)
{
    switch (fileFormat) {
        case PARQUET:
            return ParquetUtil.fileMetrics(HadoopInputFile.fromPath(writeContext.getPath(), jobConf), MetricsConfig.getDefault());
        case ORC:
            return writeContext.getWriter().getMetrics()
                    .orElseThrow(() -> new VerifyException("Iceberg ORC file writers should return Iceberg metrics"));
    }
    throw new PrestoException(NOT_SUPPORTED, "File format not supported for Iceberg: " + fileFormat);
}
 
Example #13
Source File: ParquetWriteAdapter.java    From iceberg with Apache License 2.0
@Override
public Metrics metrics() {
  Preconditions.checkState(footer != null, "Cannot produce metrics until closed");
  return ParquetUtil.footerMetrics(footer, metricsConfig);
}
 
Example #14
Source File: ParquetUtil.java    From iceberg with Apache License 2.0
static Metrics fileMetrics(InputFile file) {
  return fileMetrics(file, MetricsConfig.getDefault());
}
 
Example #15
Source File: ParquetWriter.java    From iceberg with Apache License 2.0
@Override
public Metrics metrics() {
  return ParquetUtil.footerMetrics(writer.getFooter(), metricsConfig);
}
 
Example #16
Source File: AvroFileAppender.java    From iceberg with Apache License 2.0
@Override
public Metrics metrics() {
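  // Avro appenders track only a record count; no column-level stats are collected.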
  return new Metrics(numRecords, null, null, null);
}
 
Example #17
Source File: TestParquetMetrics.java    From iceberg with Apache License 2.0
@Override
public Metrics getMetrics(InputFile file) {
  return ParquetUtil.fileMetrics(file);
}
 
Example #18
Source File: OrcFileAppender.java    From iceberg with Apache License 2.0
@Override
public Metrics metrics() {
  Preconditions.checkState(isClosed,
      "Cannot return metrics while appending to an open file.");
  return OrcMetrics.fromWriter(writer);
}
 
Example #19
Source File: TestOrcMetrics.java    From iceberg with Apache License 2.0
@Override
public Metrics getMetrics(InputFile file) {
  return OrcMetrics.fromInputFile(file);
}
 
Example #20
Source File: OrcMetrics.java    From iceberg with Apache License 2.0
private static Metrics buildOrcMetrics(final long numOfRows, final TypeDescription orcSchema,
                                       final ColumnStatistics[] colStats) {
  final Schema schema = ORCSchemaUtil.convert(orcSchema);
  final Set<TypeDescription> columnsInContainers = findColumnsInContainers(schema, orcSchema);
  Map<Integer, Long> columnSizes = Maps.newHashMapWithExpectedSize(colStats.length);
  Map<Integer, Long> valueCounts = Maps.newHashMapWithExpectedSize(colStats.length);
  Map<Integer, Long> nullCounts = Maps.newHashMapWithExpectedSize(colStats.length);
  Map<Integer, ByteBuffer> lowerBounds = Maps.newHashMap();
  Map<Integer, ByteBuffer> upperBounds = Maps.newHashMap();

  for (int i = 0; i < colStats.length; i++) {
    final ColumnStatistics colStat = colStats[i];
    final TypeDescription orcCol = orcSchema.findSubtype(i);
    final Optional<Types.NestedField> icebergColOpt = ORCSchemaUtil.icebergID(orcCol)
        .map(schema::findField);

    if (icebergColOpt.isPresent()) {
      final Types.NestedField icebergCol = icebergColOpt.get();
      final int fieldId = icebergCol.fieldId();

      columnSizes.put(fieldId, colStat.getBytesOnDisk());

      if (!columnsInContainers.contains(orcCol)) {
        // Since ORC does not track null or repeated values, the value count for columns in
        // containers (maps, lists) may be larger than it actually is; however, these are not
        // used in expressions right now. For such cases, we use the number of values
        // directly stored in ORC.
        if (colStat.hasNull()) {
          nullCounts.put(fieldId, numOfRows - colStat.getNumberOfValues());
        } else {
          nullCounts.put(fieldId, 0L);
        }
        valueCounts.put(fieldId, colStat.getNumberOfValues() + nullCounts.get(fieldId));

        Optional<ByteBuffer> orcMin = (colStat.getNumberOfValues() > 0) ?
            fromOrcMin(icebergCol, colStat) : Optional.empty();
        orcMin.ifPresent(byteBuffer -> lowerBounds.put(icebergCol.fieldId(), byteBuffer));
        Optional<ByteBuffer> orcMax = (colStat.getNumberOfValues() > 0) ?
            fromOrcMax(icebergCol, colStat) : Optional.empty();
        orcMax.ifPresent(byteBuffer -> upperBounds.put(icebergCol.fieldId(), byteBuffer));
      }
    }
  }

  return new Metrics(numOfRows,
      columnSizes,
      valueCounts,
      nullCounts,
      lowerBounds,
      upperBounds);
}
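
To make the count bookkeeping above concrete, here is a small worked example; the numbers are illustrative, not from any real file:

// A file with 100 rows where ORC recorded 90 non-null values for a top-level column:
long numOfRows = 100L;
long numberOfValues = 90L;                     // colStat.getNumberOfValues()
long nullCount = numOfRows - numberOfValues;   // 100 - 90 = 10
long valueCount = numberOfValues + nullCount;  // 90 + 10 = 100, i.e. the row count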
 
Example #21
Source File: OrcMetrics.java    From iceberg with Apache License 2.0
public static Metrics fromInputFile(InputFile file) {
  final Configuration config = (file instanceof HadoopInputFile) ?
      ((HadoopInputFile) file).getConf() : new Configuration();
  return fromInputFile(file, config);
}
 
Example #22
Source File: MetricsWrapper.java    From presto with Apache License 2.0
public Metrics metrics()
{
    return metrics;
}
 
Example #23
Source File: MetricsWrapper.java    From presto with Apache License 2.0
public MetricsWrapper(Metrics metrics)
{
    this.metrics = requireNonNull(metrics, "metrics is null");
}
 
Example #24
Source File: IcebergOrcFileWriter.java    From presto with Apache License 2.0
private static Metrics computeMetrics(Schema icebergSchema, ColumnMetadata<OrcType> orcColumns, long fileRowCount, Optional<ColumnMetadata<ColumnStatistics>> columnStatistics)
{
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null);
    }
    // Columns that are descendants of LIST or MAP types are excluded because:
    // 1. Their stats are not used by Apache Iceberg to filter out data files
    // 2. Their record count can be larger than table-level row count. There's no good way to calculate nullCounts for them.
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<OrcColumnId> excludedColumns = getExcludedColumns(orcColumns);

    ImmutableMap.Builder<Integer, Long> valueCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsBuilder = ImmutableMap.builder();

    // OrcColumnId(0) is the root column that represents file-level schema
    for (int i = 1; i < orcColumns.size(); i++) {
        OrcColumnId orcColumnId = new OrcColumnId(i);
        if (excludedColumns.contains(orcColumnId)) {
            continue;
        }
        OrcType orcColumn = orcColumns.get(orcColumnId);
        ColumnStatistics orcColumnStats = columnStatistics.get().get(orcColumnId);
        int icebergId = getIcebergId(orcColumn);
        Types.NestedField icebergField = icebergSchema.findField(icebergId);
        verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);
        valueCountsBuilder.put(icebergId, fileRowCount);
        if (orcColumnStats.hasNumberOfValues()) {
            nullCountsBuilder.put(icebergId, fileRowCount - orcColumnStats.getNumberOfValues());
        }
        toIcebergMinMax(orcColumnStats, icebergField.type()).ifPresent(minMax -> {
            lowerBoundsBuilder.put(icebergId, minMax.getMin());
            upperBoundsBuilder.put(icebergId, minMax.getMax());
        });
    }
    Map<Integer, Long> valueCounts = valueCountsBuilder.build();
    Map<Integer, Long> nullCounts = nullCountsBuilder.build();
    Map<Integer, ByteBuffer> lowerBounds = lowerBoundsBuilder.build();
    Map<Integer, ByteBuffer> upperBounds = upperBoundsBuilder.build();
    return new Metrics(
            fileRowCount,
            null, // TODO: Add column size accounting to ORC column writers
            valueCounts.isEmpty() ? null : valueCounts,
            nullCounts.isEmpty() ? null : nullCounts,
            lowerBounds.isEmpty() ? null : lowerBounds,
            upperBounds.isEmpty() ? null : upperBounds);
}
 
Example #25
Source File: IcebergOrcFileWriter.java    From presto with Apache License 2.0
@Override
public Optional<Metrics> getMetrics()
{
    return Optional.of(computeMetrics(icebergSchema, orcColumns, orcWriter.getFileRowCount(), orcWriter.getFileStats()));
}
 
Example #26
Source File: IcebergRecordFileWriter.java    From presto with Apache License 2.0
@Override
public Optional<Metrics> getMetrics()
{
    return Optional.empty();
}
 
Example #27
Source File: FileAppender.java    From iceberg with Apache License 2.0
/**
 * @return {@link Metrics} for this file. Only valid after the file is closed.
 */
Metrics metrics();
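
A minimal sketch of honoring that contract, in the same write-close-then-read pattern as Examples #2 and #18 (the helper name and generic signature below are mine, not from the source):

import java.io.IOException;
import org.apache.iceberg.Metrics;
import org.apache.iceberg.io.FileAppender;

static <D> Metrics writeAndCollectMetrics(FileAppender<D> appender, Iterable<D> rows)
    throws IOException {
  try {
    for (D row : rows) {
      appender.add(row);
    }
  } finally {
    appender.close();  // finalizes the file; stats are computed on close
  }
  return appender.metrics();  // valid only after close()
}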
 
Example #28
Source File: IcebergFileWriter.java    From presto with Apache License 2.0
Optional<Metrics> getMetrics();