Java Code Examples for org.apache.iceberg.Metrics

The following examples show how to use org.apache.iceberg.Metrics. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: presto   Source File: MetricsWrapper.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * JSON creator: assembles a {@link Metrics} from the individual JSON properties and
 * delegates to the constructor that accepts a {@code Metrics} instance.
 */
@JsonCreator
public MetricsWrapper(
        @JsonProperty("recordCount") Long recordCount,
        @JsonProperty("columnSizes") Map<Integer, Long> columnSizes,
        @JsonProperty("valueCounts") Map<Integer, Long> valueCounts,
        @JsonProperty("nullValueCounts") Map<Integer, Long> nullValueCounts,
        @JsonProperty("lowerBounds") Map<Integer, ByteBuffer> lowerBounds,
        @JsonProperty("upperBounds") Map<Integer, ByteBuffer> upperBounds)
{
    // Argument order mirrors the six-argument Metrics constructor.
    this(new Metrics(recordCount, columnSizes, valueCounts, nullValueCounts, lowerBounds, upperBounds));
}
 
Example 2
Source Project: presto   Source File: TestMetricsWrapper.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Serializes a {@link MetricsWrapper} to JSON with {@code CODEC}, reads it back, and
 * checks that every field of the wrapped {@link Metrics} is unchanged.
 */
@Test
public void testRoundTrip()
{
    Long recordCount = 123L;
    Map<Integer, Long> columnSizes = ImmutableMap.of(3, 321L, 5, 543L);
    Map<Integer, Long> valueCounts = ImmutableMap.of(7, 765L, 9, 987L);
    Map<Integer, Long> nullValueCounts = ImmutableMap.of(2, 234L, 4, 456L);
    Map<Integer, ByteBuffer> lowerBounds = ImmutableMap.of(13, ByteBuffer.wrap(new byte[] {0, 8, 9}));
    Map<Integer, ByteBuffer> upperBounds = ImmutableMap.of(17, ByteBuffer.wrap(new byte[] {5, 4, 0}));

    MetricsWrapper original = new MetricsWrapper(
            new Metrics(recordCount, columnSizes, valueCounts, nullValueCounts, lowerBounds, upperBounds));

    // wrapper -> JSON -> wrapper
    MetricsWrapper decoded = CODEC.fromJson(CODEC.toJson(original));
    Metrics actual = decoded.metrics();

    assertEquals(actual.recordCount(), recordCount);
    assertEquals(actual.columnSizes(), columnSizes);
    assertEquals(actual.valueCounts(), valueCounts);
    assertEquals(actual.nullValueCounts(), nullValueCounts);
    assertEquals(actual.lowerBounds(), lowerBounds);
    assertEquals(actual.upperBounds(), upperBounds);
}
 
Example 3
Source Project: iceberg   Source File: BaseWriter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Closes the appender of the file currently being written, if there is one.
 *
 * <p>A file that ended up with zero records is deleted; otherwise a {@link DataFile}
 * describing it is added to {@code completedFiles}. In both cases the current appender
 * and current file references are cleared.
 *
 * @throws IOException if closing the appender fails
 */
protected void closeCurrent() throws IOException {
  if (currentAppender == null) {
    return;
  }

  currentAppender.close();
  // the appender's metrics may only be read once it has been closed
  Metrics fileMetrics = currentAppender.metrics();
  long sizeInBytes = currentAppender.length();
  List<Long> offsets = currentAppender.splitOffsets();
  this.currentAppender = null;

  boolean wroteNothing = fileMetrics.recordCount() == 0L;
  if (wroteNothing) {
    io.deleteFile(currentFile.encryptingOutputFile());
  } else {
    DataFile completed = DataFiles.builder(spec)
        .withEncryptionKeyMetadata(currentFile.keyMetadata())
        .withPath(currentFile.encryptingOutputFile().location())
        .withFileSizeInBytes(sizeInBytes)
        // an unpartitioned spec gets a null partition
        .withPartition(spec.fields().size() == 0 ? null : currentKey)
        .withMetrics(fileMetrics)
        .withSplitOffsets(offsets)
        .build();
    completedFiles.add(completed);
  }

  this.currentFile = null;
}
 
Example 4
Source Project: dremio-oss   Source File: ParquetRecordWriter.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Builds the Iceberg data-file description for the Parquet file being written and
 * returns it in serialized form.
 *
 * @return the serialized data file, or {@code null} when this writer is not an Iceberg writer
 * @throws IOException declared for the underlying writer and serialization calls
 */
private byte[] getIcebergMetaData() throws IOException {
  if (!this.isIcebergWriter) {
    return null;
  }

  final long bytesWritten = parquetFileWriter.getPos();
  DataFiles.Builder builder = DataFiles
      .builder(IcebergCatalog.getIcebergPartitionSpec(this.batchSchema, this.partitionColumns))
      .withPath(path.toString())
      .withFileSizeInBytes(bytesWritten)
      .withRecordCount(recordCount)
      .withFormat(FileFormat.PARQUET);

  // partition info is attached only when partition columns are configured
  if (partitionColumns != null) {
    builder = builder.withPartition(partition.getIcebergPartitionData());
  }

  // column-level metrics are derived from the Parquet footer
  Metrics footerMetrics = footerMetricsToIcebergMetrics(parquetFileWriter.getFooter(), batchSchema);
  builder = builder.withMetrics(footerMetrics);

  return IcebergSerDe.serializeDataFile(builder.build());
}
 
Example 5
Source Project: presto   Source File: IcebergPageSink.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Obtains the Iceberg metrics for the file described by {@code writeContext}.
 *
 * <p>For PARQUET the metrics are read back from the written file; for ORC they are taken
 * from the file writer, which must supply them.
 *
 * @throws PrestoException with {@code NOT_SUPPORTED} for any other file format
 */
private Metrics getMetrics(WriteContext writeContext)
{
    switch (fileFormat) {
        case PARQUET: {
            HadoopInputFile writtenFile = HadoopInputFile.fromPath(writeContext.getPath(), jobConf);
            return ParquetUtil.fileMetrics(writtenFile, MetricsConfig.getDefault());
        }
        case ORC: {
            return writeContext.getWriter()
                    .getMetrics()
                    .orElseThrow(() -> new VerifyException("Iceberg ORC file writers should return Iceberg metrics"));
        }
        default:
            throw new PrestoException(NOT_SUPPORTED, "File format not supported for Iceberg: " + fileFormat);
    }
}
 
Example 6
Source Project: presto   Source File: TestMetricsWrapper.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that every public method declared by {@link Metrics} has a JSON property of
 * the same name on {@link MetricsWrapper}, so that a newly added metric is not silently
 * dropped during serialization.
 */
@Test
public void testAllPropertiesHandled()
{
    Set<String> properties = getJsonProperties(MetricsWrapper.class);
    for (Method method : Metrics.class.getMethods()) {
        // getMethods() also returns inherited public methods (e.g. from Object); only the
        // ones declared by Metrics itself need a matching property. The declaring class
        // must be compared with Metrics.class: comparing with Method.class (the reflection
        // type) never matches, which left this test without any assertion.
        if (method.getDeclaringClass().equals(Metrics.class)) {
            assertTrue(properties.contains(method.getName()), "Metrics method not in wrapper: " + method);
        }
    }
}
 
Example 7
Source Project: iceberg   Source File: OrcMetrics.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Opens {@code file} as an ORC file using {@code config} and builds {@link Metrics} from
 * the reader's row count, schema and statistics. The reader is closed before returning.
 *
 * @throws RuntimeIOException if an {@link IOException} is raised while reading the file
 */
static Metrics fromInputFile(InputFile file, Configuration config) {
  try (Reader orcReader = ORC.newFileReader(file, config)) {
    Metrics fileMetrics = buildOrcMetrics(
        orcReader.getNumberOfRows(),
        orcReader.getSchema(),
        orcReader.getStatistics());
    return fileMetrics;
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to open file: %s", file.location());
  }
}
 
Example 8
Source Project: iceberg   Source File: OrcMetrics.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds {@link Metrics} from the row count, schema and statistics reported by an ORC
 * {@code writer}.
 *
 * @throws RuntimeIOException if an {@link IOException} is raised while querying the writer
 */
static Metrics fromWriter(Writer writer) {
  try {
    Metrics writerMetrics = buildOrcMetrics(
        writer.getNumberOfRows(),
        writer.getSchema(),
        writer.getStatistics());
    return writerMetrics;
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to get statistics from writer");
  }
}
 
Example 9
Source Project: iceberg   Source File: TestOrcMetrics.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * ORC-specific bounds check: for binary-typed fields the metrics must contain neither a
 * lower nor an upper bound; every other type is delegated to the superclass check.
 */
@Override
protected <T> void assertBounds(int fieldId, Type type, T lowerBound, T upperBound, Metrics metrics) {
  if (isBinaryType(type)) {
    Assert.assertFalse("ORC binary field should not have lower bounds.",
        metrics.lowerBounds().containsKey(fieldId));
    // Check the upper-bounds map here; the lower-bounds map was previously checked twice,
    // so an unexpected upper bound would have gone unnoticed.
    Assert.assertFalse("ORC binary field should not have upper bounds.",
        metrics.upperBounds().containsKey(fieldId));
    return;
  }
  super.assertBounds(fieldId, type, lowerBound, upperBound, metrics);
}
 
Example 10
Source Project: iceberg   Source File: SparkTableUtil.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Lists the files directly under {@code partitionUri} (filtered by
 * {@code HIDDEN_PATH_FILTER}) and describes each one as an Avro {@link DataFile}.
 *
 * <p>Each file gets a {@link Metrics} with a record count of {@code -1} and no column
 * metrics. The partition path is assembled as {@code name=value} pairs, joined with
 * {@code /}, for every field of {@code spec}.
 */
private static List<DataFile> listAvroPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partitionDir = new Path(partitionUri);
    FileSystem fileSystem = partitionDir.getFileSystem(conf);
    FileStatus[] entries = fileSystem.listStatus(partitionDir, HIDDEN_PATH_FILTER);

    return Arrays.stream(entries)
        .filter(FileStatus::isFile)
        .map(status -> {
          Metrics placeholderMetrics = new Metrics(-1L, null, null, null);
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(fieldName -> String.format("%s=%s", fieldName, partitionPath.get(fieldName)))
              .collect(Collectors.joining("/"));
          Path filePath = status.getPath();

          return DataFiles.builder(spec)
              .withPath(filePath.toString())
              .withFormat("avro")
              .withFileSizeInBytes(status.getLen())
              .withMetrics(placeholderMetrics)
              .withPartitionPath(partitionKey)
              .build();
        })
        .collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}
 
Example 11
Source Project: iceberg   Source File: SparkTableUtil.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Lists the files directly under {@code partitionUri} (filtered by
 * {@code HIDDEN_PATH_FILTER}) and describes each one as an ORC {@link DataFile}.
 *
 * <p>Metrics are obtained per file through {@code OrcMetrics.fromInputFile}. The
 * partition path is assembled as {@code name=value} pairs, joined with {@code /}, for
 * every field of {@code spec}.
 */
private static List<DataFile> listOrcPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partitionDir = new Path(partitionUri);
    FileSystem fileSystem = partitionDir.getFileSystem(conf);
    FileStatus[] entries = fileSystem.listStatus(partitionDir, HIDDEN_PATH_FILTER);

    return Arrays.stream(entries)
        .filter(FileStatus::isFile)
        .map(status -> {
          Metrics orcMetrics = OrcMetrics.fromInputFile(HadoopInputFile.fromPath(status.getPath(), conf));
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(fieldName -> String.format("%s=%s", fieldName, partitionPath.get(fieldName)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(status.getPath().toString())
              .withFormat("orc")
              .withFileSizeInBytes(status.getLen())
              .withMetrics(orcMetrics)
              .withPartitionPath(partitionKey)
              .build();
        })
        .collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}
 
Example 12
Source Project: iceberg   Source File: ParquetUtil.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Opens {@code file} as a Parquet file and derives {@link Metrics} from its footer using
 * {@code metricsConfig}. The reader is closed before returning.
 *
 * @throws RuntimeIOException if an {@link IOException} is raised while reading the file
 */
public static Metrics fileMetrics(InputFile file, MetricsConfig metricsConfig) {
  try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(file))) {
    Metrics result = footerMetrics(reader.getFooter(), metricsConfig);
    return result;
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to read footer of file: %s", file);
  }
}
 
Example 13
Source Project: presto   Source File: IcebergRecordFileWriter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * This writer does not produce Iceberg metrics; the result is always empty.
 */
@Override
public Optional<Metrics> getMetrics()
{
    return Optional.empty();
}
 
Example 14
Source Project: presto   Source File: IcebergOrcFileWriter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Computes Iceberg metrics from the ORC writer's file row count and file statistics.
 *
 * @return a non-empty {@link Optional} holding the computed metrics
 */
@Override
public Optional<Metrics> getMetrics()
{
    Metrics computed = computeMetrics(icebergSchema, orcColumns, orcWriter.getFileRowCount(), orcWriter.getFileStats());
    return Optional.of(computed);
}
 
Example 15
Source Project: presto   Source File: IcebergOrcFileWriter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Translates ORC file-level column statistics into Iceberg {@link Metrics}.
 *
 * <p>When no column statistics are available, only the row count is reported. Otherwise
 * every non-excluded ORC column (ids {@code 1..size-1}) contributes a value count, a null
 * count when the ORC statistics carry a number of values, and lower/upper bounds when a
 * min/max can be converted. Maps that end up empty are passed to {@code Metrics} as
 * {@code null}. Column sizes are always {@code null}.
 */
private static Metrics computeMetrics(Schema icebergSchema, ColumnMetadata<OrcType> orcColumns, long fileRowCount, Optional<ColumnMetadata<ColumnStatistics>> columnStatistics)
{
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null);
    }
    ColumnMetadata<ColumnStatistics> fileStatistics = columnStatistics.get();

    // Columns that are descendants of LIST or MAP types are excluded because:
    // 1. Their stats are not used by Apache Iceberg to filter out data files
    // 2. Their record count can be larger than table-level row count. There's no good way to calculate nullCounts for them.
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<OrcColumnId> excludedColumns = getExcludedColumns(orcColumns);

    ImmutableMap.Builder<Integer, Long> valueCounts = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCounts = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBounds = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBounds = ImmutableMap.builder();

    // Start at 1: OrcColumnId(0) is the root column that represents the file-level schema
    for (int columnIndex = 1; columnIndex < orcColumns.size(); columnIndex++) {
        OrcColumnId columnId = new OrcColumnId(columnIndex);
        if (excludedColumns.contains(columnId)) {
            continue;
        }

        OrcType orcType = orcColumns.get(columnId);
        ColumnStatistics statistics = fileStatistics.get(columnId);
        int icebergId = getIcebergId(orcType);
        Types.NestedField icebergField = icebergSchema.findField(icebergId);
        verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);

        valueCounts.put(icebergId, fileRowCount);
        if (statistics.hasNumberOfValues()) {
            long nonNullValues = statistics.getNumberOfValues();
            nullCounts.put(icebergId, fileRowCount - nonNullValues);
        }
        toIcebergMinMax(statistics, icebergField.type()).ifPresent(minMax -> {
            lowerBounds.put(icebergId, minMax.getMin());
            upperBounds.put(icebergId, minMax.getMax());
        });
    }

    Map<Integer, Long> builtValueCounts = valueCounts.build();
    Map<Integer, Long> builtNullCounts = nullCounts.build();
    Map<Integer, ByteBuffer> builtLowerBounds = lowerBounds.build();
    Map<Integer, ByteBuffer> builtUpperBounds = upperBounds.build();

    return new Metrics(
            fileRowCount,
            null, // TODO: Add column size accounting to ORC column writers
            builtValueCounts.isEmpty() ? null : builtValueCounts,
            builtNullCounts.isEmpty() ? null : builtNullCounts,
            builtLowerBounds.isEmpty() ? null : builtLowerBounds,
            builtUpperBounds.isEmpty() ? null : builtUpperBounds);
}
 
Example 16
Source Project: presto   Source File: MetricsWrapper.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Wraps the given {@link Metrics}.
 *
 * @throws NullPointerException if {@code metrics} is null
 */
public MetricsWrapper(Metrics metrics)
{
    requireNonNull(metrics, "metrics is null");
    this.metrics = metrics;
}
 
Example 17
Source Project: presto   Source File: MetricsWrapper.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * @return the wrapped {@link Metrics} instance
 */
public Metrics metrics()
{
    return this.metrics;
}
 
Example 18
Source Project: iceberg   Source File: OrcMetrics.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Computes {@link Metrics} for an ORC {@code file}. The configuration is taken from the
 * file itself when it is a {@link HadoopInputFile}; otherwise a new {@link Configuration}
 * is created.
 */
public static Metrics fromInputFile(InputFile file) {
  final Configuration config;
  if (file instanceof HadoopInputFile) {
    config = ((HadoopInputFile) file).getConf();
  } else {
    config = new Configuration();
  }
  return fromInputFile(file, config);
}
 
Example 19
Source Project: iceberg   Source File: OrcMetrics.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Builds Iceberg {@link Metrics} from ORC column statistics.
 *
 * <p>Every ORC column that maps to an Iceberg field contributes its on-disk size. Columns
 * that are not nested inside a container additionally contribute null counts, value
 * counts and, when they hold at least one value, lower and upper bounds.
 *
 * @param numOfRows number of rows in the file
 * @param orcSchema ORC schema of the file
 * @param colStats per-column ORC statistics, indexed by ORC column id
 */
private static Metrics buildOrcMetrics(final long numOfRows, final TypeDescription orcSchema,
                                       final ColumnStatistics[] colStats) {
  final Schema schema = ORCSchemaUtil.convert(orcSchema);
  final Set<TypeDescription> columnsInContainers = findColumnsInContainers(schema, orcSchema);
  Map<Integer, Long> columnSizes = Maps.newHashMapWithExpectedSize(colStats.length);
  Map<Integer, Long> valueCounts = Maps.newHashMapWithExpectedSize(colStats.length);
  Map<Integer, Long> nullCounts = Maps.newHashMapWithExpectedSize(colStats.length);
  Map<Integer, ByteBuffer> lowerBounds = Maps.newHashMap();
  Map<Integer, ByteBuffer> upperBounds = Maps.newHashMap();

  for (int columnId = 0; columnId < colStats.length; columnId++) {
    final ColumnStatistics stats = colStats[columnId];
    final TypeDescription orcColumn = orcSchema.findSubtype(columnId);
    final Optional<Types.NestedField> icebergField = ORCSchemaUtil.icebergID(orcColumn)
        .map(schema::findField);

    if (!icebergField.isPresent()) {
      continue;
    }

    final Types.NestedField field = icebergField.get();
    final int fieldId = field.fieldId();

    columnSizes.put(fieldId, stats.getBytesOnDisk());

    if (columnsInContainers.contains(orcColumn)) {
      // Since ORC does not track null values nor repeated ones, the value count for columns
      // in containers (maps, lists) may be larger than what it actually is; these counts are
      // not used in expressions right now, so no counts or bounds are recorded for them.
      continue;
    }

    final long nullCount = stats.hasNull() ? numOfRows - stats.getNumberOfValues() : 0L;
    nullCounts.put(fieldId, nullCount);
    valueCounts.put(fieldId, stats.getNumberOfValues() + nullCount);

    // Bounds are only taken from columns that hold at least one value.
    if (stats.getNumberOfValues() > 0) {
      fromOrcMin(field, stats).ifPresent(min -> lowerBounds.put(fieldId, min));
    }
    if (stats.getNumberOfValues() > 0) {
      fromOrcMax(field, stats).ifPresent(max -> upperBounds.put(fieldId, max));
    }
  }

  return new Metrics(numOfRows, columnSizes, valueCounts, nullCounts, lowerBounds, upperBounds);
}
 
Example 20
Source Project: iceberg   Source File: OrcFileAppender.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns the metrics gathered by the ORC writer.
 *
 * @throws IllegalStateException if the appender has not been closed yet
 */
@Override
public Metrics metrics() {
  Preconditions.checkState(isClosed, "Cannot return metrics while appending to an open file.");
  Metrics writerMetrics = OrcMetrics.fromWriter(writer);
  return writerMetrics;
}
 
Example 21
Source Project: iceberg   Source File: TestOrcMetrics.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Reads the metrics of {@code file} through {@code OrcMetrics}.
 */
@Override
public Metrics getMetrics(InputFile file) {
  Metrics orcMetrics = OrcMetrics.fromInputFile(file);
  return orcMetrics;
}
 
Example 22
Source Project: iceberg   Source File: TestParquetMetrics.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Reads the metrics of {@code file} through {@code ParquetUtil}.
 */
@Override
public Metrics getMetrics(InputFile file) {
  Metrics parquetMetrics = ParquetUtil.fileMetrics(file);
  return parquetMetrics;
}
 
Example 23
Source Project: iceberg   Source File: ParquetWriteAdapter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Derives metrics from the stored Parquet footer using {@code metricsConfig}.
 *
 * @throws IllegalStateException if no footer is available yet
 */
@Override
public Metrics metrics() {
  boolean footerAvailable = footer != null;
  Preconditions.checkState(footerAvailable, "Cannot produce metrics until closed");
  return ParquetUtil.footerMetrics(footer, metricsConfig);
}
 
Example 24
Source Project: iceberg   Source File: ParquetUtil.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Convenience overload that computes the metrics of {@code file} with
 * {@code MetricsConfig.getDefault()}.
 */
static Metrics fileMetrics(InputFile file) {
  MetricsConfig defaultConfig = MetricsConfig.getDefault();
  return fileMetrics(file, defaultConfig);
}
 
Example 25
Source Project: iceberg   Source File: ParquetWriter.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Derives metrics from the underlying writer's footer using {@code metricsConfig}.
 */
@Override
public Metrics metrics() {
  Metrics footerMetrics = ParquetUtil.footerMetrics(writer.getFooter(), metricsConfig);
  return footerMetrics;
}
 
Example 26
Source Project: iceberg   Source File: AvroFileAppender.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Returns metrics carrying only {@code numRecords}; the remaining constructor arguments
 * are passed as {@code null}.
 */
@Override
public Metrics metrics() {
  Metrics recordCountOnly = new Metrics(numRecords, null, null, null);
  return recordCountOnly;
}
 
Example 27
Source Project: iceberg   Source File: FileAppender.java    License: Apache License 2.0 2 votes vote down vote up
/**
 * Returns the metrics collected for the file written by this appender.
 *
 * @return {@link Metrics} for this file. Only valid after the file is closed.
 */
Metrics metrics();
 
Example 28
Source Project: presto   Source File: IcebergFileWriter.java    License: Apache License 2.0 votes vote down vote up
/**
 * Returns the Iceberg {@link Metrics} of the written file, if this writer provides them.
 *
 * @return the metrics, or an empty {@link Optional} when none are available
 */
Optional<Metrics> getMetrics();