org.apache.iceberg.Metrics Java Examples
The following examples show how to use org.apache.iceberg.Metrics.
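As a quick orientation before the examples, the sketch below shows the shape of the API they exercise: a Metrics object holds per-file column statistics keyed by Iceberg field ID (record count, column sizes, value counts, null counts, and serialized lower/upper bounds), and it is usually attached to a DataFile through DataFiles.Builder#withMetrics. This is a minimal sketch, not code from any of the projects below; the partition spec, file path, and statistic values are placeholders, and the spec is assumed to be unpartitioned.

import java.nio.ByteBuffer;
import java.util.Map;

import com.google.common.collect.ImmutableMap;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.Metrics;
import org.apache.iceberg.PartitionSpec;

public class MetricsSketch {
  // Builds a DataFile carrying hand-written Metrics. The spec is assumed to be
  // unpartitioned, and all statistic values below are invented for illustration.
  static DataFile describeFile(PartitionSpec spec) {
    // All maps are keyed by Iceberg field ID.
    Map<Integer, Long> valueCounts = ImmutableMap.of(1, 100L);
    Map<Integer, Long> nullValueCounts = ImmutableMap.of(1, 0L);
    Map<Integer, ByteBuffer> lowerBounds = ImmutableMap.of(1, ByteBuffer.wrap(new byte[] {0}));
    Map<Integer, ByteBuffer> upperBounds = ImmutableMap.of(1, ByteBuffer.wrap(new byte[] {9}));

    // Six-argument constructor: recordCount, columnSizes, valueCounts,
    // nullValueCounts, lowerBounds, upperBounds (null means "not tracked").
    Metrics metrics = new Metrics(100L, null, valueCounts, nullValueCounts, lowerBounds, upperBounds);

    return DataFiles.builder(spec)
        .withPath("/tmp/data/00000-0-data.parquet") // placeholder path
        .withFormat("parquet")
        .withFileSizeInBytes(4096L)                 // placeholder size
        .withMetrics(metrics)                       // also supplies the record count
        .build();
  }
}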
Example #1
Source File: ParquetRecordWriter.java From dremio-oss with Apache License 2.0 | 6 votes |
private byte[] getIcebergMetaData() throws IOException {
  if (!this.isIcebergWriter) {
    return null;
  }

  final long fileSize = parquetFileWriter.getPos();
  DataFiles.Builder dataFileBuilder =
      DataFiles.builder(IcebergCatalog.getIcebergPartitionSpec(this.batchSchema, this.partitionColumns))
          .withPath(path.toString())
          .withFileSizeInBytes(fileSize)
          .withRecordCount(recordCount)
          .withFormat(FileFormat.PARQUET);

  // add partition info
  if (partitionColumns != null) {
    dataFileBuilder = dataFileBuilder.withPartition(partition.getIcebergPartitionData());
  }

  // add column level metrics
  Metrics metrics = footerMetricsToIcebergMetrics(parquetFileWriter.getFooter(), batchSchema);
  dataFileBuilder = dataFileBuilder.withMetrics(metrics);

  return IcebergSerDe.serializeDataFile(dataFileBuilder.build());
}
Example #2
Source File: BaseWriter.java From iceberg with Apache License 2.0 | 6 votes |
protected void closeCurrent() throws IOException {
  if (currentAppender != null) {
    currentAppender.close();

    // metrics are only valid after the appender is closed
    Metrics metrics = currentAppender.metrics();
    long fileSizeInBytes = currentAppender.length();
    List<Long> splitOffsets = currentAppender.splitOffsets();
    this.currentAppender = null;

    if (metrics.recordCount() == 0L) {
      io.deleteFile(currentFile.encryptingOutputFile());
    } else {
      DataFile dataFile = DataFiles.builder(spec)
          .withEncryptionKeyMetadata(currentFile.keyMetadata())
          .withPath(currentFile.encryptingOutputFile().location())
          .withFileSizeInBytes(fileSizeInBytes)
          .withPartition(spec.fields().size() == 0 ? null : currentKey) // set null if unpartitioned
          .withMetrics(metrics)
          .withSplitOffsets(splitOffsets)
          .build();
      completedFiles.add(dataFile);
    }

    this.currentFile = null;
  }
}
Example #3
Source File: MetricsWrapper.java From presto with Apache License 2.0 | 6 votes |
@JsonCreator
public MetricsWrapper(
        @JsonProperty("recordCount") Long recordCount,
        @JsonProperty("columnSizes") Map<Integer, Long> columnSizes,
        @JsonProperty("valueCounts") Map<Integer, Long> valueCounts,
        @JsonProperty("nullValueCounts") Map<Integer, Long> nullValueCounts,
        @JsonProperty("lowerBounds") Map<Integer, ByteBuffer> lowerBounds,
        @JsonProperty("upperBounds") Map<Integer, ByteBuffer> upperBounds)
{
    this(new Metrics(
            recordCount,
            columnSizes,
            valueCounts,
            nullValueCounts,
            lowerBounds,
            upperBounds));
}
Example #4
Source File: TestMetricsWrapper.java From presto with Apache License 2.0 | 6 votes |
@Test
public void testRoundTrip()
{
    Long recordCount = 123L;
    Map<Integer, Long> columnSizes = ImmutableMap.of(3, 321L, 5, 543L);
    Map<Integer, Long> valueCounts = ImmutableMap.of(7, 765L, 9, 987L);
    Map<Integer, Long> nullValueCounts = ImmutableMap.of(2, 234L, 4, 456L);
    Map<Integer, ByteBuffer> lowerBounds = ImmutableMap.of(13, ByteBuffer.wrap(new byte[] {0, 8, 9}));
    Map<Integer, ByteBuffer> upperBounds = ImmutableMap.of(17, ByteBuffer.wrap(new byte[] {5, 4, 0}));

    Metrics expected = new Metrics(recordCount, columnSizes, valueCounts, nullValueCounts, lowerBounds, upperBounds);

    Metrics actual = CODEC.fromJson(CODEC.toJson(new MetricsWrapper(expected))).metrics();

    assertEquals(actual.recordCount(), recordCount);
    assertEquals(actual.columnSizes(), columnSizes);
    assertEquals(actual.valueCounts(), valueCounts);
    assertEquals(actual.nullValueCounts(), nullValueCounts);
    assertEquals(actual.lowerBounds(), lowerBounds);
    assertEquals(actual.upperBounds(), upperBounds);
}
Example #5
Source File: TestOrcMetrics.java From iceberg with Apache License 2.0 | 5 votes |
@Override
protected <T> void assertBounds(int fieldId, Type type, T lowerBound, T upperBound, Metrics metrics) {
  if (isBinaryType(type)) {
    Assert.assertFalse("ORC binary field should not have lower bounds.",
        metrics.lowerBounds().containsKey(fieldId));
    Assert.assertFalse("ORC binary field should not have upper bounds.",
        metrics.upperBounds().containsKey(fieldId));
    return;
  }
  super.assertBounds(fieldId, type, lowerBound, upperBound, metrics);
}
Example #6
Source File: SparkTableUtil.java From iceberg with Apache License 2.0 | 5 votes |
private static List<DataFile> listAvroPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);

    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
          Metrics metrics = new Metrics(-1L, null, null, null);
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("avro")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();
        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}
Example #7
Source File: SparkTableUtil.java From iceberg with Apache License 2.0 | 5 votes |
private static List<DataFile> listOrcPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);

    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
          Metrics metrics = OrcMetrics.fromInputFile(HadoopInputFile.fromPath(stat.getPath(), conf));
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("orc")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();
        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}
Example #8
Source File: OrcMetrics.java From iceberg with Apache License 2.0 | 5 votes |
static Metrics fromWriter(Writer writer) {
  try {
    return buildOrcMetrics(writer.getNumberOfRows(), writer.getSchema(), writer.getStatistics());
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to get statistics from writer");
  }
}
Example #9
Source File: OrcMetrics.java From iceberg with Apache License 2.0 | 5 votes |
static Metrics fromInputFile(InputFile file, Configuration config) {
  try (Reader orcReader = ORC.newFileReader(file, config)) {
    return buildOrcMetrics(orcReader.getNumberOfRows(), orcReader.getSchema(), orcReader.getStatistics());
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to open file: %s", file.location());
  }
}
Example #10
Source File: TestMetricsWrapper.java From presto with Apache License 2.0 | 5 votes |
@Test
public void testAllPropertiesHandled()
{
    Set<String> properties = getJsonProperties(MetricsWrapper.class);
    for (Method method : Metrics.class.getMethods()) {
        if (method.getDeclaringClass().equals(Metrics.class)) {
            assertTrue(properties.contains(method.getName()), "Metrics method not in wrapper: " + method);
        }
    }
}
Example #11
Source File: ParquetUtil.java From iceberg with Apache License 2.0 | 5 votes |
public static Metrics fileMetrics(InputFile file, MetricsConfig metricsConfig) {
  try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(file))) {
    return footerMetrics(reader.getFooter(), metricsConfig);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to read footer of file: %s", file);
  }
}
Example #12
Source File: IcebergPageSink.java From presto with Apache License 2.0 | 5 votes |
private Metrics getMetrics(WriteContext writeContext)
{
    switch (fileFormat) {
        case PARQUET:
            return ParquetUtil.fileMetrics(HadoopInputFile.fromPath(writeContext.getPath(), jobConf), MetricsConfig.getDefault());
        case ORC:
            return writeContext.getWriter().getMetrics()
                    .orElseThrow(() -> new VerifyException("Iceberg ORC file writers should return Iceberg metrics"));
    }
    throw new PrestoException(NOT_SUPPORTED, "File format not supported for Iceberg: " + fileFormat);
}
Example #13
Source File: ParquetWriteAdapter.java From iceberg with Apache License 2.0 | 4 votes |
@Override
public Metrics metrics() {
  Preconditions.checkState(footer != null, "Cannot produce metrics until closed");
  return ParquetUtil.footerMetrics(footer, metricsConfig);
}
Example #14
Source File: ParquetUtil.java From iceberg with Apache License 2.0 | 4 votes |
static Metrics fileMetrics(InputFile file) {
  return fileMetrics(file, MetricsConfig.getDefault());
}
Example #15
Source File: ParquetWriter.java From iceberg with Apache License 2.0 | 4 votes |
@Override
public Metrics metrics() {
  return ParquetUtil.footerMetrics(writer.getFooter(), metricsConfig);
}
Example #16
Source File: AvroFileAppender.java From iceberg with Apache License 2.0 | 4 votes |
@Override
public Metrics metrics() {
  return new Metrics(numRecords, null, null, null);
}
Example #17
Source File: TestParquetMetrics.java From iceberg with Apache License 2.0 | 4 votes |
@Override
public Metrics getMetrics(InputFile file) {
  return ParquetUtil.fileMetrics(file);
}
Example #18
Source File: OrcFileAppender.java From iceberg with Apache License 2.0 | 4 votes |
@Override
public Metrics metrics() {
  Preconditions.checkState(isClosed, "Cannot return metrics while appending to an open file.");
  return OrcMetrics.fromWriter(writer);
}
Example #19
Source File: TestOrcMetrics.java From iceberg with Apache License 2.0 | 4 votes |
@Override
public Metrics getMetrics(InputFile file) {
  return OrcMetrics.fromInputFile(file);
}
Example #20
Source File: OrcMetrics.java From iceberg with Apache License 2.0 | 4 votes |
private static Metrics buildOrcMetrics(final long numOfRows, final TypeDescription orcSchema,
                                        final ColumnStatistics[] colStats) {
  final Schema schema = ORCSchemaUtil.convert(orcSchema);
  final Set<TypeDescription> columnsInContainers = findColumnsInContainers(schema, orcSchema);
  Map<Integer, Long> columnSizes = Maps.newHashMapWithExpectedSize(colStats.length);
  Map<Integer, Long> valueCounts = Maps.newHashMapWithExpectedSize(colStats.length);
  Map<Integer, Long> nullCounts = Maps.newHashMapWithExpectedSize(colStats.length);
  Map<Integer, ByteBuffer> lowerBounds = Maps.newHashMap();
  Map<Integer, ByteBuffer> upperBounds = Maps.newHashMap();

  for (int i = 0; i < colStats.length; i++) {
    final ColumnStatistics colStat = colStats[i];
    final TypeDescription orcCol = orcSchema.findSubtype(i);
    final Optional<Types.NestedField> icebergColOpt = ORCSchemaUtil.icebergID(orcCol)
        .map(schema::findField);

    if (icebergColOpt.isPresent()) {
      final Types.NestedField icebergCol = icebergColOpt.get();
      final int fieldId = icebergCol.fieldId();

      columnSizes.put(fieldId, colStat.getBytesOnDisk());

      if (!columnsInContainers.contains(orcCol)) {
        // Since ORC does not track null values nor repeated ones, the value count for columns in
        // containers (maps, lists) may be larger than what it actually is; however, these are not
        // used in expressions right now. For such cases, we use the number of values directly
        // stored in ORC.
        if (colStat.hasNull()) {
          nullCounts.put(fieldId, numOfRows - colStat.getNumberOfValues());
        } else {
          nullCounts.put(fieldId, 0L);
        }
        valueCounts.put(fieldId, colStat.getNumberOfValues() + nullCounts.get(fieldId));

        Optional<ByteBuffer> orcMin = (colStat.getNumberOfValues() > 0) ?
            fromOrcMin(icebergCol, colStat) : Optional.empty();
        orcMin.ifPresent(byteBuffer -> lowerBounds.put(icebergCol.fieldId(), byteBuffer));
        Optional<ByteBuffer> orcMax = (colStat.getNumberOfValues() > 0) ?
            fromOrcMax(icebergCol, colStat) : Optional.empty();
        orcMax.ifPresent(byteBuffer -> upperBounds.put(icebergCol.fieldId(), byteBuffer));
      }
    }
  }

  return new Metrics(numOfRows,
      columnSizes,
      valueCounts,
      nullCounts,
      lowerBounds,
      upperBounds);
}
Example #21
Source File: OrcMetrics.java From iceberg with Apache License 2.0 | 4 votes |
public static Metrics fromInputFile(InputFile file) {
  final Configuration config = (file instanceof HadoopInputFile) ?
      ((HadoopInputFile) file).getConf() : new Configuration();
  return fromInputFile(file, config);
}
Example #22
Source File: MetricsWrapper.java From presto with Apache License 2.0 | 4 votes |
public Metrics metrics()
{
    return metrics;
}
Example #23
Source File: MetricsWrapper.java From presto with Apache License 2.0 | 4 votes |
public MetricsWrapper(Metrics metrics)
{
    this.metrics = requireNonNull(metrics, "metrics is null");
}
Example #24
Source File: IcebergOrcFileWriter.java From presto with Apache License 2.0 | 4 votes |
private static Metrics computeMetrics(Schema icebergSchema, ColumnMetadata<OrcType> orcColumns, long fileRowCount, Optional<ColumnMetadata<ColumnStatistics>> columnStatistics)
{
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null);
    }
    // Columns that are descendants of LIST or MAP types are excluded because:
    // 1. Their stats are not used by Apache Iceberg to filter out data files
    // 2. Their record count can be larger than table-level row count. There's no good way to calculate nullCounts for them.
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<OrcColumnId> excludedColumns = getExcludedColumns(orcColumns);

    ImmutableMap.Builder<Integer, Long> valueCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsBuilder = ImmutableMap.builder();

    // OrcColumnId(0) is the root column that represents file-level schema
    for (int i = 1; i < orcColumns.size(); i++) {
        OrcColumnId orcColumnId = new OrcColumnId(i);
        if (excludedColumns.contains(orcColumnId)) {
            continue;
        }

        OrcType orcColumn = orcColumns.get(orcColumnId);
        ColumnStatistics orcColumnStats = columnStatistics.get().get(orcColumnId);
        int icebergId = getIcebergId(orcColumn);
        Types.NestedField icebergField = icebergSchema.findField(icebergId);
        verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);

        valueCountsBuilder.put(icebergId, fileRowCount);
        if (orcColumnStats.hasNumberOfValues()) {
            nullCountsBuilder.put(icebergId, fileRowCount - orcColumnStats.getNumberOfValues());
        }
        toIcebergMinMax(orcColumnStats, icebergField.type()).ifPresent(minMax -> {
            lowerBoundsBuilder.put(icebergId, minMax.getMin());
            upperBoundsBuilder.put(icebergId, minMax.getMax());
        });
    }

    Map<Integer, Long> valueCounts = valueCountsBuilder.build();
    Map<Integer, Long> nullCounts = nullCountsBuilder.build();
    Map<Integer, ByteBuffer> lowerBounds = lowerBoundsBuilder.build();
    Map<Integer, ByteBuffer> upperBounds = upperBoundsBuilder.build();

    return new Metrics(
            fileRowCount,
            null, // TODO: Add column size accounting to ORC column writers
            valueCounts.isEmpty() ? null : valueCounts,
            nullCounts.isEmpty() ? null : nullCounts,
            lowerBounds.isEmpty() ? null : lowerBounds,
            upperBounds.isEmpty() ? null : upperBounds);
}
Example #25
Source File: IcebergOrcFileWriter.java From presto with Apache License 2.0 | 4 votes |
@Override
public Optional<Metrics> getMetrics()
{
    return Optional.of(computeMetrics(icebergSchema, orcColumns, orcWriter.getFileRowCount(), orcWriter.getFileStats()));
}
Example #26
Source File: IcebergRecordFileWriter.java From presto with Apache License 2.0 | 4 votes |
@Override
public Optional<Metrics> getMetrics()
{
    return Optional.empty();
}
Example #27
Source File: FileAppender.java From iceberg with Apache License 2.0 | 2 votes |
/**
 * @return {@link Metrics} for this file. Only valid after the file is closed.
 */
Metrics metrics();
Example #28
Source File: IcebergFileWriter.java From presto with Apache License 2.0 | votes |
Optional<Metrics> getMetrics();