org.apache.iceberg.Metrics Java Examples
The following examples show how to use org.apache.iceberg.Metrics.
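As a quick orientation before the examples, the sketch below shows the shape of the API they exercise: a Metrics object holds per-file column statistics keyed by Iceberg field ID (record count, column sizes, value counts, null counts, and serialized lower/upper bounds), and it is usually attached to a DataFile through DataFiles.Builder#withMetrics. This is a minimal sketch, not code from any of the projects below; the partition spec, file path, and statistic values are placeholders, and the spec is assumed to be unpartitioned.

import java.nio.ByteBuffer;
import java.util.Map;

import com.google.common.collect.ImmutableMap;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.Metrics;
import org.apache.iceberg.PartitionSpec;

public class MetricsSketch {
  // Builds a DataFile carrying hand-written Metrics. The spec is assumed to be
  // unpartitioned, and all statistic values below are invented for illustration.
  static DataFile describeFile(PartitionSpec spec) {
    // All maps are keyed by Iceberg field ID.
    Map<Integer, Long> valueCounts = ImmutableMap.of(1, 100L);
    Map<Integer, Long> nullValueCounts = ImmutableMap.of(1, 0L);
    Map<Integer, ByteBuffer> lowerBounds = ImmutableMap.of(1, ByteBuffer.wrap(new byte[] {0}));
    Map<Integer, ByteBuffer> upperBounds = ImmutableMap.of(1, ByteBuffer.wrap(new byte[] {9}));

    // Six-argument constructor: recordCount, columnSizes, valueCounts,
    // nullValueCounts, lowerBounds, upperBounds (null means "not tracked").
    Metrics metrics = new Metrics(100L, null, valueCounts, nullValueCounts, lowerBounds, upperBounds);

    return DataFiles.builder(spec)
        .withPath("/tmp/data/00000-0-data.parquet") // placeholder path
        .withFormat("parquet")
        .withFileSizeInBytes(4096L)                 // placeholder size
        .withMetrics(metrics)                       // also supplies the record count
        .build();
  }
}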
Example #1
Source File: ParquetRecordWriter.java From dremio-oss with Apache License 2.0 | 6 votes |
private byte[] getIcebergMetaData() throws IOException {
  if (!this.isIcebergWriter) {
    return null;
  }

  final long fileSize = parquetFileWriter.getPos();
  DataFiles.Builder dataFileBuilder =
      DataFiles.builder(IcebergCatalog.getIcebergPartitionSpec(this.batchSchema, this.partitionColumns))
          .withPath(path.toString())
          .withFileSizeInBytes(fileSize)
          .withRecordCount(recordCount)
          .withFormat(FileFormat.PARQUET);

  // add partition info
  if (partitionColumns != null) {
    dataFileBuilder = dataFileBuilder.withPartition(partition.getIcebergPartitionData());
  }

  // add column level metrics
  Metrics metrics = footerMetricsToIcebergMetrics(parquetFileWriter.getFooter(), batchSchema);
  dataFileBuilder = dataFileBuilder.withMetrics(metrics);

  return IcebergSerDe.serializeDataFile(dataFileBuilder.build());
}
Example #2
Source File: BaseWriter.java From iceberg with Apache License 2.0 | 6 votes |
protected void closeCurrent() throws IOException {
  if (currentAppender != null) {
    currentAppender.close();

    // metrics are only valid after the appender is closed
    Metrics metrics = currentAppender.metrics();
    long fileSizeInBytes = currentAppender.length();
    List<Long> splitOffsets = currentAppender.splitOffsets();
    this.currentAppender = null;

    if (metrics.recordCount() == 0L) {
      io.deleteFile(currentFile.encryptingOutputFile());
    } else {
      DataFile dataFile = DataFiles.builder(spec)
          .withEncryptionKeyMetadata(currentFile.keyMetadata())
          .withPath(currentFile.encryptingOutputFile().location())
          .withFileSizeInBytes(fileSizeInBytes)
          .withPartition(spec.fields().size() == 0 ? null : currentKey) // set null if unpartitioned
          .withMetrics(metrics)
          .withSplitOffsets(splitOffsets)
          .build();
      completedFiles.add(dataFile);
    }

    this.currentFile = null;
  }
}
Example #3
Source File: MetricsWrapper.java From presto with Apache License 2.0 | 6 votes |
@JsonCreator
public MetricsWrapper(
        @JsonProperty("recordCount") Long recordCount,
        @JsonProperty("columnSizes") Map<Integer, Long> columnSizes,
        @JsonProperty("valueCounts") Map<Integer, Long> valueCounts,
        @JsonProperty("nullValueCounts") Map<Integer, Long> nullValueCounts,
        @JsonProperty("lowerBounds") Map<Integer, ByteBuffer> lowerBounds,
        @JsonProperty("upperBounds") Map<Integer, ByteBuffer> upperBounds)
{
    this(new Metrics(
            recordCount,
            columnSizes,
            valueCounts,
            nullValueCounts,
            lowerBounds,
            upperBounds));
}
Example #4
Source File: TestMetricsWrapper.java From presto with Apache License 2.0 | 6 votes |
@Test
public void testRoundTrip()
{
    Long recordCount = 123L;
    Map<Integer, Long> columnSizes = ImmutableMap.of(3, 321L, 5, 543L);
    Map<Integer, Long> valueCounts = ImmutableMap.of(7, 765L, 9, 987L);
    Map<Integer, Long> nullValueCounts = ImmutableMap.of(2, 234L, 4, 456L);
    Map<Integer, ByteBuffer> lowerBounds = ImmutableMap.of(13, ByteBuffer.wrap(new byte[] {0, 8, 9}));
    Map<Integer, ByteBuffer> upperBounds = ImmutableMap.of(17, ByteBuffer.wrap(new byte[] {5, 4, 0}));

    Metrics expected = new Metrics(recordCount, columnSizes, valueCounts, nullValueCounts, lowerBounds, upperBounds);

    Metrics actual = CODEC.fromJson(CODEC.toJson(new MetricsWrapper(expected))).metrics();

    assertEquals(actual.recordCount(), recordCount);
    assertEquals(actual.columnSizes(), columnSizes);
    assertEquals(actual.valueCounts(), valueCounts);
    assertEquals(actual.nullValueCounts(), nullValueCounts);
    assertEquals(actual.lowerBounds(), lowerBounds);
    assertEquals(actual.upperBounds(), upperBounds);
}
Example #5
Source File: TestOrcMetrics.java From iceberg with Apache License 2.0 | 5 votes |
@Override
protected <T> void assertBounds(int fieldId, Type type, T lowerBound, T upperBound, Metrics metrics) {
  if (isBinaryType(type)) {
    Assert.assertFalse("ORC binary field should not have lower bounds.",
        metrics.lowerBounds().containsKey(fieldId));
    Assert.assertFalse("ORC binary field should not have upper bounds.",
        metrics.upperBounds().containsKey(fieldId));
    return;
  }
  super.assertBounds(fieldId, type, lowerBound, upperBound, metrics);
}
Example #6
Source File: SparkTableUtil.java From iceberg with Apache License 2.0 | 5 votes |
private static List<DataFile> listAvroPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);

    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
          Metrics metrics = new Metrics(-1L, null, null, null);
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("avro")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();
        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}
Example #7
Source File: SparkTableUtil.java From iceberg with Apache License 2.0 | 5 votes |
private static List<DataFile> listOrcPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);

    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
          Metrics metrics = OrcMetrics.fromInputFile(HadoopInputFile.fromPath(stat.getPath(), conf));
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("orc")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();
        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}
Example #8
Source File: OrcMetrics.java From iceberg with Apache License 2.0 | 5 votes |
static Metrics fromWriter(Writer writer) {
  try {
    return buildOrcMetrics(writer.getNumberOfRows(), writer.getSchema(), writer.getStatistics());
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to get statistics from writer");
  }
}
Example #9
Source File: OrcMetrics.java From iceberg with Apache License 2.0 | 5 votes |
static Metrics fromInputFile(InputFile file, Configuration config) {
  try (Reader orcReader = ORC.newFileReader(file, config)) {
    return buildOrcMetrics(orcReader.getNumberOfRows(), orcReader.getSchema(), orcReader.getStatistics());
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to open file: %s", file.location());
  }
}
Example #10
Source File: TestMetricsWrapper.java From presto with Apache License 2.0 | 5 votes |
@Test
public void testAllPropertiesHandled()
{
    Set<String> properties = getJsonProperties(MetricsWrapper.class);
    for (Method method : Metrics.class.getMethods()) {
        if (method.getDeclaringClass().equals(Metrics.class)) {
            assertTrue(properties.contains(method.getName()), "Metrics method not in wrapper: " + method);
        }
    }
}
Example #11
Source File: ParquetUtil.java From iceberg with Apache License 2.0 | 5 votes |
public static Metrics fileMetrics(InputFile file, MetricsConfig metricsConfig) {
  try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(file))) {
    return footerMetrics(reader.getFooter(), metricsConfig);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to read footer of file: %s", file);
  }
}
Example #12
Source File: IcebergPageSink.java From presto with Apache License 2.0 | 5 votes |
private Metrics getMetrics(WriteContext writeContext)
{
    switch (fileFormat) {
        case PARQUET:
            return ParquetUtil.fileMetrics(HadoopInputFile.fromPath(writeContext.getPath(), jobConf), MetricsConfig.getDefault());
        case ORC:
            return writeContext.getWriter().getMetrics()
                    .orElseThrow(() -> new VerifyException("Iceberg ORC file writers should return Iceberg metrics"));
    }
    throw new PrestoException(NOT_SUPPORTED, "File format not supported for Iceberg: " + fileFormat);
}
Example #13
Source File: ParquetWriteAdapter.java From iceberg with Apache License 2.0 | 4 votes |
@Override
public Metrics metrics() {
  Preconditions.checkState(footer != null, "Cannot produce metrics until closed");
  return ParquetUtil.footerMetrics(footer, metricsConfig);
}
Example #14
Source File: ParquetUtil.java From iceberg with Apache License 2.0 | 4 votes |
static Metrics fileMetrics(InputFile file) {
  return fileMetrics(file, MetricsConfig.getDefault());
}
Example #15
Source File: ParquetWriter.java From iceberg with Apache License 2.0 | 4 votes |
@Override
public Metrics metrics() {
  return ParquetUtil.footerMetrics(writer.getFooter(), metricsConfig);
}
Example #16
Source File: AvroFileAppender.java From iceberg with Apache License 2.0 | 4 votes |
@Override
public Metrics metrics() {
  return new Metrics(numRecords, null, null, null);
}
Example #17
Source File: TestParquetMetrics.java From iceberg with Apache License 2.0 | 4 votes |
@Override
public Metrics getMetrics(InputFile file) {
  return ParquetUtil.fileMetrics(file);
}
Example #18
Source File: OrcFileAppender.java From iceberg with Apache License 2.0 | 4 votes |
@Override
public Metrics metrics() {
  Preconditions.checkState(isClosed, "Cannot return metrics while appending to an open file.");
  return OrcMetrics.fromWriter(writer);
}
Example #19
Source File: TestOrcMetrics.java From iceberg with Apache License 2.0 | 4 votes |
@Override
public Metrics getMetrics(InputFile file) {
  return OrcMetrics.fromInputFile(file);
}
Example #20
Source File: OrcMetrics.java From iceberg with Apache License 2.0 | 4 votes |
private static Metrics buildOrcMetrics(final long numOfRows, final TypeDescription orcSchema,
                                        final ColumnStatistics[] colStats) {
  final Schema schema = ORCSchemaUtil.convert(orcSchema);
  final Set<TypeDescription> columnsInContainers = findColumnsInContainers(schema, orcSchema);
  Map<Integer, Long> columnSizes = Maps.newHashMapWithExpectedSize(colStats.length);
  Map<Integer, Long> valueCounts = Maps.newHashMapWithExpectedSize(colStats.length);
  Map<Integer, Long> nullCounts = Maps.newHashMapWithExpectedSize(colStats.length);
  Map<Integer, ByteBuffer> lowerBounds = Maps.newHashMap();
  Map<Integer, ByteBuffer> upperBounds = Maps.newHashMap();

  for (int i = 0; i < colStats.length; i++) {
    final ColumnStatistics colStat = colStats[i];
    final TypeDescription orcCol = orcSchema.findSubtype(i);
    final Optional<Types.NestedField> icebergColOpt = ORCSchemaUtil.icebergID(orcCol)
        .map(schema::findField);

    if (icebergColOpt.isPresent()) {
      final Types.NestedField icebergCol = icebergColOpt.get();
      final int fieldId = icebergCol.fieldId();

      columnSizes.put(fieldId, colStat.getBytesOnDisk());

      if (!columnsInContainers.contains(orcCol)) {
        // Since ORC does not track null values nor repeated ones, the value count for columns in
        // containers (maps, lists) may be larger than what it actually is; however, these are not
        // used in expressions right now. For such cases, we use the number of values directly
        // stored in ORC.
        if (colStat.hasNull()) {
          nullCounts.put(fieldId, numOfRows - colStat.getNumberOfValues());
        } else {
          nullCounts.put(fieldId, 0L);
        }
        valueCounts.put(fieldId, colStat.getNumberOfValues() + nullCounts.get(fieldId));

        Optional<ByteBuffer> orcMin = (colStat.getNumberOfValues() > 0) ?
            fromOrcMin(icebergCol, colStat) : Optional.empty();
        orcMin.ifPresent(byteBuffer -> lowerBounds.put(icebergCol.fieldId(), byteBuffer));
        Optional<ByteBuffer> orcMax = (colStat.getNumberOfValues() > 0) ?
            fromOrcMax(icebergCol, colStat) : Optional.empty();
        orcMax.ifPresent(byteBuffer -> upperBounds.put(icebergCol.fieldId(), byteBuffer));
      }
    }
  }

  return new Metrics(numOfRows,
      columnSizes,
      valueCounts,
      nullCounts,
      lowerBounds,
      upperBounds);
}
Example #21
Source File: OrcMetrics.java From iceberg with Apache License 2.0 | 4 votes |
public static Metrics fromInputFile(InputFile file) {
  final Configuration config = (file instanceof HadoopInputFile) ?
      ((HadoopInputFile) file).getConf() : new Configuration();
  return fromInputFile(file, config);
}
Example #22
Source File: MetricsWrapper.java From presto with Apache License 2.0 | 4 votes |
public Metrics metrics()
{
    return metrics;
}
Example #23
Source File: MetricsWrapper.java From presto with Apache License 2.0 | 4 votes |
public MetricsWrapper(Metrics metrics)
{
    this.metrics = requireNonNull(metrics, "metrics is null");
}
Example #24
Source File: IcebergOrcFileWriter.java From presto with Apache License 2.0 | 4 votes |
private static Metrics computeMetrics(Schema icebergSchema, ColumnMetadata<OrcType> orcColumns, long fileRowCount, Optional<ColumnMetadata<ColumnStatistics>> columnStatistics)
{
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null);
    }
    // Columns that are descendants of LIST or MAP types are excluded because:
    // 1. Their stats are not used by Apache Iceberg to filter out data files
    // 2. Their record count can be larger than table-level row count. There's no good way to calculate nullCounts for them.
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<OrcColumnId> excludedColumns = getExcludedColumns(orcColumns);

    ImmutableMap.Builder<Integer, Long> valueCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsBuilder = ImmutableMap.builder();

    // OrcColumnId(0) is the root column that represents file-level schema
    for (int i = 1; i < orcColumns.size(); i++) {
        OrcColumnId orcColumnId = new OrcColumnId(i);
        if (excludedColumns.contains(orcColumnId)) {
            continue;
        }

        OrcType orcColumn = orcColumns.get(orcColumnId);
        ColumnStatistics orcColumnStats = columnStatistics.get().get(orcColumnId);
        int icebergId = getIcebergId(orcColumn);
        Types.NestedField icebergField = icebergSchema.findField(icebergId);
        verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);

        valueCountsBuilder.put(icebergId, fileRowCount);
        if (orcColumnStats.hasNumberOfValues()) {
            nullCountsBuilder.put(icebergId, fileRowCount - orcColumnStats.getNumberOfValues());
        }
        toIcebergMinMax(orcColumnStats, icebergField.type()).ifPresent(minMax -> {
            lowerBoundsBuilder.put(icebergId, minMax.getMin());
            upperBoundsBuilder.put(icebergId, minMax.getMax());
        });
    }

    Map<Integer, Long> valueCounts = valueCountsBuilder.build();
    Map<Integer, Long> nullCounts = nullCountsBuilder.build();
    Map<Integer, ByteBuffer> lowerBounds = lowerBoundsBuilder.build();
    Map<Integer, ByteBuffer> upperBounds = upperBoundsBuilder.build();

    return new Metrics(
            fileRowCount,
            null, // TODO: Add column size accounting to ORC column writers
            valueCounts.isEmpty() ? null : valueCounts,
            nullCounts.isEmpty() ? null : nullCounts,
            lowerBounds.isEmpty() ? null : lowerBounds,
            upperBounds.isEmpty() ? null : upperBounds);
}
Example #25
Source File: IcebergOrcFileWriter.java From presto with Apache License 2.0 | 4 votes |
@Override
public Optional<Metrics> getMetrics()
{
    return Optional.of(computeMetrics(icebergSchema, orcColumns, orcWriter.getFileRowCount(), orcWriter.getFileStats()));
}
Example #26
Source File: IcebergRecordFileWriter.java From presto with Apache License 2.0 | 4 votes |
@Override
public Optional<Metrics> getMetrics()
{
    return Optional.empty();
}
Example #27
Source File: FileAppender.java From iceberg with Apache License 2.0 | 2 votes |
/**
 * @return {@link Metrics} for this file. Only valid after the file is closed.
 */
Metrics metrics();
Example #28
Source File: IcebergFileWriter.java From presto with Apache License 2.0 | votes |
Optional<Metrics> getMetrics();