Java Code Examples for org.apache.parquet.hadoop.metadata.BlockMetaData#getStartingPos()

The following examples show how to use org.apache.parquet.hadoop.metadata.BlockMetaData#getStartingPos(). They are taken from open source projects; each example names its source file, project, and license.
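
Before the project examples, a minimal, self-contained sketch of the call itself may help. It reads a footer with the same deprecated readFooter overload the examples below use; the class name and file path are placeholders, not taken from any of the projects.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class StartingPosDemo {
  public static void main(String[] args) throws Exception {
    Path path = new Path("/tmp/data.parquet"); // placeholder path
    ParquetMetadata footer = ParquetFileReader.readFooter(
        new Configuration(), path, ParquetMetadataConverter.NO_FILTER);
    for (BlockMetaData block : footer.getBlocks()) {
      // getStartingPos() is the file offset of the row group's first column chunk
      System.out.printf("row group @ %d: %d rows, %d bytes compressed%n",
          block.getStartingPos(), block.getRowCount(), block.getCompressedSize());
    }
  }
}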
Example 1
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
/**
 * @param rowGroupMetadata metadata for the row group to check
 * @return true if the midpoint of the row group falls in a new HDFS block, in
 * which case the currentMidPointHDFSBlockIndex pointer is also advanced to the
 * block containing the row group; false if the midpoint is still in the same
 * HDFS block
 */
private boolean checkBelongingToANewHDFSBlock(BlockMetaData rowGroupMetadata) {
  boolean isNewHdfsBlock = false;
  long rowGroupMidPoint = rowGroupMetadata.getStartingPos() + (rowGroupMetadata.getCompressedSize() / 2);

  // if the midpoint is no longer in the current HDFS block, return true
  while (rowGroupMidPoint > getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex)) {
    isNewHdfsBlock = true;
    currentMidPointHDFSBlockIndex++;
    if (currentMidPointHDFSBlockIndex >= hdfsBlocks.length)
      throw new ParquetDecodingException("the row group is not in hdfs blocks in the file: midpoint of row groups is "
              + rowGroupMidPoint
              + ", the end of the hdfs block is "
              + getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex - 1));
  }

  while (rowGroupMetadata.getStartingPos() > getHDFSBlockEndingPosition(currentStartHdfsBlockIndex)) {
    currentStartHdfsBlockIndex++;
    if (currentStartHdfsBlockIndex >= hdfsBlocks.length)
      throw new ParquetDecodingException("The row group does not start in this file: row group offset is "
              + rowGroupMetadata.getStartingPos()
              + " but the end of hdfs blocks of file is "
              + getHDFSBlockEndingPosition(currentStartHdfsBlockIndex));
  }
  return isNewHdfsBlock;
}
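
The heuristic above assigns each row group to exactly one HDFS block: the one containing its midpoint. A standalone restatement of that arithmetic (blockIndexForRowGroup and blockEndOffsets are my names, not part of the class above):

static int blockIndexForRowGroup(BlockMetaData rowGroup, long[] blockEndOffsets) {
  // midpoint = start of the row group plus half its compressed size
  long midPoint = rowGroup.getStartingPos() + rowGroup.getCompressedSize() / 2;
  for (int i = 0; i < blockEndOffsets.length; i++) {
    if (midPoint <= blockEndOffsets[i]) {
      return i; // first block whose end offset is at or past the midpoint
    }
  }
  throw new IllegalStateException("row group midpoint " + midPoint + " is past the last block");
}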
 
Example 2
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0
private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num) {
  long rows = meta.getRowCount();
  long tbs = meta.getTotalByteSize();
  long offset = meta.getStartingPos();

  out.format("row group%s: RC:%d TS:%d OFFSET:%d%n", (num == null ? "" : " " + num), rows, tbs, offset);
  out.rule('-');
  showDetails(out, meta.getColumns());
}
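
The same RC/TS/OFFSET summary can be reproduced without parquet-tools' internal PrettyPrintWriter; a minimal sketch using System.out (printRowGroupSummary is my name, not part of MetadataUtils):

static void printRowGroupSummary(BlockMetaData meta, Integer num) {
  // RC = row count, TS = total (uncompressed) byte size, OFFSET = starting position
  System.out.printf("row group%s: RC:%d TS:%d OFFSET:%d%n",
      num == null ? "" : " " + num,
      meta.getRowCount(), meta.getTotalByteSize(), meta.getStartingPos());
}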
 
Example 3
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
public ParquetInputSplit getParquetInputSplit(FileStatus fileStatus, String requestedSchema, Map<String, String> readSupportMetadata) throws IOException {
  MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
  long length = 0;

  for (BlockMetaData block : this.getRowGroups()) {
    List<ColumnChunkMetaData> columns = block.getColumns();
    for (ColumnChunkMetaData column : columns) {
      if (requested.containsPath(column.getPath().toArray())) {
        length += column.getTotalSize();
      }
    }
  }

  BlockMetaData lastRowGroup = this.getRowGroups().get(this.getRowGroupCount() - 1);
  long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();

  long[] rowGroupOffsets = new long[this.getRowGroupCount()];
  for (int i = 0; i < rowGroupOffsets.length; i++) {
    rowGroupOffsets[i] = this.getRowGroups().get(i).getStartingPos();
  }

  return new ParquetInputSplit(
          fileStatus.getPath(),
          hdfsBlock.getOffset(),
          end,
          length,
          hdfsBlock.getHosts(),
          rowGroupOffsets
  );
}
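
The offset bookkeeping above can be restated as two standalone helpers (the names are mine; both take the footer's row groups in file order):

static long[] rowGroupOffsets(List<BlockMetaData> rowGroups) {
  long[] offsets = new long[rowGroups.size()];
  for (int i = 0; i < offsets.length; i++) {
    offsets[i] = rowGroups.get(i).getStartingPos();
  }
  return offsets;
}

static long splitEnd(List<BlockMetaData> rowGroups) {
  // as in the example: the split ends where the last row group ends,
  // measured with getTotalByteSize() to match the example's accounting
  BlockMetaData last = rowGroups.get(rowGroups.size() - 1);
  return last.getStartingPos() + last.getTotalByteSize();
}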
 
Example 4
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
private static void checkSorted(List<BlockMetaData> rowGroupBlocks) {
  long previousOffset = 0L;
  for (BlockMetaData rowGroup : rowGroupBlocks) {
    long currentOffset = rowGroup.getStartingPos();
    if (currentOffset < previousOffset) {
      throw new ParquetDecodingException("row groups are not sorted: previous row group starts at " + previousOffset + ", current row group starts at " + currentOffset);
    }
    // carry the offset forward so each row group is compared to its predecessor
    previousOffset = currentOffset;
  }
}
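
checkSorted assumes the footer already lists row groups in file order, which is what the split offsets depend on. Where that cannot be assumed, a caller could sort by starting position instead of failing; a one-line sketch (not parquet-mr behavior; requires java.util.Comparator):

rowGroupBlocks.sort(Comparator.comparingLong(BlockMetaData::getStartingPos));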
 
Example 5
Source File: TestParquetWriterAppendBlocks.java    From parquet-mr with Apache License 2.0
@Test
public void testMergedMetadata() throws IOException {
  Path combinedFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(
      CONF, FILE_SCHEMA, combinedFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  ParquetMetadata combinedFooter = ParquetFileReader.readFooter(
      CONF, combinedFile, NO_FILTER);
  ParquetMetadata f1Footer = ParquetFileReader.readFooter(
      CONF, file1, NO_FILTER);
  ParquetMetadata f2Footer = ParquetFileReader.readFooter(
      CONF, file2, NO_FILTER);

  LinkedList<BlockMetaData> expectedRowGroups = new LinkedList<BlockMetaData>();
  expectedRowGroups.addAll(f1Footer.getBlocks());
  expectedRowGroups.addAll(f2Footer.getBlocks());

  Assert.assertEquals("Combined should have the right number of row groups",
      expectedRowGroups.size(),
      combinedFooter.getBlocks().size());

  long nextStart = 4;
  for (BlockMetaData rowGroup : combinedFooter.getBlocks()) {
    BlockMetaData expected = expectedRowGroups.removeFirst();
    Assert.assertEquals("Row count should match",
        expected.getRowCount(), rowGroup.getRowCount());
    Assert.assertEquals("Compressed size should match",
        expected.getCompressedSize(), rowGroup.getCompressedSize());
    Assert.assertEquals("Total size should match",
        expected.getTotalByteSize(), rowGroup.getTotalByteSize());
    Assert.assertEquals("Start pos should be at the last row group's end",
        nextStart, rowGroup.getStartingPos());
    assertColumnsEquivalent(expected.getColumns(), rowGroup.getColumns());
    nextStart = rowGroup.getStartingPos() + rowGroup.getTotalByteSize();
  }
}
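
The test starts its expectation at offset 4 because every Parquet file opens with the 4-byte magic "PAR1", so the first row group begins immediately after it. A sketch generalizing the assertion (checkContiguous is my name; contiguity holds for this append scenario, but writers that pad for block alignment can leave gaps):

static void checkContiguous(List<BlockMetaData> blocks) {
  long expectedStart = 4; // length of the "PAR1" magic at the start of the file
  for (BlockMetaData block : blocks) {
    if (block.getStartingPos() != expectedStart) {
      throw new IllegalStateException("row group at " + block.getStartingPos()
          + " does not start at expected offset " + expectedStart);
    }
    expectedStart = block.getStartingPos() + block.getTotalByteSize();
  }
}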
 
Example 6
Source File: ParquetMetadataCommand.java    From parquet-mr with Apache License 2.0
private void printRowGroup(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
  long start = rowGroup.getStartingPos();
  long rowCount = rowGroup.getRowCount();
  long compressedSize = rowGroup.getCompressedSize();
  long uncompressedSize = rowGroup.getTotalByteSize();
  String filePath = rowGroup.getPath();

  console.info(String.format("\nRow group %d:  count: %d  %s records  start: %d  total: %s%s\n%s",
      index, rowCount,
      humanReadable(((float) compressedSize) / rowCount),
      start, humanReadable(compressedSize),
      filePath != null ? " path: " + filePath : "",
      new TextStringBuilder(80).appendPadding(80, '-')));

  int size = maxSize(Iterables.transform(rowGroup.getColumns(),
      new Function<ColumnChunkMetaData, String>() {
        @Override
        public String apply(@Nullable ColumnChunkMetaData input) {
          return input == null ? "" : input.getPath().toDotString();
        }
      }));

  console.info(String.format("%-" + size + "s  %-9s %-9s %-9s %-10s %-7s %s",
      "", "type", "encodings", "count", "avg size", "nulls", "min / max"));
  for (ColumnChunkMetaData column : rowGroup.getColumns()) {
    printColumnChunk(console, size, column, schema);
  }
}
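
humanReadable here is a parquet-cli helper whose implementation is not shown. A rough, hypothetical stand-in for experimenting with this snippet outside parquet-cli:

static String humanReadable(float bytes) {
  // hypothetical: scales a byte count into B/K/M/G/T, dividing by 1024 per step
  String[] units = {"B", "K", "M", "G", "T"};
  int i = 0;
  while (bytes >= 1024 && i < units.length - 1) {
    bytes /= 1024;
    i++;
  }
  return String.format("%.3f %s", bytes, units[i]);
}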
 
Example 7
Source File: Metadata.java    From dremio-oss with Apache License 2.0
private ParquetFileMetadata getParquetFileMetadata(FileAttributes file, AtomicInteger currentNumSplits, long maxSplits) throws IOException {
  final ParquetMetadata metadata =
    SingletonParquetFooterCache.readFooter(fs, file, ParquetMetadataConverter.NO_FILTER, maxFooterLength);
  final int numSplits = currentNumSplits.addAndGet(metadata.getBlocks().size());
  if (numSplits > maxSplits) {
    throw new TooManySplitsException(
      String.format("Too many splits encountered when processing parquet metadata at file %s, maximum is %d but encountered %d splits thus far.",
        file.getPath(), maxSplits, numSplits));
  }

  final MessageType schema = metadata.getFileMetaData().getSchema();

  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }

  List<RowGroupMetadata> rowGroupMetadataList = Lists.newArrayList();

  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if (logger.isDebugEnabled()) {
    logger.debug(containsCorruptDates.toString());
  }
  final Map<ColumnTypeMetadata.Key, ColumnTypeMetadata> columnTypeInfo = Maps.newHashMap();
  int rowGroupIdx = 0;
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata columnMetadata;

      // Statistics might have only the non-null count, with min/max
      // initialized to zero instead of null, so check that they actually hold
      // a non-null value (or that the column is entirely null).
      boolean statsAvailable = col.getStatistics() != null && !col.getStatistics().isEmpty()
        && (col.getStatistics().hasNonNullValue()
          || col.getStatistics().getNumNulls() == rowGroup.getRowCount());

      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata columnTypeMetadata =
          new ColumnTypeMetadata(columnName, col.getType(), originalTypeMap.get(columnSchemaName));

      columnTypeInfo.put(new ColumnTypeMetadata.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only if minVal==maxVal. Also, we then store only maxVal
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null &&
            stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            mxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) mxValue);
          }
        }
        columnMetadata =
            new ColumnMetadata(columnTypeMetadata.name, mxValue, stats.getNumNulls());
      } else {
        // log at trace level to avoid generating a lot of log entries
        logger.trace("Stats are not available for column {}, rowGroupIdx {}, file {}",
            columnSchemaName, rowGroupIdx, file.getPath());
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name, null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }

    RowGroupMetadata rowGroupMeta =
        new RowGroupMetadata(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
            getHostAffinity(fs, file, rowGroup.getStartingPos(), length), columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
    rowGroupIdx++;
  }

  return new ParquetFileMetadata(file, file.size(), rowGroupMetadataList, columnTypeInfo);
}
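
The length paired with getStartingPos() above is the sum of the row group's column chunk sizes on disk. That per-row-group computation, pulled out as a standalone helper (rowGroupLength is my name):

static long rowGroupLength(BlockMetaData rowGroup) {
  long length = 0;
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    length += col.getTotalSize(); // compressed size of the chunk on disk
  }
  return length;
}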