org.apache.parquet.hadoop.metadata.BlockMetaData Java Examples

The following examples show how to use org.apache.parquet.hadoop.metadata.BlockMetaData. They are drawn from several open source projects; the source file and originating project are noted above each example.
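Before the project examples, here is a minimal sketch (not taken from any of the projects below) of how a list of BlockMetaData objects is usually obtained from a Parquet footer; the file path is hypothetical.

Configuration conf = new Configuration();
Path path = new Path("/tmp/example.parquet"); // hypothetical file
try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
  for (BlockMetaData block : reader.getFooter().getBlocks()) {
    // each BlockMetaData describes one row group: its position, row count and column chunks
    System.out.println("row group at " + block.getStartingPos()
        + ": " + block.getRowCount() + " rows, "
        + block.getColumns().size() + " column chunks");
  }
}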
Example #1
Source File: ParquetRecordReader.java    From parquet-mr with Apache License 2.0
private void checkDeltaByteArrayProblem(FileMetaData meta, Configuration conf, BlockMetaData block) {
  // splitting files?
  if (conf.getBoolean(ParquetInputFormat.SPLIT_FILES, true)) {
    // this is okay if not using DELTA_BYTE_ARRAY with the bug
    Set<Encoding> encodings = new HashSet<Encoding>();
    for (ColumnChunkMetaData column : block.getColumns()) {
      encodings.addAll(column.getEncodings());
    }
    for (Encoding encoding : encodings) {
      if (CorruptDeltaByteArrays.requiresSequentialReads(meta.getCreatedBy(), encoding)) {
        throw new ParquetDecodingException("Cannot read data due to " +
            "PARQUET-246: to read safely, set " + SPLIT_FILES + " to false");
      }
    }
  }
}
 
Example #2
Source File: ParquetReaderUtility.java    From dremio-oss with Apache License 2.0
/**
 * Get the list of row group numbers for a given file input split. The logic used here is the same as how Hive's
 * Parquet input format finds the row group numbers for an input split.
 */
public static List<Integer> getRowGroupNumbersFromFileSplit(final long splitStart, final long splitLength,
                                                             final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();
  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
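A hedged usage sketch (not part of the Dremio source): conf, filePath and split are assumed to exist, and the footer is read with the deprecated readFooter helper that also appears elsewhere on this page.

ParquetMetadata footer = ParquetFileReader.readFooter(conf, filePath, ParquetMetadataConverter.NO_FILTER);
// keep only the row groups whose first data page falls inside this input split
List<Integer> rowGroupNums = ParquetReaderUtility.getRowGroupNumbersFromFileSplit(
    split.getStart(), split.getLength(), footer);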
 
Example #3
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * @param configuration the Hadoop conf
 * @param fileMetaData fileMetaData for parquet file
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(
    Configuration configuration, FileMetaData fileMetaData,
    Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.converter = new ParquetMetadataConverter(configuration);
  this.file = HadoopInputFile.fromPath(filePath, configuration);
  this.fileMetaData = fileMetaData;
  this.f = file.newStream();
  this.options = HadoopReadOptions.builder(configuration).build();
  this.blocks = filterRowGroups(blocks);
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
 
Example #4
Source File: TestMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
@Test
public void testZeroRecordFile() {
  BlockMetaData emptyBlock = new BlockMetaData();
  emptyBlock.setRowCount(0);

  Expression[] exprs = new Expression[] {
      lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78),
      greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = new ParquetMetricsRowGroupFilter(SCHEMA, expr)
        .shouldRead(PARQUET_SCHEMA, emptyBlock);
    Assert.assertFalse("Should never read 0-record file: " + expr, shouldRead);
  }
}
 
Example #5
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visit(expr, this);
}
 
Example #6
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);

  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }

  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());

  fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));

  return fileMetaData;
}
 
Example #7
Source File: InternalParquetRecordReader.java    From tajo with Apache License 2.0
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
 
Example #8
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private static void serializeColumnIndexes(
    List<List<ColumnIndex>> columnIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: column indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<ColumnIndex> blockColumnIndexes = columnIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      org.apache.parquet.format.ColumnIndex columnIndex = ParquetMetadataConverter
          .toParquetColumnIndex(column.getPrimitiveType(), blockColumnIndexes.get(cIndex));
      if (columnIndex == null) {
        continue;
      }
      long offset = out.getPos();
      Util.writeColumnIndex(columnIndex, out);
      column.setColumnIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
 
Example #9
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private static void serializeBloomFilters(
  List<Map<String, BloomFilter>> bloomFilters,
  List<BlockMetaData> blocks,
  PositionOutputStream out) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
    if (blockBloomFilters.isEmpty()) continue;
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
      if (bloomFilter == null) {
        continue;
      }

      long offset = out.getPos();
      column.setBloomFilterOffset(offset);
      Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out);
      bloomFilter.writeTo(out);
    }
  }
}
 
Example #10
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata.
 * Requires that the schemas be compatible and the extraMetadata be exactly equal.
 * @param files a list of files to merge metadata from
 * @param conf a configuration
 * @return merged parquet metadata for the files
 * @throws IOException if there is an error while writing
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files,  Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");

  GlobalMetaData globalMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();

  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }

  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(), blocks);
}
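A minimal usage sketch (not from the parquet-mr source); the metadata file paths below are hypothetical.

List<Path> metadataFiles = Arrays.asList(
    new Path("/data/table/part-0/_metadata"),
    new Path("/data/table/part-1/_metadata"));
// throws if the schemas or extra metadata of the files are not compatible
ParquetMetadata merged = ParquetFileWriter.mergeMetadataFiles(metadataFiles, new Configuration());
System.out.println("merged row groups: " + merged.getBlocks().size());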
 
Example #11
Source File: ParquetReader.java    From tajo with Apache License 2.0
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
  this.footersIterator = footers.iterator();

  for (Footer footer : footers) {
    for(BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      totalRowCount += block.getRowCount();
    }
  }
}
 
Example #12
Source File: ReadConf.java    From iceberg with Apache License 2.0
private List<Map<ColumnPath, ColumnChunkMetaData>> getColumnChunkMetadataForRowGroups() {
  Set<ColumnPath> projectedColumns = projection.getColumns().stream()
      .map(columnDescriptor -> ColumnPath.get(columnDescriptor.getPath())).collect(Collectors.toSet());
  ImmutableList.Builder<Map<ColumnPath, ColumnChunkMetaData>> listBuilder = ImmutableList.builder();
  for (int i = 0; i < rowGroups.size(); i++) {
    if (!shouldSkip[i]) {
      BlockMetaData blockMetaData = rowGroups.get(i);
      ImmutableMap.Builder<ColumnPath, ColumnChunkMetaData> mapBuilder = ImmutableMap.builder();
      blockMetaData.getColumns().stream()
          .filter(columnChunkMetaData -> projectedColumns.contains(columnChunkMetaData.getPath()))
          .forEach(columnChunkMetaData -> mapBuilder.put(columnChunkMetaData.getPath(), columnChunkMetaData));
      listBuilder.add(mapBuilder.build());
    } else {
      listBuilder.add(ImmutableMap.of());
    }
  }
  return listBuilder.build();
}
 
Example #13
Source File: PrintFooter.java    From parquet-mr with Apache License 2.0
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++ blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
 
Example #14
Source File: TestMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
@Test
public void testZeroRecordFileParquet() {
  Assume.assumeTrue(format == FileFormat.PARQUET);
  BlockMetaData emptyBlock = new BlockMetaData();
  emptyBlock.setRowCount(0);

  Expression[] exprs = new Expression[] {
      lessThan("id", 5), lessThanOrEqual("id", 30), equal("id", 70), greaterThan("id", 78),
      greaterThanOrEqual("id", 90), notEqual("id", 101), isNull("some_nulls"),
      notNull("some_nulls")
  };

  for (Expression expr : exprs) {
    boolean shouldRead = shouldReadParquet(expr, true, parquetSchema, emptyBlock);
    Assert.assertFalse("Should never read 0-record file: " + expr, shouldRead);
  }
}
 
Example #15
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
/**
 * groups together all the data blocks for the same HDFS block
 *
 * @param rowGroupBlocks      data blocks (row groups)
 * @param hdfsBlocksArray     hdfs blocks
 * @param fileStatus          the containing file
 * @param requestedSchema     the schema requested by the user
 * @param readSupportMetadata the metadata provided by the readSupport implementation in init
 * @param minSplitSize        the mapred.min.split.size
 * @param maxSplitSize        the mapred.max.split.size
 * @return the splits (one per HDFS block)
 * @throws IOException If hosts can't be retrieved for the HDFS block
 */
static <T> List<ParquetInputSplit> generateSplits(
        List<BlockMetaData> rowGroupBlocks,
        BlockLocation[] hdfsBlocksArray,
        FileStatus fileStatus,
        String requestedSchema,
        Map<String, String> readSupportMetadata, long minSplitSize, long maxSplitSize) throws IOException {

  List<SplitInfo> splitRowGroups =
      generateSplitInfo(rowGroupBlocks, hdfsBlocksArray, minSplitSize, maxSplitSize);

  //generate splits from rowGroups of each split
  List<ParquetInputSplit> resultSplits = new ArrayList<ParquetInputSplit>();
  for (SplitInfo splitInfo : splitRowGroups) {
    ParquetInputSplit split = splitInfo.getParquetInputSplit(fileStatus, requestedSchema, readSupportMetadata);
    resultSplits.add(split);
  }
  return resultSplits;
}
 
Example #16
Source File: PredicateUtils.java    From presto with Apache License 2.0
private static boolean dictionaryPredicatesMatch(Predicate parquetPredicate, BlockMetaData blockMetadata, ParquetDataSource dataSource, Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<ColumnDescriptor> parquetTupleDomain)
{
    for (ColumnChunkMetaData columnMetaData : blockMetadata.getColumns()) {
        RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(columnMetaData.getPath().toArray()));
        if (descriptor != null) {
            if (isOnlyDictionaryEncodingPages(columnMetaData) && isColumnPredicate(descriptor, parquetTupleDomain)) {
                byte[] buffer = new byte[toIntExact(columnMetaData.getTotalSize())];
                dataSource.readFully(columnMetaData.getStartingPos(), buffer);
                //  Early abort, predicate already filters block so no more dictionaries need be read
                if (!parquetPredicate.matches(new DictionaryDescriptor(descriptor, readDictionaryPage(buffer, columnMetaData.getCodec())))) {
                    return false;
                }
            }
        }
    }
    return true;
}
 
Example #17
Source File: CompressionConverter.java    From parquet-mr with Apache License 2.0
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta, MessageType schema,
                           String createdBy, CompressionCodecName codecName) throws IOException {
  int blockIndex = 0;
  PageReadStore store = reader.readNextRowGroup();
  while (store != null) {
    writer.startBlock(store.getRowCount());
    BlockMetaData blockMetaData = meta.getBlocks().get(blockIndex);
    List<ColumnChunkMetaData> columnsInOrder = blockMetaData.getColumns();
    Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(
      Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
      ColumnChunkMetaData chunk = columnsInOrder.get(i);
      ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
      ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
      writer.startColumn(columnDescriptor, crstore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
      processChunk(reader, writer, chunk, createdBy, codecName);
      writer.endColumn();
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
  }
}
 
Example #18
Source File: ParquetInputSplit.java    From parquet-mr with Apache License 2.0
private static long end(List<BlockMetaData> blocks, String requestedSchema) {
  MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
  long length = 0;

  for (BlockMetaData block : blocks) {
    List<ColumnChunkMetaData> columns = block.getColumns();
    for (ColumnChunkMetaData column : columns) {
      if (requested.containsPath(column.getPath().toArray())) {
        length += column.getTotalSize();
      }
    }
  }
  return length;
}
 
Example #19
Source File: ColumnIndexStoreImpl.java    From parquet-mr with Apache License 2.0
static ColumnIndexStore create(ParquetFileReader reader, BlockMetaData block, Set<ColumnPath> paths) {
  try {
    return new ColumnIndexStoreImpl(reader, block, paths);
  } catch (MissingOffsetIndexException e) {
    return EMPTY;
  }
}
 
Example #20
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
public ParquetInputSplit getParquetInputSplit(FileStatus fileStatus, String requestedSchema, Map<String, String> readSupportMetadata) throws IOException {
  MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
  long length = 0;

  for (BlockMetaData block : this.getRowGroups()) {
    List<ColumnChunkMetaData> columns = block.getColumns();
    for (ColumnChunkMetaData column : columns) {
      if (requested.containsPath(column.getPath().toArray())) {
        length += column.getTotalSize();
      }
    }
  }

  BlockMetaData lastRowGroup = this.getRowGroups().get(this.getRowGroupCount() - 1);
  long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();

  long[] rowGroupOffsets = new long[this.getRowGroupCount()];
  for (int i = 0; i < rowGroupOffsets.length; i++) {
    rowGroupOffsets[i] = this.getRowGroups().get(i).getStartingPos();
  }

  return new ParquetInputSplit(
          fileStatus.getPath(),
          hdfsBlock.getOffset(),
          end,
          length,
          hdfsBlock.getHosts(),
          rowGroupOffsets
  );
}
 
Example #21
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
private static void checkSorted(List<BlockMetaData> rowGroupBlocks) {
  long previousOffset = 0L;
  for(BlockMetaData rowGroup: rowGroupBlocks) {
    long currentOffset = rowGroup.getStartingPos();
    if (currentOffset < previousOffset) {
      throw new ParquetDecodingException("row groups are not sorted: previous row groups starts at " + previousOffset + ", current row group starts at " + currentOffset);
    }
  }
}
 
Example #22
Source File: BloomFilterReader.java    From parquet-mr with Apache License 2.0
public BloomFilterReader(ParquetFileReader fileReader, BlockMetaData block) {
  this.reader = fileReader;
  this.columns = new HashMap<>();
  for (ColumnChunkMetaData column : block.getColumns()) {
    columns.put(column.getPath(), column);
  }
}
 
Example #23
Source File: ParquetInputSplit.java    From parquet-mr with Apache License 2.0
/**
 * For compatibility only;
 * use {@link ParquetInputSplit#ParquetInputSplit(Path, long, long, long, String[], long[])}
 * @param path a Path
 * @param start split start location
 * @param length split length
 * @param hosts locality information for this split
 * @param blocks Parquet blocks in this split
 * @param requestedSchema the requested schema
 * @param fileSchema the file schema
 * @param extraMetadata string map of file metadata
 * @param readSupportMetadata string map of metadata from read support
 */
@Deprecated
public ParquetInputSplit(
    Path path,
    long start,
    long length,
    String[] hosts,
    List<BlockMetaData> blocks,
    String requestedSchema,
    String fileSchema,
    Map<String, String> extraMetadata,
    Map<String, String> readSupportMetadata) {
  this(path, start, end(blocks, requestedSchema), length, hosts, offsets(blocks));
}
 
Example #24
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0
static void showDetails(PrettyPrintWriter out, ParquetMetadata meta, boolean showOriginalTypes) {
  showDetails(out, meta.getFileMetaData(), showOriginalTypes);

  long i = 1;
  for (BlockMetaData bmeta : meta.getBlocks()) {
    out.println();
    showDetails(out, bmeta, i++);
  }
}
 
Example #25
Source File: ColumnSizeCommand.java    From parquet-mr with Apache License 2.0
public Map<String, Long> getColumnSizeInBytes(Path inputFile) throws IOException {
  Map<String, Long> colSizes = new HashMap<>();
  ParquetMetadata pmd = ParquetFileReader.readFooter(new Configuration(), inputFile, ParquetMetadataConverter.NO_FILTER);

  for (BlockMetaData block : pmd.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      String colName = column.getPath().toDotString();
      colSizes.put(colName, column.getTotalSize() + colSizes.getOrDefault(colName, 0L));
    }
  }

  return colSizes;
}
 
Example #26
Source File: RowCountCommand.java    From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
  long rowCount = 0;

  for (FileStatus fs : inputFileStatuses) {
    long fileRowCount=0;
    for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
      for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
        rowCount += b.getRowCount();
        fileRowCount += b.getRowCount();
      }
    }
    if (options.hasOption('d')) {
      out.format("%s row count: %d\n", fs.getPath().getName(), fileRowCount);
    }
  }

  out.format("Total RowCount: %d", rowCount);
  out.println();
}
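For comparison, a minimal sketch of the same row count for a single file using ParquetFileReader directly instead of the command-line plumbing; the path is hypothetical and this snippet is not part of the tool above.

try (ParquetFileReader reader = ParquetFileReader.open(
    HadoopInputFile.fromPath(new Path("/tmp/example.parquet"), new Configuration()))) {
  long rows = 0;
  // sum the row counts of all row groups recorded in the footer
  for (BlockMetaData block : reader.getFooter().getBlocks()) {
    rows += block.getRowCount();
  }
  System.out.println("rows: " + rows);
}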
 
Example #27
Source File: ReadState.java    From Bats with Apache License 2.0
/**
 * Create the readers needed to read columns: fixed-length or variable length.
 *
 * @param reader
 * @param output
 * @throws Exception
 */

@SuppressWarnings("unchecked")
public void buildReader(ParquetRecordReader reader, OutputMutator output) throws Exception {
  final ArrayList<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
  // initialize all of the column read status objects
  BlockMetaData rowGroupMetadata = schema.getRowGroupMetadata();
  Map<String, Integer> columnChunkMetadataPositionsInList = schema.buildChunkMap(rowGroupMetadata);
  for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) {
    ColumnDescriptor column = columnMetadata.column;
    columnMetadata.columnChunkMetaData = rowGroupMetadata.getColumns().get(
                    columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
    columnMetadata.buildVector(output);
    if (! columnMetadata.isFixedLength( )) {
      // create a reader and add it to the appropriate list
      varLengthColumns.add(columnMetadata.makeVariableWidthReader(reader));
    } else if (columnMetadata.isRepeated()) {
      varLengthColumns.add(columnMetadata.makeRepeatedFixedWidthReader(reader));
    }
    else {
      fixedLenColumnReaders.add(columnMetadata.makeFixedWidthReader(reader));
    }
  }
  varLengthReader = new VarLenBinaryReader(reader, varLengthColumns);
  if (! schema.isStarQuery()) {
    schema.createNonExistentColumns(output, nullFilledVectors);
  }
}
 
Example #28
Source File: SizeCommand.java    From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
  long size = 0;
  for (FileStatus fs : inputFileStatuses) {
    long fileSize = 0;
    for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
      for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
        size += (options.hasOption('u') ? b.getTotalByteSize() : b.getCompressedSize());
        fileSize += (options.hasOption('u') ? b.getTotalByteSize() : b.getCompressedSize());
      }
    }
    if (options.hasOption('d')) {
      if (options.hasOption('p')) {
        out.format("%s: %s\n", fs.getPath().getName(), getPrettySize(fileSize));
      }
      else {
        out.format("%s: %d bytes\n", fs.getPath().getName(), fileSize);
      }
    }
  }

  if (options.hasOption('p')) {
    out.format("Total Size: %s", getPrettySize(size));
  }
  else {
    out.format("Total Size: %d bytes", size);
  }
  out.println();
}
 
Example #29
Source File: DictionaryPageReader.java    From parquet-mr with Apache License 2.0
/**
 * Instantiate a new DictionaryPageReader.
 *
 * @param reader The target ParquetFileReader
 * @param block The target BlockMetaData
 *
 * @throws NullPointerException if {@code reader} or {@code block} is
 *           {@code null}
 */
DictionaryPageReader(ParquetFileReader reader, BlockMetaData block) {
  this.reader = Objects.requireNonNull(reader);
  this.columns = new HashMap<>();
  this.dictionaryPageCache = new ConcurrentHashMap<>();

  for (ColumnChunkMetaData column : block.getColumns()) {
    columns.put(column.getPath().toDotString(), column);
  }
}