Java Code Examples for org.apache.parquet.hadoop.metadata.BlockMetaData#getRowCount()

The following examples show how to use org.apache.parquet.hadoop.metadata.BlockMetaData#getRowCount(). Each example is drawn from an open-source project; the source file, project, and license are noted above each snippet.
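Before the project examples, here is a minimal, self-contained sketch of the pattern most of them share: read only a file's footer, then sum getRowCount() over its row groups. The file path and class name are illustrative; readFooter with NO_FILTER is used the same way in Example 7 below, though newer parquet-mr releases deprecate it in favor of ParquetFileReader.open.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class RowCountSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/example.parquet"); // illustrative path

    // Row counts live in the footer's per-row-group metadata,
    // so no data pages need to be read.
    ParquetMetadata footer =
        ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);

    long totalRows = 0;
    for (BlockMetaData block : footer.getBlocks()) {
      totalRows += block.getRowCount();
    }
    System.out.println("Total rows: " + totalRows);
  }
}
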
Example 1
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, ParquetConversions.converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visitEvaluator(expr, this);
}
 
Example 2
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visit(expr, this);
}
 
Example 3
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);

  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }

  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());

  fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));

  return fileMetaData;
}
 
Example 4
Source File: InternalParquetRecordReader.java    From tajo with Apache License 2.0
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
 
Example 5
Source File: ParquetReader.java    From tajo with Apache License 2.0
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
  this.footersIterator = footers.iterator();

  for (Footer footer : footers) {
    for(BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      totalRowCount += block.getRowCount();
    }
  }
}
 
Example 6
Source File: PrintFooter.java    From parquet-mr with Apache License 2.0
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++ blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
 
Example 7
Source File: TestConvertAvroToParquet.java    From nifi with Apache License 2.0
@Test
public void test_Meta_Info() throws Exception {

    FileInputStream fileInputStream = new FileInputStream(tmpAvro);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    int bytesRead;
    byte[] buf = new byte[1024];
    while ((bytesRead = fileInputStream.read(buf)) > 0) {
        out.write(buf, 0, bytesRead);
    }
    out.close();

    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test.avro");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToParquet.SUCCESS).get(0);

    // Save the flowfile
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream(tmpParquet);
    fos.write(resultContents);
    fos.flush();
    fos.close();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    ParquetMetadata metaData;
    metaData = ParquetFileReader.readFooter(conf, new Path(tmpParquet.getAbsolutePath()), NO_FILTER);

    // #number of records
    long nParquetRecords = 0;
    for(BlockMetaData meta : metaData.getBlocks()){
        nParquetRecords += meta.getRowCount();
    }
    long nAvroRecord = records.size();

    assertEquals(nParquetRecords, nAvroRecord);
}
 
Example 8
Source File: ParquetMetadataCommand.java    From parquet-mr with Apache License 2.0
private void printRowGroup(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
  long start = rowGroup.getStartingPos();
  long rowCount = rowGroup.getRowCount();
  long compressedSize = rowGroup.getCompressedSize();
  long uncompressedSize = rowGroup.getTotalByteSize();
  String filePath = rowGroup.getPath();

  console.info(String.format("\nRow group %d:  count: %d  %s records  start: %d  total: %s%s\n%s",
      index, rowCount,
      humanReadable(((float) compressedSize) / rowCount),
      start, humanReadable(compressedSize),
      filePath != null ? " path: " + filePath : "",
      new TextStringBuilder(80).appendPadding(80, '-')));

  int size = maxSize(Iterables.transform(rowGroup.getColumns(),
      new Function<ColumnChunkMetaData, String>() {
        @Override
        public String apply(@Nullable ColumnChunkMetaData input) {
          return input == null ? "" : input.getPath().toDotString();
        }
      }));

  console.info(String.format("%-" + size + "s  %-9s %-9s %-9s %-10s %-7s %s",
      "", "type", "encodings", "count", "avg size", "nulls", "min / max"));
  for (ColumnChunkMetaData column : rowGroup.getColumns()) {
    printColumnChunk(console, size, column, schema);
  }
}
 
Example 9
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
public long getRecordCount() {
  long total = 0;
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  return total;
}
 
Example 10
Source File: RowCountCommand.java    From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
  long rowCount = 0;

  for (FileStatus fs : inputFileStatuses) {
    long fileRowCount=0;
    for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
      for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
        rowCount += b.getRowCount();
        fileRowCount += b.getRowCount();
      }
    }
    if (options.hasOption('d')) {
      out.format("%s row count: %d\n", fs.getPath().getName(), fileRowCount);
    }
  }

  out.format("Total RowCount: %d", rowCount);
  out.println();
}
 
Example 11
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0
private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num) {
  long rows = meta.getRowCount();
  long tbs = meta.getTotalByteSize();
  long offset = meta.getStartingPos();

  out.format("row group%s: RC:%d TS:%d OFFSET:%d%n", (num == null ? "" : " " + num), rows, tbs, offset);
  out.rule('-');
  showDetails(out, meta.getColumns());
}
 
Example 12
Source File: ParquetColumnarRowSplitReader.java    From flink with Apache License 2.0
/**
 * Seek to a particular row number.
 */
public void seekToRow(long rowCount) throws IOException {
	if (totalCountLoadedSoFar != 0) {
		throw new UnsupportedOperationException("Only support seek at first.");
	}

	List<BlockMetaData> blockMetaData = reader.getRowGroups();

	for (BlockMetaData metaData : blockMetaData) {
		if (metaData.getRowCount() > rowCount) {
			break;
		} else {
			reader.skipNextRowGroup();
			rowsReturned += metaData.getRowCount();
			totalCountLoadedSoFar += metaData.getRowCount();
			rowsInBatch = (int) metaData.getRowCount();
			nextRow = (int) metaData.getRowCount();
			rowCount -= metaData.getRowCount();
		}
	}
	for (int i = 0; i < rowCount; i++) {
		boolean end = reachedEnd();
		if (end) {
			throw new RuntimeException("Seek to many rows.");
		}
		nextRecord();
	}
}
 
Example 13
Source File: ParquetColumnarRowSplitReader.java    From flink with Apache License 2.0
public ParquetColumnarRowSplitReader(
		boolean utcTimestamp,
		boolean caseSensitive,
		Configuration conf,
		LogicalType[] selectedTypes,
		String[] selectedFieldNames,
		ColumnBatchGenerator generator,
		int batchSize,
		Path path,
		long splitStart,
		long splitLength) throws IOException {
	this.utcTimestamp = utcTimestamp;
	this.selectedTypes = selectedTypes;
	this.batchSize = batchSize;
	// then we need to apply the predicate push down filter
	ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength));
	MessageType fileSchema = footer.getFileMetaData().getSchema();
	FilterCompat.Filter filter = getFilter(conf);
	List<BlockMetaData> blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);

	this.fileSchema = footer.getFileMetaData().getSchema();
	this.requestedSchema = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive);
	this.reader = new ParquetFileReader(
			conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());

	long totalRowCount = 0;
	for (BlockMetaData block : blocks) {
		totalRowCount += block.getRowCount();
	}
	this.totalRowCount = totalRowCount;
	this.nextRow = 0;
	this.rowsInBatch = 0;
	this.rowsReturned = 0;

	checkSchema();

	this.writableVectors = createWritableVectors();
	this.columnarBatch = generator.generate(createReadableVectors());
	this.row = new ColumnarRowData(columnarBatch);
}
 
Example 14
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * Reads all the columns requested from the row group at the current file position.
 * @throws IOException if an error occurs while reading
 * @return the PageReadStore which can provide PageReaders for each column.
 */
public PageReadStore readNextRowGroup() throws IOException {
  if (currentBlock == blocks.size()) {
    return null;
  }
  BlockMetaData block = blocks.get(currentBlock);
  if (block.getRowCount() == 0) {
    throw new RuntimeException("Illegal row group of 0 rows");
  }
  this.currentRowGroup = new ColumnChunkPageReadStore(block.getRowCount());
  // prepare the list of consecutive parts to read them in one scan
  List<ConsecutivePartList> allParts = new ArrayList<ConsecutivePartList>();
  ConsecutivePartList currentParts = null;
  for (ColumnChunkMetaData mc : block.getColumns()) {
    ColumnPath pathKey = mc.getPath();
    BenchmarkCounter.incrementTotalBytes(mc.getTotalSize());
    ColumnDescriptor columnDescriptor = paths.get(pathKey);
    if (columnDescriptor != null) {
      long startingPos = mc.getStartingPos();
      // first part or not consecutive => new list
      if (currentParts == null || currentParts.endPos() != startingPos) {
        currentParts = new ConsecutivePartList(startingPos);
        allParts.add(currentParts);
      }
      currentParts.addChunk(new ChunkDescriptor(columnDescriptor, mc, startingPos, (int)mc.getTotalSize()));
    }
  }
  // actually read all the chunks
  ChunkListBuilder builder = new ChunkListBuilder();
  for (ConsecutivePartList consecutiveChunks : allParts) {
    consecutiveChunks.readAll(f, builder);
  }
  for (Chunk chunk : builder.build()) {
    currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
  }

  // avoid re-reading bytes: the dictionary reader is used after this call
  if (nextDictionaryReader != null) {
    nextDictionaryReader.setRowGroup(currentRowGroup);
  }

  advanceToNextBlock();

  return currentRowGroup;
}
 
Example 15
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * Reads all the columns requested from the row group at the current file position. It may skip specific pages based
 * on the column indexes according to the actual filter. As the rows are not aligned among the pages of the different
 * columns row synchronization might be required. See the documentation of the class SynchronizingColumnReader for
 * details.
 *
 * @return the PageReadStore which can provide PageReaders for each column
 * @throws IOException
 *           if any I/O error occurs while reading
 */
public PageReadStore readNextFilteredRowGroup() throws IOException {
  if (currentBlock == blocks.size()) {
    return null;
  }
  if (!options.useColumnIndexFilter()) {
    return readNextRowGroup();
  }
  BlockMetaData block = blocks.get(currentBlock);
  if (block.getRowCount() == 0) {
    throw new RuntimeException("Illegal row group of 0 rows");
  }
  ColumnIndexStore ciStore = getColumnIndexStore(currentBlock);
  RowRanges rowRanges = getRowRanges(currentBlock);
  long rowCount = rowRanges.rowCount();
  if (rowCount == 0) {
    // There are no matching rows -> skipping this row-group
    advanceToNextBlock();
    return readNextFilteredRowGroup();
  }
  if (rowCount == block.getRowCount()) {
    // All rows are matching -> fall back to the non-filtering path
    return readNextRowGroup();
  }

  this.currentRowGroup = new ColumnChunkPageReadStore(rowRanges);
  // prepare the list of consecutive parts to read them in one scan
  ChunkListBuilder builder = new ChunkListBuilder();
  List<ConsecutivePartList> allParts = new ArrayList<ConsecutivePartList>();
  ConsecutivePartList currentParts = null;
  for (ColumnChunkMetaData mc : block.getColumns()) {
    ColumnPath pathKey = mc.getPath();
    ColumnDescriptor columnDescriptor = paths.get(pathKey);
    if (columnDescriptor != null) {
      OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath());

      OffsetIndex filteredOffsetIndex = filterOffsetIndex(offsetIndex, rowRanges,
          block.getRowCount());
      for (OffsetRange range : calculateOffsetRanges(filteredOffsetIndex, mc, offsetIndex.getOffset(0))) {
        BenchmarkCounter.incrementTotalBytes(range.getLength());
        long startingPos = range.getOffset();
        // first part or not consecutive => new list
        if (currentParts == null || currentParts.endPos() != startingPos) {
          currentParts = new ConsecutivePartList(startingPos);
          allParts.add(currentParts);
        }
        ChunkDescriptor chunkDescriptor = new ChunkDescriptor(columnDescriptor, mc, startingPos,
            (int) range.getLength());
        currentParts.addChunk(chunkDescriptor);
        builder.setOffsetIndex(chunkDescriptor, filteredOffsetIndex);
      }
    }
  }
  // actually read all the chunks
  for (ConsecutivePartList consecutiveChunks : allParts) {
    consecutiveChunks.readAll(f, builder);
  }
  for (Chunk chunk : builder.build()) {
    currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
  }

  // avoid re-reading bytes: the dictionary reader is used after this call
  if (nextDictionaryReader != null) {
    nextDictionaryReader.setRowGroup(currentRowGroup);
  }

  advanceToNextBlock();

  return currentRowGroup;
}
 
Example 16
Source File: Metadata.java    From dremio-oss with Apache License 2.0
private ParquetFileMetadata getParquetFileMetadata(FileAttributes file, AtomicInteger currentNumSplits, long maxSplits) throws IOException {
  final ParquetMetadata metadata =
    SingletonParquetFooterCache.readFooter(fs, file, ParquetMetadataConverter.NO_FILTER, maxFooterLength);
  final int numSplits = currentNumSplits.addAndGet(metadata.getBlocks().size());
  if (numSplits > maxSplits) {
    throw new TooManySplitsException(
      String.format("Too many splits encountered when processing parquet metadata at file %s, maximum is %d but encountered %d splits thus far.",
        file.getPath(), maxSplits, numSplits));
  }

  final MessageType schema = metadata.getFileMetaData().getSchema();

  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  schema.getPaths();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }

  List<RowGroupMetadata> rowGroupMetadataList = Lists.newArrayList();

  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if(logger.isDebugEnabled()){
    logger.debug(containsCorruptDates.toString());
  }
  final Map<ColumnTypeMetadata.Key, ColumnTypeMetadata> columnTypeInfo = Maps.newHashMap();
  int rowGroupIdx = 0;
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata columnMetadata;

      // Statistics might only have non-null counts with no min/max; they might be
      // initialized to zero instead of null.
      // Check that statistics actually have non-null values, or that the column is all nulls.
      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty()
        && (col.getStatistics().hasNonNullValue()) || col.getStatistics().getNumNulls() ==
        rowGroup.getRowCount());

      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata columnTypeMetadata =
          new ColumnTypeMetadata(columnName, col.getType(), originalTypeMap.get(columnSchemaName));

      columnTypeInfo.put(new ColumnTypeMetadata.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only if minVal==maxVal. Also, we then store only maxVal
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null &&
            stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            mxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) mxValue);
          }
        }
        columnMetadata =
            new ColumnMetadata(columnTypeMetadata.name, mxValue, stats.getNumNulls());
      } else {
        // log it under trace to avoid lot of log entries.
        logger.trace("Stats are not available for column {}, rowGroupIdx {}, file {}",
            columnSchemaName, rowGroupIdx, file.getPath());
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name,null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }

    RowGroupMetadata rowGroupMeta =
        new RowGroupMetadata(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
            getHostAffinity(fs, file, rowGroup.getStartingPos(), length), columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
    rowGroupIdx++;
  }

  return new ParquetFileMetadata(file, file.size(), rowGroupMetadataList, columnTypeInfo);
}
 
Example 17
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) {
  //rowGroup.total_byte_size = ;
  List<ColumnChunkMetaData> columns = block.getColumns();
  List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
  for (ColumnChunkMetaData columnMetaData : columns) {
    ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset()); // verify this is the right offset
    columnChunk.file_path = block.getPath(); // they are in the same file for now
    columnChunk.meta_data = new ColumnMetaData(
        getType(columnMetaData.getType()),
        toFormatEncodings(columnMetaData.getEncodings()),
        Arrays.asList(columnMetaData.getPath().toArray()),
        toFormatCodec(columnMetaData.getCodec()),
        columnMetaData.getValueCount(),
        columnMetaData.getTotalUncompressedSize(),
        columnMetaData.getTotalSize(),
        columnMetaData.getFirstDataPageOffset());
    if (columnMetaData.getEncodingStats() != null && columnMetaData.getEncodingStats().hasDictionaryPages()) {
      columnChunk.meta_data.setDictionary_page_offset(columnMetaData.getDictionaryPageOffset());
    }
    columnChunk.meta_data.setBloom_filter_offset(columnMetaData.getBloomFilterOffset());
    if (!columnMetaData.getStatistics().isEmpty()) {
      columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics(), this.statisticsTruncateLength));
    }
    if (columnMetaData.getEncodingStats() != null) {
      columnChunk.meta_data.setEncoding_stats(convertEncodingStats(columnMetaData.getEncodingStats()));
    }
//    columnChunk.meta_data.index_page_offset = ;
//    columnChunk.meta_data.key_value_metadata = ; // nothing yet

    IndexReference columnIndexRef = columnMetaData.getColumnIndexReference();
    if (columnIndexRef != null) {
      columnChunk.setColumn_index_offset(columnIndexRef.getOffset());
      columnChunk.setColumn_index_length(columnIndexRef.getLength());
    }
    IndexReference offsetIndexRef = columnMetaData.getOffsetIndexReference();
    if (offsetIndexRef != null) {
      columnChunk.setOffset_index_offset(offsetIndexRef.getOffset());
      columnChunk.setOffset_index_length(offsetIndexRef.getLength());
    }

    parquetColumns.add(columnChunk);
  }
  RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
  rowGroups.add(rowGroup);
}
 
Example 18
Source File: ParquetReader.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
ReadConf(InputFile file, ParquetReadOptions options, Schema expectedSchema, Expression filter,
         Function<MessageType, ParquetValueReader<?>> readerFunc, boolean reuseContainers) {
  this.file = file;
  this.options = options;
  this.reader = newReader(file, options);

  MessageType fileSchema = reader.getFileMetaData().getSchema();

  boolean hasIds = hasIds(fileSchema);
  MessageType typeWithIds = hasIds ? fileSchema : addFallbackIds(fileSchema);

  this.projection = hasIds ?
      pruneColumns(fileSchema, expectedSchema) :
      pruneColumnsFallback(fileSchema, expectedSchema);
  this.model = (ParquetValueReader<T>) readerFunc.apply(typeWithIds);
  this.rowGroups = reader.getRowGroups();
  this.shouldSkip = new boolean[rowGroups.size()];

  ParquetMetricsRowGroupFilter statsFilter = null;
  ParquetDictionaryRowGroupFilter dictFilter = null;
  if (filter != null) {
    statsFilter = new ParquetMetricsRowGroupFilter(expectedSchema, filter);
    dictFilter = new ParquetDictionaryRowGroupFilter(expectedSchema, filter);
  }

  long totalValues = 0L;
  for (int i = 0; i < shouldSkip.length; i += 1) {
    BlockMetaData rowGroup = rowGroups.get(i);
    boolean shouldRead = filter == null || (
        statsFilter.shouldRead(typeWithIds, rowGroup) &&
        dictFilter.shouldRead(typeWithIds, rowGroup, reader.getDictionaryReader(rowGroup)));
    this.shouldSkip[i] = !shouldRead;
    if (shouldRead) {
      totalValues += rowGroup.getRowCount();
    }
  }

  this.totalValues = totalValues;
  this.reuseContainers = reuseContainers;
}
 
Example 19
Source File: ReadConf.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
ReadConf(InputFile file, ParquetReadOptions options, Schema expectedSchema, Expression filter,
         Function<MessageType, ParquetValueReader<?>> readerFunc, Function<MessageType,
         VectorizedReader<?>> batchedReaderFunc, NameMapping nameMapping, boolean reuseContainers,
         boolean caseSensitive, Integer bSize) {
  this.file = file;
  this.options = options;
  this.reader = newReader(file, options);
  MessageType fileSchema = reader.getFileMetaData().getSchema();

  MessageType typeWithIds;
  if (ParquetSchemaUtil.hasIds(fileSchema)) {
    typeWithIds = fileSchema;
    this.projection = ParquetSchemaUtil.pruneColumns(fileSchema, expectedSchema);
  } else if (nameMapping != null) {
    typeWithIds = ParquetSchemaUtil.applyNameMapping(fileSchema, nameMapping);
    this.projection = ParquetSchemaUtil.pruneColumns(typeWithIds, expectedSchema);
  } else {
    typeWithIds = ParquetSchemaUtil.addFallbackIds(fileSchema);
    this.projection = ParquetSchemaUtil.pruneColumnsFallback(fileSchema, expectedSchema);
  }

  this.rowGroups = reader.getRowGroups();
  this.shouldSkip = new boolean[rowGroups.size()];

  ParquetMetricsRowGroupFilter statsFilter = null;
  ParquetDictionaryRowGroupFilter dictFilter = null;
  if (filter != null) {
    statsFilter = new ParquetMetricsRowGroupFilter(expectedSchema, filter, caseSensitive);
    dictFilter = new ParquetDictionaryRowGroupFilter(expectedSchema, filter, caseSensitive);
  }

  long computedTotalValues = 0L;
  for (int i = 0; i < shouldSkip.length; i += 1) {
    BlockMetaData rowGroup = rowGroups.get(i);
    boolean shouldRead = filter == null || (
        statsFilter.shouldRead(typeWithIds, rowGroup) &&
            dictFilter.shouldRead(typeWithIds, rowGroup, reader.getDictionaryReader(rowGroup)));
    this.shouldSkip[i] = !shouldRead;
    if (shouldRead) {
      computedTotalValues += rowGroup.getRowCount();
    }
  }

  this.totalValues = computedTotalValues;
  if (readerFunc != null) {
    this.model = (ParquetValueReader<T>) readerFunc.apply(typeWithIds);
    this.vectorizedModel = null;
    this.columnChunkMetaDataForRowGroups = null;
  } else {
    this.model = null;
    this.vectorizedModel = (VectorizedReader<T>) batchedReaderFunc.apply(typeWithIds);
    this.columnChunkMetaDataForRowGroups = getColumnChunkMetadataForRowGroups();
  }

  this.reuseContainers = reuseContainers;
  this.batchSize = bSize;
}