Java Code Examples for org.apache.parquet.hadoop.metadata.BlockMetaData#getColumns()

The following examples show how to use org.apache.parquet.hadoop.metadata.BlockMetaData#getColumns(). In the Parquet metadata model, a BlockMetaData describes one row group of a file, and getColumns() returns the ColumnChunkMetaData for each column chunk in that row group. The examples are taken from open-source projects; each one notes its source file, project, and license.
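Before the project examples, here is a minimal, self-contained sketch of the same pattern: read a file's footer, then walk each row group's column chunks via getColumns(). The class name and the command-line input path are hypothetical; it uses the same readFooter/NO_FILTER calls that appear in the examples below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class ListColumnChunks {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path(args[0]); // path to an existing Parquet file

    // Read only the footer; no data pages are loaded.
    ParquetMetadata footer =
        ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);

    // One BlockMetaData per row group.
    for (BlockMetaData block : footer.getBlocks()) {
      System.out.println("row group: " + block.getRowCount() + " rows");
      // One ColumnChunkMetaData per column chunk in the row group.
      for (ColumnChunkMetaData column : block.getColumns()) {
        System.out.println("  " + column.getPath().toDotString()
            + "  values=" + column.getValueCount()
            + "  compressedBytes=" + column.getTotalSize());
      }
    }
  }
}
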
Example 1
Source File: PredicateUtils.java    From presto with Apache License 2.0
private static boolean dictionaryPredicatesMatch(Predicate parquetPredicate, BlockMetaData blockMetadata, ParquetDataSource dataSource, Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<ColumnDescriptor> parquetTupleDomain)
{
    for (ColumnChunkMetaData columnMetaData : blockMetadata.getColumns()) {
        RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(columnMetaData.getPath().toArray()));
        if (descriptor != null) {
            if (isOnlyDictionaryEncodingPages(columnMetaData) && isColumnPredicate(descriptor, parquetTupleDomain)) {
                byte[] buffer = new byte[toIntExact(columnMetaData.getTotalSize())];
                dataSource.readFully(columnMetaData.getStartingPos(), buffer);
                //  Early abort, predicate already filters block so no more dictionaries need be read
                if (!parquetPredicate.matches(new DictionaryDescriptor(descriptor, readDictionaryPage(buffer, columnMetaData.getCodec())))) {
                    return false;
                }
            }
        }
    }
    return true;
}
 
Example 2
Source File: ParquetRecordReader.java    From parquet-mr with Apache License 2.0
private void checkDeltaByteArrayProblem(FileMetaData meta, Configuration conf, BlockMetaData block) {
  // splitting files?
  if (conf.getBoolean(ParquetInputFormat.SPLIT_FILES, true)) {
    // this is okay if not using DELTA_BYTE_ARRAY with the bug
    Set<Encoding> encodings = new HashSet<Encoding>();
    for (ColumnChunkMetaData column : block.getColumns()) {
      encodings.addAll(column.getEncodings());
    }
    for (Encoding encoding : encodings) {
      if (CorruptDeltaByteArrays.requiresSequentialReads(meta.getCreatedBy(), encoding)) {
        throw new ParquetDecodingException("Cannot read data due to " +
            "PARQUET-246: to read safely, set " + SPLIT_FILES + " to false");
      }
    }
  }
}
 
Example 3
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, ParquetConversions.converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visitEvaluator(expr, this);
}
 
Example 4
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visit(expr, this);
}
 
Example 5
Source File: CompressionConverter.java    From parquet-mr with Apache License 2.0
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta, MessageType schema,
                           String createdBy, CompressionCodecName codecName) throws IOException {
  int blockIndex = 0;
  PageReadStore store = reader.readNextRowGroup();
  while (store != null) {
    writer.startBlock(store.getRowCount());
    BlockMetaData blockMetaData = meta.getBlocks().get(blockIndex);
    List<ColumnChunkMetaData> columnsInOrder = blockMetaData.getColumns();
    Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(
      Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
      ColumnChunkMetaData chunk = columnsInOrder.get(i);
      ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
      ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
      writer.startColumn(columnDescriptor, crstore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
      processChunk(reader, writer, chunk, createdBy, codecName);
      writer.endColumn();
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
  }
}
 
Example 6
Source File: PrintFooter.java    From parquet-mr with Apache License 2.0
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++ blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
 
Example 7
Source File: ColumnSizeCommand.java    From parquet-mr with Apache License 2.0
public Map<String, Long> getColumnSizeInBytes(Path inputFile) throws IOException {
  Map<String, Long> colSizes = new HashMap<>();
  ParquetMetadata pmd = ParquetFileReader.readFooter(new Configuration(), inputFile, ParquetMetadataConverter.NO_FILTER);

  for (BlockMetaData block : pmd.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      String colName = column.getPath().toDotString();
      colSizes.put(colName, column.getTotalSize() + colSizes.getOrDefault(colName, 0L));
    }
  }

  return colSizes;
}
 
Example 8
Source File: BloomFilterReader.java    From parquet-mr with Apache License 2.0
public BloomFilterReader(ParquetFileReader fileReader, BlockMetaData block) {
  this.reader = fileReader;
  this.columns = new HashMap<>();
  for (ColumnChunkMetaData column : block.getColumns()) {
    columns.put(column.getPath(), column);
  }
}
 
Example 9
Source File: ParquetInputSplit.java    From parquet-mr with Apache License 2.0
private static long end(List<BlockMetaData> blocks, String requestedSchema) {
  MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
  long length = 0;

  for (BlockMetaData block : blocks) {
    List<ColumnChunkMetaData> columns = block.getColumns();
    for (ColumnChunkMetaData column : columns) {
      if (requested.containsPath(column.getPath().toArray())) {
        length += column.getTotalSize();
      }
    }
  }
  return length;
}
 
Example 10
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
public ParquetInputSplit getParquetInputSplit(FileStatus fileStatus, String requestedSchema, Map<String, String> readSupportMetadata) throws IOException {
  MessageType requested = MessageTypeParser.parseMessageType(requestedSchema);
  long length = 0;

  for (BlockMetaData block : this.getRowGroups()) {
    List<ColumnChunkMetaData> columns = block.getColumns();
    for (ColumnChunkMetaData column : columns) {
      if (requested.containsPath(column.getPath().toArray())) {
        length += column.getTotalSize();
      }
    }
  }

  BlockMetaData lastRowGroup = this.getRowGroups().get(this.getRowGroupCount() - 1);
  long end = lastRowGroup.getStartingPos() + lastRowGroup.getTotalByteSize();

  long[] rowGroupOffsets = new long[this.getRowGroupCount()];
  for (int i = 0; i < rowGroupOffsets.length; i++) {
    rowGroupOffsets[i] = this.getRowGroups().get(i).getStartingPos();
  }

  return new ParquetInputSplit(
          fileStatus.getPath(),
          hdfsBlock.getOffset(),
          end,
          length,
          hdfsBlock.getHosts(),
          rowGroupOffsets
  );
}
 
Example 11
Source File: ColumnIndexStoreImpl.java    From parquet-mr with Apache License 2.0
private ColumnIndexStoreImpl(ParquetFileReader reader, BlockMetaData block, Set<ColumnPath> paths) {
  // TODO[GS]: Offset index for every paths will be required; pre-read the consecutive ones at once?
  // TODO[GS]: Pre-read column index based on filter?
  this.reader = reader;
  Map<ColumnPath, IndexStore> store = new HashMap<>();
  for (ColumnChunkMetaData column : block.getColumns()) {
    ColumnPath path = column.getPath();
    if (paths.contains(path)) {
      store.put(path, new IndexStoreImpl(column));
    }
  }
  this.store = store;
}
 
Example 12
Source File: DictionaryPageReader.java    From parquet-mr with Apache License 2.0
/**
 * Instantiate a new DictionaryPageReader.
 *
 * @param reader The target ParquetFileReader
 * @param block The target BlockMetaData
 *
 * @throws NullPointerException if {@code reader} or {@code block} is
 *           {@code null}
 */
DictionaryPageReader(ParquetFileReader reader, BlockMetaData block) {
  this.reader = Objects.requireNonNull(reader);
  this.columns = new HashMap<>();
  this.dictionaryPageCache = new ConcurrentHashMap<>();

  for (ColumnChunkMetaData column : block.getColumns()) {
    columns.put(column.getPath().toDotString(), column);
  }
}
 
Example 13
Source File: ParquetMetadataCommand.java    From parquet-mr with Apache License 2.0
private void printRowGroup(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
  long start = rowGroup.getStartingPos();
  long rowCount = rowGroup.getRowCount();
  long compressedSize = rowGroup.getCompressedSize();
  long uncompressedSize = rowGroup.getTotalByteSize();
  String filePath = rowGroup.getPath();

  console.info(String.format("\nRow group %d:  count: %d  %s records  start: %d  total: %s%s\n%s",
      index, rowCount,
      humanReadable(((float) compressedSize) / rowCount),
      start, humanReadable(compressedSize),
      filePath != null ? " path: " + filePath : "",
      new TextStringBuilder(80).appendPadding(80, '-')));

  int size = maxSize(Iterables.transform(rowGroup.getColumns(),
      new Function<ColumnChunkMetaData, String>() {
        @Override
        public String apply(@Nullable ColumnChunkMetaData input) {
          return input == null ? "" : input.getPath().toDotString();
        }
      }));

  console.info(String.format("%-" + size + "s  %-9s %-9s %-9s %-10s %-7s %s",
      "", "type", "encodings", "count", "avg size", "nulls", "min / max"));
  for (ColumnChunkMetaData column : rowGroup.getColumns()) {
    printColumnChunk(console, size, column, schema);
  }
}
 
Example 14
Source File: LocalDictionariesReader.java    From dremio-oss with Apache License 2.0
/**
 * Return dictionary per row group for all binary columns in given parquet file.
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @return pair of dictionaries found for binary fields and list of binary fields which are not dictionary encoded.
 * @throws IOException
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CompressionCodecFactory codecFactory) throws IOException {
  // Passing the max footer length is not required in this case as the parquet reader would already have failed.
  final ParquetMetadata parquetMetadata = SingletonParquetFooterCache.readFooter(fs, filePath, ParquetMetadataConverter.NO_FILTER,
    ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  if (parquetMetadata.getBlocks().size() > 1) {
    throw new IOException(
      format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
        parquetMetadata.getBlocks().size(), filePath));
  }
  final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
  final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();

  for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
    columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
  }

  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
  final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
  try(final FSInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
      if (isBinaryType(columnChunkMetaData.getType())) {
        final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
        // if first page is dictionary encoded then load dictionary, otherwise skip this column.
        final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
        if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
          dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
        } else {
          columnsToSkip.add(column);
        }
      }
    }
  }
  return new ImmutablePair<>(dictionaries, columnsToSkip);
}
 
Example 15
Source File: ParquetReader.java    From presto with Apache License 2.0
private ColumnChunkMetaData getColumnChunkMetaData(BlockMetaData blockMetaData, ColumnDescriptor columnDescriptor)
        throws IOException
{
    for (ColumnChunkMetaData metadata : blockMetaData.getColumns()) {
        if (metadata.getPath().equals(ColumnPath.get(columnDescriptor.getPath()))) {
            return metadata;
        }
    }
    throw new ParquetCorruptionException("Metadata is missing for column: %s", columnDescriptor);
}
 
Example 16
Source File: TestParquetWriter.java    From parquet-mr with Apache License 2.0
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  Path root = new Path("target/tests/TestParquetWriter/");
  enforceEmptyDir(conf, root);
  MessageType schema = parseMessageType(
      "message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(3) flba_field; "
      + "required int96 int96_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  Map<String, Encoding> expected = new HashMap<String, Encoding>();
  expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
  expected.put("1000-" + PARQUET_1_0, PLAIN);
  expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
  expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
  for (int modulo : asList(10, 1000)) {
    for (WriterVersion version : WriterVersion.values()) {
      Path file = new Path(root, version.name() + "_" + modulo);
      ParquetWriter<Group> writer = new ParquetWriter<Group>(
          file,
          new GroupWriteSupport(),
          UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
      for (int i = 0; i < 1000; i++) {
        writer.write(
            f.newGroup()
            .append("binary_field", "test" + (i % modulo))
            .append("int32_field", 32)
            .append("int64_field", 64l)
            .append("boolean_field", true)
            .append("float_field", 1.0f)
            .append("double_field", 2.0d)
            .append("flba_field", "foo")
            .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
      }
      writer.close();
      ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
      for (int i = 0; i < 1000; i++) {
        Group group = reader.read();
        assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
        assertEquals(32, group.getInteger("int32_field", 0));
        assertEquals(64l, group.getLong("int64_field", 0));
        assertEquals(true, group.getBoolean("boolean_field", 0));
        assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
        assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
        assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
        assertEquals(Binary.fromConstantByteArray(new byte[12]),
            group.getInt96("int96_field",0));
      }
      reader.close();
      ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
      for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
          if (column.getPath().toDotString().equals("binary_field")) {
            String key = modulo + "-" + version;
            Encoding expectedEncoding = expected.get(key);
            assertTrue(
                key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                column.getEncodings().contains(expectedEncoding));
          }
        }
      }
      assertEquals("Object model property should be example",
          "example", footer.getFileMetaData().getKeyValueMetaData()
              .get(ParquetWriter.OBJECT_MODEL_NAME_PROP));
    }
  }
}
 
Example 17
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * Reads all the columns requested from the row group at the current file position. It may skip specific pages based
 * on the column indexes according to the actual filter. As the rows are not aligned among the pages of the different
 * columns row synchronization might be required. See the documentation of the class SynchronizingColumnReader for
 * details.
 *
 * @return the PageReadStore which can provide PageReaders for each column
 * @throws IOException
 *           if any I/O error occurs while reading
 */
public PageReadStore readNextFilteredRowGroup() throws IOException {
  if (currentBlock == blocks.size()) {
    return null;
  }
  if (!options.useColumnIndexFilter()) {
    return readNextRowGroup();
  }
  BlockMetaData block = blocks.get(currentBlock);
  if (block.getRowCount() == 0) {
    throw new RuntimeException("Illegal row group of 0 rows");
  }
  ColumnIndexStore ciStore = getColumnIndexStore(currentBlock);
  RowRanges rowRanges = getRowRanges(currentBlock);
  long rowCount = rowRanges.rowCount();
  if (rowCount == 0) {
    // There are no matching rows -> skipping this row-group
    advanceToNextBlock();
    return readNextFilteredRowGroup();
  }
  if (rowCount == block.getRowCount()) {
    // All rows are matching -> fall back to the non-filtering path
    return readNextRowGroup();
  }

  this.currentRowGroup = new ColumnChunkPageReadStore(rowRanges);
  // prepare the list of consecutive parts to read them in one scan
  ChunkListBuilder builder = new ChunkListBuilder();
  List<ConsecutivePartList> allParts = new ArrayList<ConsecutivePartList>();
  ConsecutivePartList currentParts = null;
  for (ColumnChunkMetaData mc : block.getColumns()) {
    ColumnPath pathKey = mc.getPath();
    ColumnDescriptor columnDescriptor = paths.get(pathKey);
    if (columnDescriptor != null) {
      OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath());

      OffsetIndex filteredOffsetIndex = filterOffsetIndex(offsetIndex, rowRanges,
          block.getRowCount());
      for (OffsetRange range : calculateOffsetRanges(filteredOffsetIndex, mc, offsetIndex.getOffset(0))) {
        BenchmarkCounter.incrementTotalBytes(range.getLength());
        long startingPos = range.getOffset();
        // first part or not consecutive => new list
        if (currentParts == null || currentParts.endPos() != startingPos) {
          currentParts = new ConsecutivePartList(startingPos);
          allParts.add(currentParts);
        }
        ChunkDescriptor chunkDescriptor = new ChunkDescriptor(columnDescriptor, mc, startingPos,
            (int) range.getLength());
        currentParts.addChunk(chunkDescriptor);
        builder.setOffsetIndex(chunkDescriptor, filteredOffsetIndex);
      }
    }
  }
  // actually read all the chunks
  for (ConsecutivePartList consecutiveChunks : allParts) {
    consecutiveChunks.readAll(f, builder);
  }
  for (Chunk chunk : builder.build()) {
    currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
  }

  // avoid re-reading bytes; the dictionary reader is used after this call
  if (nextDictionaryReader != null) {
    nextDictionaryReader.setRowGroup(currentRowGroup);
  }

  advanceToNextBlock();

  return currentRowGroup;
}
 
Example 18
Source File: Metadata.java    From dremio-oss with Apache License 2.0
private ParquetFileMetadata getParquetFileMetadata(FileAttributes file, AtomicInteger currentNumSplits, long maxSplits) throws IOException {
  final ParquetMetadata metadata =
    SingletonParquetFooterCache.readFooter(fs, file, ParquetMetadataConverter.NO_FILTER, maxFooterLength);
  final int numSplits = currentNumSplits.addAndGet(metadata.getBlocks().size());
  if (numSplits > maxSplits) {
    throw new TooManySplitsException(
      String.format("Too many splits encountered when processing parquet metadata at file %s, maximum is %d but encountered %d splits thus far.",
        file.getPath(), maxSplits, numSplits));
  }

  final MessageType schema = metadata.getFileMetaData().getSchema();

  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  schema.getPaths();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }

  List<RowGroupMetadata> rowGroupMetadataList = Lists.newArrayList();

  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if(logger.isDebugEnabled()){
    logger.debug(containsCorruptDates.toString());
  }
  final Map<ColumnTypeMetadata.Key, ColumnTypeMetadata> columnTypeInfo = Maps.newHashMap();
  int rowGroupIdx = 0;
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata columnMetadata;

      // statistics might just have the non-null counts with no min/max they might be
      // initialized to zero instead of null.
      // check statistics actually have non null values (or) column has all nulls.
      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty()
        && (col.getStatistics().hasNonNullValue()) || col.getStatistics().getNumNulls() ==
        rowGroup.getRowCount());

      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata columnTypeMetadata =
          new ColumnTypeMetadata(columnName, col.getType(), originalTypeMap.get(columnSchemaName));

      columnTypeInfo.put(new ColumnTypeMetadata.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only if minVal==maxVal. Also, we then store only maxVal
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null &&
            stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            mxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) mxValue);
          }
        }
        columnMetadata =
            new ColumnMetadata(columnTypeMetadata.name, mxValue, stats.getNumNulls());
      } else {
        // log it under trace to avoid lot of log entries.
        logger.trace("Stats are not available for column {}, rowGroupIdx {}, file {}",
            columnSchemaName, rowGroupIdx, file.getPath());
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name, null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }

    RowGroupMetadata rowGroupMeta =
        new RowGroupMetadata(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
            getHostAffinity(fs, file, rowGroup.getStartingPos(), length), columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
    rowGroupIdx++;
  }

  return new ParquetFileMetadata(file, file.size(), rowGroupMetadataList, columnTypeInfo);
}
 
Example 19
Source File: TestParquetWriterNewPage.java    From parquet-mr with Apache License 2.0
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  Path root = new Path("target/tests/TestParquetWriter/");
  FileSystem fs = root.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = parseMessageType(
      "message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(3) flba_field; "
      + "required int96 int96_field; "
      + "optional binary null_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  Map<String, Encoding> expected = new HashMap<String, Encoding>();
  expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
  expected.put("1000-" + PARQUET_1_0, PLAIN);
  expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
  expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
  for (int modulo : asList(10, 1000)) {
    for (WriterVersion version : WriterVersion.values()) {
      Path file = new Path(root, version.name() + "_" + modulo);
      ParquetWriter<Group> writer = new ParquetWriter<Group>(
          file,
          new GroupWriteSupport(),
          UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
      for (int i = 0; i < 1000; i++) {
        writer.write(
            f.newGroup()
            .append("binary_field", "test" + (i % modulo))
            .append("int32_field", 32)
            .append("int64_field", 64l)
            .append("boolean_field", true)
            .append("float_field", 1.0f)
            .append("double_field", 2.0d)
            .append("flba_field", "foo")
            .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
      }
      writer.close();

      ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
      for (int i = 0; i < 1000; i++) {
        Group group = reader.read();
        assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
        assertEquals(32, group.getInteger("int32_field", 0));
        assertEquals(64l, group.getLong("int64_field", 0));
        assertEquals(true, group.getBoolean("boolean_field", 0));
        assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
        assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
        assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
        assertEquals(Binary.fromConstantByteArray(new byte[12]), group.getInt96("int96_field",
            0));
        assertEquals(0, group.getFieldRepetitionCount("null_field"));
      }
      reader.close();
      ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
      for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
          if (column.getPath().toDotString().equals("binary_field")) {
            String key = modulo + "-" + version;
            Encoding expectedEncoding = expected.get(key);
            assertTrue(
                key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                column.getEncodings().contains(expectedEncoding));
          }
        }
      }
    }
  }
}
 
Example 20
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) {
    //rowGroup.total_byte_size = ;
    List<ColumnChunkMetaData> columns = block.getColumns();
    List<ColumnChunk> parquetColumns = new ArrayList<ColumnChunk>();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset()); // verify this is the right offset
      columnChunk.file_path = block.getPath(); // they are in the same file for now
      columnChunk.meta_data = new ColumnMetaData(
          getType(columnMetaData.getType()),
          toFormatEncodings(columnMetaData.getEncodings()),
          Arrays.asList(columnMetaData.getPath().toArray()),
          toFormatCodec(columnMetaData.getCodec()),
          columnMetaData.getValueCount(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getTotalSize(),
          columnMetaData.getFirstDataPageOffset());
      if (columnMetaData.getEncodingStats() != null && columnMetaData.getEncodingStats().hasDictionaryPages()) {
        columnChunk.meta_data.setDictionary_page_offset(columnMetaData.getDictionaryPageOffset());
      }
      columnChunk.meta_data.setBloom_filter_offset(columnMetaData.getBloomFilterOffset());
      if (!columnMetaData.getStatistics().isEmpty()) {
        columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics(), this.statisticsTruncateLength));
      }
      if (columnMetaData.getEncodingStats() != null) {
        columnChunk.meta_data.setEncoding_stats(convertEncodingStats(columnMetaData.getEncodingStats()));
      }
//      columnChunk.meta_data.index_page_offset = ;
//      columnChunk.meta_data.key_value_metadata = ; // nothing yet

      IndexReference columnIndexRef = columnMetaData.getColumnIndexReference();
      if (columnIndexRef != null) {
        columnChunk.setColumn_index_offset(columnIndexRef.getOffset());
        columnChunk.setColumn_index_length(columnIndexRef.getLength());
      }
      IndexReference offsetIndexRef = columnMetaData.getOffsetIndexReference();
      if (offsetIndexRef != null) {
        columnChunk.setOffset_index_offset(offsetIndexRef.getOffset());
        columnChunk.setOffset_index_length(offsetIndexRef.getLength());
      }

      parquetColumns.add(columnChunk);
    }
    RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
    rowGroups.add(rowGroup);
}