org.apache.parquet.format.converter.ParquetMetadataConverter Java Examples

The following examples show how to use org.apache.parquet.format.converter.ParquetMetadataConverter. Each example is taken from an open-source project; the source file, project, and license are noted above the code.
Example #1
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
public ParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException {
  this.converter = new ParquetMetadataConverter(options);
  this.file = file;
  this.f = file.newStream();
  this.options = options;
  try {
    this.footer = readFooter(file, options, f, converter);
  } catch (Exception e) {
    // If reading the footer throws an exception in the constructor, the newly opened stream
    // must be closed here; otherwise there is no way to close it from outside.
    f.close();
    throw e;
  }
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
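
Most callers do not invoke this constructor directly; a minimal usage sketch (not from the original source) goes through the public open factory, building the InputFile and ParquetReadOptions from a Hadoop Configuration. The file path below is hypothetical.

Configuration conf = new Configuration();
InputFile inputFile = HadoopInputFile.fromPath(new Path("/tmp/data.parquet"), conf);
ParquetReadOptions options = HadoopReadOptions.builder(conf).build();
try (ParquetFileReader reader = ParquetFileReader.open(inputFile, options)) {
  // The footer is read eagerly, just as in the constructor above
  MessageType schema = reader.getFileMetaData().getSchema();
  System.out.println(schema);
}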
 
Example #2
Source File: ParquetUtils.java    From incubator-pinot with Apache License 2.0
/**
 * Returns the schema for the given Parquet file path.
 */
public static Schema getParquetSchema(Path path)
    throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(getConfiguration(), path, ParquetMetadataConverter.NO_FILTER);
  Map<String, String> metaData = footer.getFileMetaData().getKeyValueMetaData();
  String schemaString = metaData.get("parquet.avro.schema");
  if (schemaString == null) {
    // Try the older property
    schemaString = metaData.get("avro.schema");
  }
  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter().convert(footer.getFileMetaData().getSchema());
  }
}
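
A hedged usage sketch for the helper above: it resolves the Avro schema (from the embedded schema string if present, otherwise by converting the Parquet schema) and lists the top-level fields. The path is hypothetical.

Schema avroSchema = ParquetUtils.getParquetSchema(new Path("/data/events.parquet"));
for (Schema.Field field : avroSchema.getFields()) {
  System.out.println(field.name() + ": " + field.schema().getType());
}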
 
Example #3
Source File: SchemaCommand.java    From parquet-mr with Apache License 2.0
private String getParquetSchema(String source) throws IOException {
  Formats.Format format;
  try (SeekableInput in = openSeekable(source)) {
    format = Formats.detectFormat((InputStream) in);
    in.seek(0);

    switch (format) {
      case PARQUET:
        return new ParquetFileReader(
            getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER)
            .getFileMetaData().getSchema().toString();
      default:
        throw new IllegalArgumentException(String.format(
            "Could not get a Parquet schema for format %s: %s", format, source));
    }
  }
}
 
Example #4
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
    metadata.getFileMetaData(),
    super.fsPath,
    metadata.getBlocks(),
    metadata.getFileMetaData().getSchema().getColumns());

  PageStatsValidator validator = new PageStatsValidator();

  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
 
Example #5
Source File: CompressionConverter.java    From parquet-mr with Apache License 2.0
private Statistics convertStatistics(String createdBy, PrimitiveType type, org.apache.parquet.format.Statistics pageStatistics,
                                     ColumnIndex columnIndex, int pageIndex, ParquetMetadataConverter converter) throws IOException {
  if (columnIndex != null) {
    if (columnIndex.getNullPages() == null) {
      throw new IOException("columnIndex has null variable 'nullPages' which indicates corrupted data for type: " +  type.getName());
    }
    if (pageIndex > columnIndex.getNullPages().size()) {
      throw new IOException("There are more pages " + pageIndex + " found in the column than in the columnIndex " + columnIndex.getNullPages().size());
    }
    org.apache.parquet.column.statistics.Statistics.Builder statsBuilder = org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type);
    statsBuilder.withNumNulls(columnIndex.getNullCounts().get(pageIndex));

    if (!columnIndex.getNullPages().get(pageIndex)) {
      statsBuilder.withMin(columnIndex.getMinValues().get(pageIndex).array().clone());
      statsBuilder.withMax(columnIndex.getMaxValues().get(pageIndex).array().clone());
    }
    return statsBuilder.build();
  } else if (pageStatistics != null) {
    return converter.fromParquetStatistics(createdBy, pageStatistics, type);
  } else {
    return null;
  }
}
 
Example #6
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata
 * Requires that the schemas be compatible, and the extraMetadata be exactly equal.
 * @param files a list of files to merge metadata from
 * @param conf a configuration
 * @return merged parquet metadata for the files
 * @throws IOException if there is an error while writing
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files,  Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");

  GlobalMetaData globalMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();

  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }

  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(), blocks);
}
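
The method is deprecated along with metadata files themselves, but a hedged usage sketch looks like this; the _metadata paths are hypothetical.

List<Path> metadataFiles = Arrays.asList(
    new Path("/data/part-1/_metadata"),
    new Path("/data/part-2/_metadata"));
ParquetMetadata merged = ParquetFileWriter.mergeMetadataFiles(metadataFiles, new Configuration());
System.out.println("Merged row groups: " + merged.getBlocks().size());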
 
Example #7
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private static void serializeBloomFilters(
  List<Map<String, BloomFilter>> bloomFilters,
  List<BlockMetaData> blocks,
  PositionOutputStream out) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
    if (blockBloomFilters.isEmpty()) continue;
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
      if (bloomFilter == null) {
        continue;
      }

      long offset = out.getPos();
      column.setBloomFilterOffset(offset);
      Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out);
      bloomFilter.writeTo(out);
    }
  }
}
 
Example #8
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private static void serializeOffsetIndexes(
    List<List<OffsetIndex>> offsetIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: offset indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<OffsetIndex> blockOffsetIndexes = offsetIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex);
      if (offsetIndex == null) {
        continue;
      }
      ColumnChunkMetaData column = columns.get(cIndex);
      long offset = out.getPos();
      Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), out);
      column.setOffsetIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
 
Example #9
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private static void serializeColumnIndexes(
    List<List<ColumnIndex>> columnIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: column indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<ColumnIndex> blockColumnIndexes = columnIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      org.apache.parquet.format.ColumnIndex columnIndex = ParquetMetadataConverter
          .toParquetColumnIndex(column.getPrimitiveType(), blockColumnIndexes.get(cIndex));
      if (columnIndex == null) {
        continue;
      }
      long offset = out.getPos();
      Util.writeColumnIndex(columnIndex, out);
      column.setColumnIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
 
Example #10
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * FOR TESTING ONLY. This supports testing block padding behavior on the local FS.
 *
 * @param configuration Hadoop configuration
 * @param schema the schema of the data
 * @param file the file to write to
 * @param rowAndBlockSize the row group size
 * @param maxPaddingSize the maximum padding
 * @throws IOException if the file can not be created
 */
ParquetFileWriter(Configuration configuration, MessageType schema,
                  Path file, long rowAndBlockSize, int maxPaddingSize)
    throws IOException {
  FileSystem fs = file.getFileSystem(configuration);
  this.schema = schema;
  this.alignment = PaddingAlignment.get(
      rowAndBlockSize, rowAndBlockSize, maxPaddingSize);
  this.out = HadoopStreams.wrap(
      fs.create(file, true, 8192, fs.getDefaultReplication(file), rowAndBlockSize));
  this.encodingStatsBuilder = new EncodingStats.Builder();
  // no truncation is needed for testing
  this.columnIndexTruncateLength = Integer.MAX_VALUE;
  this.pageWriteChecksumEnabled = ParquetOutputFormat.getPageWriteChecksumEnabled(configuration);
  this.crc = pageWriteChecksumEnabled ? new CRC32() : null;
  this.metadataConverter = new ParquetMetadataConverter(ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH);
}
 
Example #11
Source File: ParquetReaderUtility.java    From Bats with Apache License 2.0
/**
 * Map full schema paths in format `a`.`b`.`c` to respective SchemaElement objects.
 *
 * @param footer Parquet file metadata
 * @return       schema full path to SchemaElement map
 */
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  Map<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);

  Iterator<SchemaElement> iter = fileMetaData.getSchema().iterator();

  // The first element in the collection is the default `root` element. We skip it so that keys stay in the `a` form instead of `root`.`a`,
  // which avoids having to strip the prefix again when comparing with the SchemaPath string representation.
  if (iter.hasNext()) {
    iter.next();
  }
  while (iter.hasNext()) {
    addSchemaElementMapping(iter, new StringBuilder(), schemaElements);
  }
  return schemaElements;
}
 
Example #12
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param footer a {@link ParquetMetadata} footer already read from the file
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, ParquetMetadata footer) throws IOException {
  this.converter = new ParquetMetadataConverter(conf);
  this.file = HadoopInputFile.fromPath(file, conf);
  this.f = this.file.newStream();
  this.options = HadoopReadOptions.builder(conf).build();
  this.footer = footer;
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
 
Example #13
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * @param configuration the Hadoop conf
 * @param fileMetaData fileMetaData for parquet file
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(
    Configuration configuration, FileMetaData fileMetaData,
    Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.converter = new ParquetMetadataConverter(configuration);
  this.file = HadoopInputFile.fromPath(filePath, configuration);
  this.fileMetaData = fileMetaData;
  this.f = file.newStream();
  this.options = HadoopReadOptions.builder(configuration).build();
  this.blocks = filterRowGroups(blocks);
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
 
Example #14
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0
@Test
public void testPruneNestedParentColumn() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove the parent column; all of its children will be removed as well.
  String cargs[] = {inputFile, outputFile, "Links"};
  executeCommandLine(cargs);

  // Verify the schema is unchanged for the columns that were not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 3);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Name");
  assertEquals(fields.get(2).getName(), "Gender");

  // Verify the data is unchanged for the columns that were not pruned
  List<String> prunePaths = Arrays.asList("Links");
  validateColumns(inputFile, prunePaths);
}
 
Example #15
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0
@Test
public void testPruneMultiColumns() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove columns
  String cargs[] = {inputFile, outputFile, "Name", "Gender"};
  executeCommandLine(cargs);

  // Verify the schema is unchanged for the columns that were not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 2);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Links");
  List<Type> subFields = fields.get(1).asGroupType().getFields();
  assertEquals(subFields.size(), 2);
  assertEquals(subFields.get(0).getName(), "Backward");
  assertEquals(subFields.get(1).getName(), "Forward");

  // Verify the data is unchanged for the columns that were not pruned
  List<String> prunePaths = Arrays.asList("Name", "Gender");
  validateColumns(inputFile, prunePaths);
}
 
Example #16
Source File: PrimitiveColumnWriter.java    From presto with Apache License 2.0
private ColumnMetaData getColumnMetaData()
{
    checkState(getDataStreamsCalled);

    ColumnMetaData columnMetaData = new ColumnMetaData(
            ParquetTypeConverter.getType(columnDescriptor.getPrimitiveType().getPrimitiveTypeName()),
            encodings.stream().map(parquetMetadataConverter::getEncoding).collect(toImmutableList()),
            ImmutableList.copyOf(columnDescriptor.getPath()),
            compressionCodec.getParquetCompressionCodec(),
            totalRows,
            totalUnCompressedSize,
            totalCompressedSize,
            -1);
    columnMetaData.setStatistics(ParquetMetadataConverter.toParquetStatistics(columnStatistics));
    return columnMetaData;
}
 
Example #17
Source File: TestParquetReader.java    From dremio-oss with Apache License 2.0
@Test
public void testArrowSchemaOldInFooter() throws Exception {
  URL badparquet = getClass().getResource("/types.parquet");

  Path filePathBad = Path.of(badparquet.toURI());
  ParquetMetadata parquetMetadataBad =
    SingletonParquetFooterCache.readFooter(localFs, filePathBad, ParquetMetadataConverter.NO_FILTER,
      ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  Map<String, String> metadataBad = parquetMetadataBad.getFileMetaData().getKeyValueMetaData();

  // should have DREMIO_ARROW_SCHEMA field, but no DREMIO_ARROW_SCHEMA_2_1
  assertTrue(metadataBad.containsKey(DREMIO_ARROW_SCHEMA));
  assertFalse(metadataBad.containsKey(DREMIO_ARROW_SCHEMA_2_1));

  try {
    DremioArrowSchema.fromMetaData(metadataBad);
    fail("Should not be able to process arrow schema");
  } catch (Exception e) {
    // ok
  }
}
 
Example #18
Source File: TestParquetReader.java    From dremio-oss with Apache License 2.0
@Test
public void testArrowSchema210InFooter() throws Exception {
  URL parquet210 = getClass().getResource("/dremio-region-210.parquet");
  Path filePath210 = Path.of(parquet210.toURI());
  ParquetMetadata parquetMetadata210 =
    SingletonParquetFooterCache.readFooter(localFs, filePath210, ParquetMetadataConverter.NO_FILTER,
      ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  Map<String, String> metadata210 = parquetMetadata210.getFileMetaData().getKeyValueMetaData();

  // should not have DREMIO_ARROW_SCHEMA field, but should have DREMIO_ARROW_SCHEMA_2_1
  assertFalse(metadata210.containsKey(DREMIO_ARROW_SCHEMA));
  assertTrue(metadata210.containsKey(DREMIO_ARROW_SCHEMA_2_1));

  Schema schema210 = DremioArrowSchema.fromMetaData(metadata210);

  assertNotNull(schema210);
}
 
Example #19
Source File: ParquetResolverTest.java    From pxf with Apache License 2.0
@SuppressWarnings("deprecation")
private List<Group> readParquetFile(String file, long expectedSize, MessageType schema) throws IOException {
    List<Group> result = new ArrayList<>();
    String parquetFile = Objects.requireNonNull(getClass().getClassLoader().getResource("parquet/" + file)).getPath();
    Path path = new Path(parquetFile);

    ParquetFileReader fileReader = new ParquetFileReader(new Configuration(), path, ParquetMetadataConverter.NO_FILTER);
    PageReadStore rowGroup;
    while ((rowGroup = fileReader.readNextRowGroup()) != null) {
        MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
        RecordReader<Group> recordReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));
        long rowCount = rowGroup.getRowCount();
        for (long i = 0; i < rowCount; i++) {
            result.add(recordReader.read());
        }
    }
    fileReader.close();
    assertEquals(expectedSize, result.size());
    return result;
}
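
A hedged follow-up sketch: once the Groups are materialized, individual values can be read through the Group API. The field name "name", the file, and the expected row count are assumptions for illustration.

List<Group> rows = readParquetFile("sample.parquet", 10, schema);
for (Group row : rows) {
    // getString(field, index) returns the index-th value of the named field
    System.out.println(row.getString("name", 0));
}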
 
Example #20
Source File: TestParquetReader.java    From dremio-oss with Apache License 2.0
@Test
public void testArrowSchema205InFooter() throws Exception {
  URL parquet205 = getClass().getResource("/dremio-region-205.parquet");
  Path filePath = Path.of(parquet205.toURI());
  ParquetMetadata parquetMetadata =
    SingletonParquetFooterCache.readFooter(localFs, filePath, ParquetMetadataConverter.NO_FILTER,
      ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  Map<String, String> metadata = parquetMetadata.getFileMetaData().getKeyValueMetaData();

  // should have DREMIO_ARROW_SCHEMA field, but no DREMIO_ARROW_SCHEMA_2_1
  assertTrue(metadata.containsKey(DREMIO_ARROW_SCHEMA));
  assertFalse(metadata.containsKey(DREMIO_ARROW_SCHEMA_2_1));

  Schema schema = DremioArrowSchema.fromMetaData(metadata);

  assertNotNull(schema);
}
 
Example #21
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private static void serializeFooter(ParquetMetadata footer, PositionOutputStream out) throws IOException {
  long footerIndex = out.getPos();
  ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter();
  org.apache.parquet.format.FileMetaData parquetMetadata = metadataConverter.toParquetMetadata(CURRENT_VERSION, footer);
  writeFileMetaData(parquetMetadata, out);
  LOG.debug("{}: footer length = {}" , out.getPos(), (out.getPos() - footerIndex));
  BytesUtils.writeIntLittleEndian(out, (int) (out.getPos() - footerIndex));
  out.write(MAGIC);
}
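
The method writes the Thrift-encoded footer, then a 4-byte little-endian footer length, then the magic bytes. A hedged sketch of the inverse operation, locating the footer when reading a (hypothetical) file with plain NIO:

try (SeekableByteChannel ch = Files.newByteChannel(Paths.get("/tmp/data.parquet"))) {
  ByteBuffer trailer = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN);
  ch.position(ch.size() - 8);
  ch.read(trailer);
  trailer.flip();
  int footerLength = trailer.getInt();  // the little-endian length written above
  byte[] magic = new byte[4];           // should be the "PAR1" magic bytes
  trailer.get(magic);
  long footerStart = ch.size() - 8 - footerLength;
  System.out.println("Footer starts at byte " + footerStart);
}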
 
Example #22
Source File: ParquetFileAccessor.java    From pxf with Apache License 2.0
/**
 * Reads the original schema from the parquet file.
 *
 * @param parquetFile the path to the parquet file
 * @param fileSplit   the file split we are accessing
 * @return the original schema from the parquet file
 * @throws IOException when there's an IOException while reading the schema
 */
private MessageType getSchema(Path parquetFile, FileSplit fileSplit) throws IOException {

    final long then = System.nanoTime();
    ParquetMetadataConverter.MetadataFilter filter = ParquetMetadataConverter.range(
            fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength());
    ParquetReadOptions parquetReadOptions = HadoopReadOptions
            .builder(configuration)
            .withMetadataFilter(filter)
            .build();
    HadoopInputFile inputFile = HadoopInputFile.fromPath(parquetFile, configuration);
    try (ParquetFileReader parquetFileReader =
                 ParquetFileReader.open(inputFile, parquetReadOptions)) {
        FileMetaData metadata = parquetFileReader.getFileMetaData();
        if (LOG.isDebugEnabled()) {
            LOG.debug("{}-{}: Reading file {} with {} records in {} RowGroups",
                    context.getTransactionId(), context.getSegmentId(),
                    parquetFile.getName(), parquetFileReader.getRecordCount(),
                    parquetFileReader.getRowGroups().size());
        }
        final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - then);
        LOG.debug("{}-{}: Read schema in {} ms", context.getTransactionId(),
                context.getSegmentId(), millis);
        return metadata.getSchema();
    } catch (Exception e) {
        throw new IOException(e);
    }
}
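
The range filter above restricts footer parsing to the row groups overlapping the split. A hedged alternative sketch for when the complete footer is wanted instead:

ParquetReadOptions fullFooterOptions = HadoopReadOptions
        .builder(configuration)
        .withMetadataFilter(ParquetMetadataConverter.NO_FILTER)
        .build();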
 
Example #23
Source File: ColumnSizeCommand.java    From parquet-mr with Apache License 2.0
public Map<String, Long> getColumnSizeInBytes(Path inputFile) throws IOException {
  Map<String, Long> colSizes = new HashMap<>();
  ParquetMetadata pmd = ParquetFileReader.readFooter(new Configuration(), inputFile, ParquetMetadataConverter.NO_FILTER);

  for (BlockMetaData block : pmd.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      String colName = column.getPath().toDotString();
      colSizes.put(colName, column.getTotalSize() + colSizes.getOrDefault(colName, 0L));
    }
  }

  return colSizes;
}
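
A hedged usage sketch for the helper above, summing the per-column totals into a whole-file figure; the command instance and path are hypothetical.

Map<String, Long> colSizes = columnSizeCommand.getColumnSizeInBytes(new Path("/data/file.parquet"));
colSizes.forEach((col, size) -> System.out.println(col + ": " + size + " bytes"));
long totalBytes = colSizes.values().stream().mapToLong(Long::longValue).sum();
System.out.println("Total column chunk bytes: " + totalBytes);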
 
Example #24
Source File: ParquetMetadataCommand.java    From parquet-mr with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1,
      "Cannot process multiple Parquet files.");

  String source = targets.get(0);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER);

  console.info("\nFile path:  {}", source);
  console.info("Created by: {}", footer.getFileMetaData().getCreatedBy());

  Map<String, String> kv = footer.getFileMetaData().getKeyValueMetaData();
  if (kv != null && !kv.isEmpty()) {
    console.info("Properties:");
    String format = "  %" + maxSize(kv.keySet()) + "s: %s";
    for (Map.Entry<String, String> entry : kv.entrySet()) {
      console.info(String.format(format, entry.getKey(), entry.getValue()));
    }
  } else {
    console.info("Properties: (none)");
  }

  MessageType schema = footer.getFileMetaData().getSchema();
  console.info("Schema:\n{}", schema);

  List<BlockMetaData> rowGroups = footer.getBlocks();
  for (int index = 0, n = rowGroups.size(); index < n; index += 1) {
    printRowGroup(console, index, rowGroups.get(index), schema);
  }

  console.info("");

  return 0;
}
 
Example #25
Source File: CheckParquet251Command.java    From parquet-mr with Apache License 2.0
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), path, ParquetMetadataConverter.NO_FILTER);

  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(
        meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);

    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(
        meta.getSchema().getColumns(),
        new Predicate<ColumnDescriptor>() {
          @Override
          public boolean apply(@Nullable ColumnDescriptor input) {
            return input != null && input.getType() == BINARY;
          }
        }));

    // now check to see if the data is actually corrupt
    ParquetFileReader reader = new ParquetFileReader(getConf(),
        fakeMeta, path, footer.getBlocks(), columns);

    try {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null;
           pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }

  return null;
}
 
Example #26
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0
private static List<PageReadStore> readBlocksFromFile(Path file) throws IOException {
  List<PageReadStore> rowGroups = new ArrayList<PageReadStore>();

  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration, file, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader fileReader = new ParquetFileReader(configuration, metadata.getFileMetaData(), file, metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());

  PageReadStore group;
  while ((group = fileReader.readNextRowGroup()) != null) {
    rowGroups.add(group);
  }

  return rowGroups;
}
 
Example #27
Source File: TestMergeMetadataFiles.java    From parquet-mr with Apache License 2.0
@Test
public void testMergeMetadataFiles() throws Exception {
  WrittenFileInfo info = writeFiles(false);

  ParquetMetadata commonMeta1 = ParquetFileReader.readFooter(info.conf, info.commonMetaPath1, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata commonMeta2 = ParquetFileReader.readFooter(info.conf, info.commonMetaPath2, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata meta1 = ParquetFileReader.readFooter(info.conf, info.metaPath1, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata meta2 = ParquetFileReader.readFooter(info.conf, info.metaPath2, ParquetMetadataConverter.NO_FILTER);

  assertTrue(commonMeta1.getBlocks().isEmpty());
  assertTrue(commonMeta2.getBlocks().isEmpty());
  assertEquals(commonMeta1.getFileMetaData().getSchema(), commonMeta2.getFileMetaData().getSchema());

  assertFalse(meta1.getBlocks().isEmpty());
  assertFalse(meta2.getBlocks().isEmpty());
  assertEquals(meta1.getFileMetaData().getSchema(), meta2.getFileMetaData().getSchema());


  assertEquals(commonMeta1.getFileMetaData().getKeyValueMetaData(), commonMeta2.getFileMetaData().getKeyValueMetaData());
  assertEquals(meta1.getFileMetaData().getKeyValueMetaData(), meta2.getFileMetaData().getKeyValueMetaData());

  // test file serialization
  Path mergedOut = new Path(new File(temp.getRoot(), "merged_meta").getAbsolutePath());
  Path mergedCommonOut = new Path(new File(temp.getRoot(), "merged_common_meta").getAbsolutePath());
  ParquetFileWriter.writeMergedMetadataFile(Arrays.asList(info.metaPath1, info.metaPath2), mergedOut, info.conf);
  ParquetFileWriter.writeMergedMetadataFile(Arrays.asList(info.commonMetaPath1, info.commonMetaPath2), mergedCommonOut, info.conf);

  ParquetMetadata mergedMeta = ParquetFileReader.readFooter(info.conf, mergedOut, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata mergedCommonMeta = ParquetFileReader.readFooter(info.conf, mergedCommonOut, ParquetMetadataConverter.NO_FILTER);

  // Ideally we'd assert equality here, but BlockMetaData and its references don't implement equals
  assertEquals(meta1.getBlocks().size() + meta2.getBlocks().size(), mergedMeta.getBlocks().size());
  assertTrue(mergedCommonMeta.getBlocks().isEmpty());

  assertEquals(meta1.getFileMetaData().getSchema(), mergedMeta.getFileMetaData().getSchema());
  assertEquals(commonMeta1.getFileMetaData().getSchema(), mergedCommonMeta.getFileMetaData().getSchema());

  assertEquals(meta1.getFileMetaData().getKeyValueMetaData(), mergedMeta.getFileMetaData().getKeyValueMetaData());
  assertEquals(commonMeta1.getFileMetaData().getKeyValueMetaData(), mergedCommonMeta.getFileMetaData().getKeyValueMetaData());
}
 
Example #28
Source File: ParquetReadOptions.java    From parquet-mr with Apache License 2.0
ParquetReadOptions(boolean useSignedStringMinMax,
                   boolean useStatsFilter,
                   boolean useDictionaryFilter,
                   boolean useRecordFilter,
                   boolean useColumnIndexFilter,
                   boolean usePageChecksumVerification,
                   boolean useBloomFilter,
                   FilterCompat.Filter recordFilter,
                   ParquetMetadataConverter.MetadataFilter metadataFilter,
                   CompressionCodecFactory codecFactory,
                   ByteBufferAllocator allocator,
                   int maxAllocationSize,
                   Map<String, String> properties) {
  this.useSignedStringMinMax = useSignedStringMinMax;
  this.useStatsFilter = useStatsFilter;
  this.useDictionaryFilter = useDictionaryFilter;
  this.useRecordFilter = useRecordFilter;
  this.useColumnIndexFilter = useColumnIndexFilter;
  this.usePageChecksumVerification = usePageChecksumVerification;
  this.useBloomFilter = useBloomFilter;
  this.recordFilter = recordFilter;
  this.metadataFilter = metadataFilter;
  this.codecFactory = codecFactory;
  this.allocator = allocator;
  this.maxAllocationSize = maxAllocationSize;
  this.properties = Collections.unmodifiableMap(properties);
}
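
The constructor has package-private visibility; callers normally go through the builder instead, as in this hedged sketch:

ParquetReadOptions options = ParquetReadOptions.builder()
    .withMetadataFilter(ParquetMetadataConverter.NO_FILTER)
    .withRecordFilter(FilterCompat.NOOP)
    .build();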
 
Example #29
Source File: LocalDictionariesReader.java    From dremio-oss with Apache License 2.0
/**
 * Return dictionary per row group for all binary columns in given parquet file.
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @return pair of dictionaries found for binary fields and list of binary fields which are not dictionary encoded.
 * @throws IOException
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CompressionCodecFactory codecFactory) throws IOException {
  // Passing the max footer length is not required in this case as the parquet reader would already have failed.
  final ParquetMetadata parquetMetadata = SingletonParquetFooterCache.readFooter(fs, filePath, ParquetMetadataConverter.NO_FILTER,
    ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  if (parquetMetadata.getBlocks().size() > 1) {
    throw new IOException(
      format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
        parquetMetadata.getBlocks().size(), filePath));
  }
  final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
  final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();

  for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
    columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
  }

  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
  final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
  try(final FSInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
      if (isBinaryType(columnChunkMetaData.getType())) {
        final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
        // if first page is dictionary encoded then load dictionary, otherwise skip this column.
        final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
        if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
          dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
        } else {
          columnsToSkip.add(column);
        }
      }
    }
  }
  return new ImmutablePair<>(dictionaries, columnsToSkip);
}
 
Example #30
Source File: TableSchemaResolver.java    From hudi with Apache License 2.0
/**
 * Read the parquet schema from a parquet File.
 */
public MessageType readSchemaFromBaseFile(Path parquetFilePath) throws IOException {
  LOG.info("Reading schema from " + parquetFilePath);

  FileSystem fs = metaClient.getRawFs();
  if (!fs.exists(parquetFilePath)) {
    throw new IllegalArgumentException(
        "Failed to read schema from data file " + parquetFilePath + ". File does not exist.");
  }
  ParquetMetadata fileFooter =
      ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER);
  return fileFooter.getFileMetaData().getSchema();
}
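
A hedged usage sketch for the resolver method above; the resolver instance and file path are hypothetical.

MessageType schema = tableSchemaResolver.readSchemaFromBaseFile(
    new Path("/warehouse/hudi_table/2020/01/01/part-0001.parquet"));
System.out.println(schema.toString());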