Java Code Examples for org.apache.parquet.hadoop.metadata.ParquetMetadata#getBlocks()

The following examples show how to use org.apache.parquet.hadoop.metadata.ParquetMetadata#getBlocks(). They are taken from open source projects; the source file, project, and license are noted above each example.
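Before the per-project examples, here is a minimal, self-contained sketch (not from any of the projects below) of the pattern they all share: read a file's footer, call getBlocks() to obtain one BlockMetaData per row group, and inspect each block. The file path is a placeholder; NO_FILTER reads the full footer metadata without touching the data pages.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class GetBlocksSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path file = new Path(args[0]); // path to an existing Parquet file (placeholder)

    // Read only the footer (metadata); NO_FILTER keeps every row group's metadata.
    ParquetMetadata footer =
        ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);

    // One BlockMetaData per row group.
    for (BlockMetaData block : footer.getBlocks()) {
      System.out.printf("rows=%d, start=%d, totalByteSize=%d%n",
          block.getRowCount(), block.getStartingPos(), block.getTotalByteSize());
    }
  }
}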
Example 1
Source File: ParquetReaderUtility.java    From dremio-oss with Apache License 2.0
/**
 * Get the list of row group numbers for the given file input split. The logic used here is the
 * same as how Hive's Parquet input format finds the row group numbers for an input split.
 */
public static List<Integer> getRowGroupNumbersFromFileSplit(final long splitStart, final long splitLength,
                                                             final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();
  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
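For illustration, a hypothetical call site for the helper above (conf, filePath, and the split boundaries are placeholders, not values from the original project):

ParquetMetadata footer = ParquetFileReader.readFooter(
    conf, filePath, ParquetMetadataConverter.NO_FILTER);
long splitStart = 0L;                   // byte offset where the input split begins (placeholder)
long splitLength = 128L * 1024 * 1024;  // split length in bytes, 128 MB for illustration
List<Integer> rowGroupNums =
    ParquetReaderUtility.getRowGroupNumbersFromFileSplit(splitStart, splitLength, footer);
// rowGroupNums holds the indices of row groups whose first data page lies within the split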
 
Example 2
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
    metadata.getFileMetaData(),
    super.fsPath,
    metadata.getBlocks(),
    metadata.getFileMetaData().getSchema().getColumns());

  PageStatsValidator validator = new PageStatsValidator();

  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
 
Example 3
Source File: PrintFooter.java    From parquet-mr with Apache License 2.0
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++ blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
 
Example 4
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);

  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }

  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());

  fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));

  return fileMetaData;
}
 
Example 5
Source File: TestParquetWriterAppendBlocks.java    From parquet-mr with Apache License 2.0
@Test
public void testMergedMetadata() throws IOException {
  Path combinedFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(
      CONF, FILE_SCHEMA, combinedFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  ParquetMetadata combinedFooter = ParquetFileReader.readFooter(
      CONF, combinedFile, NO_FILTER);
  ParquetMetadata f1Footer = ParquetFileReader.readFooter(
      CONF, file1, NO_FILTER);
  ParquetMetadata f2Footer = ParquetFileReader.readFooter(
      CONF, file2, NO_FILTER);

  LinkedList<BlockMetaData> expectedRowGroups = new LinkedList<BlockMetaData>();
  expectedRowGroups.addAll(f1Footer.getBlocks());
  expectedRowGroups.addAll(f2Footer.getBlocks());

  Assert.assertEquals("Combined should have the right number of row groups",
      expectedRowGroups.size(),
      combinedFooter.getBlocks().size());

  long nextStart = 4; // the first row group starts right after the 4-byte "PAR1" magic
  for (BlockMetaData rowGroup : combinedFooter.getBlocks()) {
    BlockMetaData expected = expectedRowGroups.removeFirst();
    Assert.assertEquals("Row count should match",
        expected.getRowCount(), rowGroup.getRowCount());
    Assert.assertEquals("Compressed size should match",
        expected.getCompressedSize(), rowGroup.getCompressedSize());
    Assert.assertEquals("Total size should match",
        expected.getTotalByteSize(), rowGroup.getTotalByteSize());
    Assert.assertEquals("Start pos should be at the last row group's end",
        nextStart, rowGroup.getStartingPos());
    assertColumnsEquivalent(expected.getColumns(), rowGroup.getColumns());
    nextStart = rowGroup.getStartingPos() + rowGroup.getTotalByteSize();
  }
}
 
Example 6
Source File: CheckParquet251Command.java    From parquet-mr with Apache License 2.0
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), path, ParquetMetadataConverter.NO_FILTER);

  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(
        meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);

    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(
        meta.getSchema().getColumns(),
        new Predicate<ColumnDescriptor>() {
          @Override
          public boolean apply(@Nullable ColumnDescriptor input) {
            return input != null && input.getType() == BINARY;
          }
        }));

    // now check to see if the data is actually corrupt
    ParquetFileReader reader = new ParquetFileReader(getConf(),
        fakeMeta, path, footer.getBlocks(), columns);

    try {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null;
           pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }

  return null;
}
 
Example 7
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0
private static List<PageReadStore> readBlocksFromFile(Path file) throws IOException {
  List<PageReadStore> rowGroups = new ArrayList<PageReadStore>();

  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration, file, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader fileReader = new ParquetFileReader(configuration, metadata.getFileMetaData(), file, metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());

  PageReadStore group;
  while ((group = fileReader.readNextRowGroup()) != null) {
    rowGroups.add(group);
  }

  return rowGroups;
}
 
Example 8
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0
/** Construct ParquetFileReader for input file and columns */
private ParquetFileReader getParquetFileReader(Path path, Configuration conf,
                                               List<ColumnDescriptor> columns)
  throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(conf, path);
  return new ParquetFileReader(conf, footer.getFileMetaData(), path,
    footer.getBlocks(), columns);
}
 
Example 9
Source File: TestConvertAvroToParquet.java    From nifi with Apache License 2.0
@Test
public void test_Meta_Info() throws Exception {

    FileInputStream fileInputStream = new FileInputStream(tmpAvro);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    int readedBytes;
    byte[] buf = new byte[1024];
    while ((readedBytes = fileInputStream.read(buf)) > 0) {
        out.write(buf, 0, readedBytes);
    }
    out.close();

    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test.avro");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToParquet.SUCCESS).get(0);

    // Save the flowfile
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream(tmpParquet);
    fos.write(resultContents);
    fos.flush();
    fos.close();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    ParquetMetadata metaData;
    metaData = ParquetFileReader.readFooter(conf, new Path(tmpParquet.getAbsolutePath()), NO_FILTER);

    // number of records
    long nParquetRecords = 0;
    for(BlockMetaData meta : metaData.getBlocks()){
        nParquetRecords += meta.getRowCount();
    }
    long nAvroRecord = records.size();

    assertEquals(nParquetRecords, nAvroRecord);
}
 
Example 10
Source File: TestParquetWriterAppendBlocks.java    From parquet-mr with Apache License 2.0
@Test
public void testAllowDroppingColumns() throws IOException {
  MessageType droppedColumnSchema = Types.buildMessage()
      .required(BINARY).as(UTF8).named("string")
      .named("AppendTest");

  Path droppedColumnFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(
      CONF, droppedColumnSchema, droppedColumnFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  LinkedList<Group> expected = new LinkedList<Group>();
  expected.addAll(file1content);
  expected.addAll(file2content);

  ParquetMetadata footer = ParquetFileReader.readFooter(
      CONF, droppedColumnFile, NO_FILTER);
  for (BlockMetaData rowGroup : footer.getBlocks()) {
    Assert.assertEquals("Should have only the string column",
        1, rowGroup.getColumns().size());
  }

  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), droppedColumnFile)
      .build();

  Group next;
  while ((next = reader.read()) != null) {
    Group expectedNext = expected.removeFirst();
    Assert.assertEquals("Each string should match",
        expectedNext.getString("string", 0), next.getString("string", 0));
  }

  Assert.assertEquals("All records should be present", 0, expected.size());
}
 
Example 11
Source File: ParquetUtil.java    From iceberg with Apache License 2.0
/**
 * @return a list of offsets in ascending order determined by the starting position
 * of the row groups
 */
public static List<Long> getSplitOffsets(ParquetMetadata md) {
  List<Long> splitOffsets = new ArrayList<>(md.getBlocks().size());
  for (BlockMetaData blockMetaData : md.getBlocks()) {
    splitOffsets.add(blockMetaData.getStartingPos());
  }
  Collections.sort(splitOffsets);
  return splitOffsets;
}
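A minimal, hypothetical call site for the helper above (conf and file are placeholders); each returned offset is a row group's starting position, in ascending order:

ParquetMetadata md = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
List<Long> splitOffsets = ParquetUtil.getSplitOffsets(md);
// one entry per row group; typically recorded as split points for later scan planning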
 
Example 12
Source File: ColumnSizeCommand.java    From parquet-mr with Apache License 2.0
public Map<String, Long> getColumnSizeInBytes(Path inputFile) throws IOException {
  Map<String, Long> colSizes = new HashMap<>();
  ParquetMetadata pmd = ParquetFileReader.readFooter(new Configuration(), inputFile, ParquetMetadataConverter.NO_FILTER);

  for (BlockMetaData block : pmd.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      String colName = column.getPath().toDotString();
      colSizes.put(colName, column.getTotalSize() + colSizes.getOrDefault(colName, 0L));
    }
  }

  return colSizes;
}
 
Example 13
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0
static void showDetails(PrettyPrintWriter out, ParquetMetadata meta, boolean showOriginalTypes) {
  showDetails(out, meta.getFileMetaData(), showOriginalTypes);

  long i = 1;
  for (BlockMetaData bmeta : meta.getBlocks()) {
    out.println();
    showDetails(out, bmeta, i++);
  }
}
 
Example 14
Source File: UnifiedParquetReader.java    From dremio-oss with Apache License 2.0
@Override
public List<RecordReader> getReaders(final UnifiedParquetReader unifiedReader) throws ExecutionSetupException {
  final ParquetMetadata footer = unifiedReader.getFooter();
  final List<BlockMetaData> blocks = footer.getBlocks();
  final int rowGroupIdx = unifiedReader.readEntry.getRowGroupIndex();
  if (blocks.size() <= rowGroupIdx) {
    throw new IllegalArgumentException(
        String.format("Invalid rowgroup index in read entry. Given '%d', Max '%d'", rowGroupIdx, blocks.size())
    );
  }

  final long rowCount = blocks.get(rowGroupIdx).getRowCount();

  final RecordReader reader = new AbstractRecordReader(unifiedReader.context, Collections.<SchemaPath>emptyList()) {
    private long remainingRowCount = rowCount;

    @Override
    public void setup(OutputMutator output) throws ExecutionSetupException {

    }

    @Override
    public int next() {
      if (numRowsPerBatch > remainingRowCount) {
        int toReturn = (int) remainingRowCount;
        remainingRowCount = 0;
        return toReturn;
      }

      remainingRowCount -= numRowsPerBatch;
      return (int)numRowsPerBatch;
    }

    @Override
    public void close() throws Exception {

    }
  };
  return Collections.singletonList(reader);
}
 
Example 15
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers,
    long maxSplitSize, long minSplitSize, ReadContext readContext)
    throws IOException {
  List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
  Filter filter = ParquetInputFormat.getFilter(configuration);

  long rowGroupsDropped = 0;
  long totalRowGroups = 0;

  for (Footer footer : footers) {
    final Path file = footer.getFile();
    LOG.debug("{}", file);
    FileSystem fs = file.getFileSystem(configuration);
    FileStatus fileStatus = fs.getFileStatus(file);
    ParquetMetadata parquetMetaData = footer.getParquetMetadata();
    List<BlockMetaData> blocks = parquetMetaData.getBlocks();

    List<BlockMetaData> filteredBlocks;

    totalRowGroups += blocks.size();
    filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks, parquetMetaData.getFileMetaData().getSchema());
    rowGroupsDropped += blocks.size() - filteredBlocks.size();

    if (filteredBlocks.isEmpty()) {
      continue;
    }

    BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
    splits.addAll(
        generateSplits(
            filteredBlocks,
            fileBlockLocations,
            fileStatus,
            readContext.getRequestedSchema().toString(),
            readContext.getReadSupportMetadata(),
            minSplitSize,
            maxSplitSize)
        );
  }

  if (rowGroupsDropped > 0 && totalRowGroups > 0) {
    int percentDropped = (int) ((((double) rowGroupsDropped) / totalRowGroups) * 100);
    LOG.info("Dropping {} row groups that do not pass filter predicate! ({}%)", rowGroupsDropped, percentDropped);
  } else {
    LOG.info("There were no row groups that could be dropped due to filter predicates");
  }
  return splits;
}
 
Example 16
Source File: TestInputOutputFormatWithPadding.java    From parquet-mr with Apache License 2.0
@Test
public void testBasicBehaviorWithPadding() throws Exception {
  HadoopOutputFile.getBlockFileSystems().add("file");

  File inputFile = temp.newFile();
  FileOutputStream out = new FileOutputStream(inputFile);
  out.write(FILE_CONTENT.getBytes("UTF-8"));
  out.close();

  File tempFolder = temp.newFolder();
  tempFolder.delete();
  Path tempPath = new Path(tempFolder.toURI());

  File outputFolder = temp.newFile();
  outputFolder.delete();

  Configuration conf = new Configuration();
  // May test against multiple hadoop versions
  conf.set("dfs.block.size", "1024");
  conf.set("dfs.blocksize", "1024");
  conf.set("dfs.blockSize", "1024");
  conf.set("fs.local.block.size", "1024");

  // don't use a cached FS with a different block size
  conf.set("fs.file.impl.disable.cache", "true");

  // disable summary metadata, it isn't needed
  conf.set("parquet.enable.summary-metadata", "false");
  conf.set("parquet.example.schema", PARQUET_TYPE.toString());

  {
    Job writeJob = new Job(conf, "write");
    writeJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(writeJob, new Path(inputFile.toString()));

    writeJob.setOutputFormatClass(ParquetOutputFormat.class);
    writeJob.setMapperClass(Writer.class);
    writeJob.setNumReduceTasks(0); // write directly to Parquet without reduce
    ParquetOutputFormat.setWriteSupportClass(writeJob, GroupWriteSupport.class);
    ParquetOutputFormat.setBlockSize(writeJob, 1024);
    ParquetOutputFormat.setPageSize(writeJob, 512);
    ParquetOutputFormat.setDictionaryPageSize(writeJob, 512);
    ParquetOutputFormat.setEnableDictionary(writeJob, true);
    ParquetOutputFormat.setMaxPaddingSize(writeJob, 1023); // always pad
    ParquetOutputFormat.setOutputPath(writeJob, tempPath);

    waitForJob(writeJob);
  }

  // make sure padding was added
  File parquetFile = getDataFile(tempFolder);
  ParquetMetadata footer = ParquetFileReader.readFooter(conf,
      new Path(parquetFile.toString()), ParquetMetadataConverter.NO_FILTER);
  for (BlockMetaData block : footer.getBlocks()) {
    Assert.assertTrue("Block should start at a multiple of the block size",
        block.getStartingPos() % 1024 == 0);
  }

  {
    Job readJob = new Job(conf, "read");
    readJob.setInputFormatClass(NoSplits.class);
    ParquetInputFormat.setReadSupportClass(readJob, GroupReadSupport.class);
    TextInputFormat.addInputPath(readJob, tempPath);

    readJob.setOutputFormatClass(TextOutputFormat.class);
    readJob.setMapperClass(Reader.class);
    readJob.setNumReduceTasks(0); // write directly to text without reduce
    TextOutputFormat.setOutputPath(readJob, new Path(outputFolder.toString()));

    waitForJob(readJob);
  }

  File dataFile = getDataFile(outputFolder);
  Assert.assertNotNull("Should find a data file", dataFile);

  StringBuilder contentBuilder = new StringBuilder();
  for (String line : Files.readAllLines(dataFile.toPath(), StandardCharsets.UTF_8)) {
    contentBuilder.append(line);
  }
  String reconstructed = contentBuilder.toString();
  Assert.assertEquals("Should match written file content",
      FILE_CONTENT, reconstructed);

  HadoopOutputFile.getBlockFileSystems().remove("file");
}
 
Example 17
Source File: TestParquetWriterNewPage.java    From parquet-mr with Apache License 2.0
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  Path root = new Path("target/tests/TestParquetWriter/");
  FileSystem fs = root.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = parseMessageType(
      "message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(3) flba_field; "
      + "required int96 int96_field; "
      + "optional binary null_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  Map<String, Encoding> expected = new HashMap<String, Encoding>();
  expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
  expected.put("1000-" + PARQUET_1_0, PLAIN);
  expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
  expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
  for (int modulo : asList(10, 1000)) {
    for (WriterVersion version : WriterVersion.values()) {
      Path file = new Path(root, version.name() + "_" + modulo);
      ParquetWriter<Group> writer = new ParquetWriter<Group>(
          file,
          new GroupWriteSupport(),
          UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
      for (int i = 0; i < 1000; i++) {
        writer.write(
            f.newGroup()
            .append("binary_field", "test" + (i % modulo))
            .append("int32_field", 32)
            .append("int64_field", 64l)
            .append("boolean_field", true)
            .append("float_field", 1.0f)
            .append("double_field", 2.0d)
            .append("flba_field", "foo")
            .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
      }
      writer.close();

      ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
      for (int i = 0; i < 1000; i++) {
        Group group = reader.read();
        assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
        assertEquals(32, group.getInteger("int32_field", 0));
        assertEquals(64l, group.getLong("int64_field", 0));
        assertEquals(true, group.getBoolean("boolean_field", 0));
        assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
        assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
        assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
        assertEquals(Binary.fromConstantByteArray(new byte[12]), group.getInt96("int96_field",
            0));
        assertEquals(0, group.getFieldRepetitionCount("null_field"));
      }
      reader.close();
      ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
      for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
          if (column.getPath().toDotString().equals("binary_field")) {
            String key = modulo + "-" + version;
            Encoding expectedEncoding = expected.get(key);
            assertTrue(
                key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                column.getEncodings().contains(expectedEncoding));
          }
        }
      }
    }
  }
}
 
Example 18
Source File: TestParquetWriter.java    From parquet-mr with Apache License 2.0
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  Path root = new Path("target/tests/TestParquetWriter/");
  enforceEmptyDir(conf, root);
  MessageType schema = parseMessageType(
      "message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(3) flba_field; "
      + "required int96 int96_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  Map<String, Encoding> expected = new HashMap<String, Encoding>();
  expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
  expected.put("1000-" + PARQUET_1_0, PLAIN);
  expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
  expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
  for (int modulo : asList(10, 1000)) {
    for (WriterVersion version : WriterVersion.values()) {
      Path file = new Path(root, version.name() + "_" + modulo);
      ParquetWriter<Group> writer = new ParquetWriter<Group>(
          file,
          new GroupWriteSupport(),
          UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
      for (int i = 0; i < 1000; i++) {
        writer.write(
            f.newGroup()
            .append("binary_field", "test" + (i % modulo))
            .append("int32_field", 32)
            .append("int64_field", 64l)
            .append("boolean_field", true)
            .append("float_field", 1.0f)
            .append("double_field", 2.0d)
            .append("flba_field", "foo")
            .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
      }
      writer.close();
      ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
      for (int i = 0; i < 1000; i++) {
        Group group = reader.read();
        assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
        assertEquals(32, group.getInteger("int32_field", 0));
        assertEquals(64l, group.getLong("int64_field", 0));
        assertEquals(true, group.getBoolean("boolean_field", 0));
        assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
        assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
        assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
        assertEquals(Binary.fromConstantByteArray(new byte[12]),
            group.getInt96("int96_field",0));
      }
      reader.close();
      ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
      for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
          if (column.getPath().toDotString().equals("binary_field")) {
            String key = modulo + "-" + version;
            Encoding expectedEncoding = expected.get(key);
            assertTrue(
                key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                column.getEncodings().contains(expectedEncoding));
          }
        }
      }
      assertEquals("Object model property should be example",
          "example", footer.getFileMetaData().getKeyValueMetaData()
              .get(ParquetWriter.OBJECT_MODEL_NAME_PROP));
    }
  }
}
 
Example 19
Source File: Metadata.java    From dremio-oss with Apache License 2.0
private ParquetFileMetadata getParquetFileMetadata(FileAttributes file, AtomicInteger currentNumSplits, long maxSplits) throws IOException {
  final ParquetMetadata metadata =
    SingletonParquetFooterCache.readFooter(fs, file, ParquetMetadataConverter.NO_FILTER, maxFooterLength);
  final int numSplits = currentNumSplits.addAndGet(metadata.getBlocks().size());
  if (numSplits > maxSplits) {
    throw new TooManySplitsException(
      String.format("Too many splits encountered when processing parquet metadata at file %s, maximum is %d but encountered %d splits thus far.",
        file.getPath(), maxSplits, numSplits));
  }

  final MessageType schema = metadata.getFileMetaData().getSchema();

  Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
  schema.getPaths();
  for (String[] path : schema.getPaths()) {
    originalTypeMap.put(SchemaPath.getCompoundPath(path), getOriginalType(schema, path, 0));
  }

  List<RowGroupMetadata> rowGroupMetadataList = Lists.newArrayList();

  ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
  ALL_COLS.add(AbstractRecordReader.STAR_COLUMN);
  boolean autoCorrectCorruptDates = formatConfig.autoCorrectCorruptDates;
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
  if(logger.isDebugEnabled()){
    logger.debug(containsCorruptDates.toString());
  }
  final Map<ColumnTypeMetadata.Key, ColumnTypeMetadata> columnTypeInfo = Maps.newHashMap();
  int rowGroupIdx = 0;
  for (BlockMetaData rowGroup : metadata.getBlocks()) {
    List<ColumnMetadata> columnMetadataList = Lists.newArrayList();
    long length = 0;
    for (ColumnChunkMetaData col : rowGroup.getColumns()) {
      ColumnMetadata columnMetadata;

      // Statistics might have just the non-null counts, with no min/max; they might be
      // initialized to zero instead of null.
      // Check that statistics actually have non-null values, or that the column has all nulls.
      boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty()
        && (col.getStatistics().hasNonNullValue()) || col.getStatistics().getNumNulls() ==
        rowGroup.getRowCount());

      Statistics<?> stats = col.getStatistics();
      String[] columnName = col.getPath().toArray();
      SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
      ColumnTypeMetadata columnTypeMetadata =
          new ColumnTypeMetadata(columnName, col.getType(), originalTypeMap.get(columnSchemaName));

      columnTypeInfo.put(new ColumnTypeMetadata.Key(columnTypeMetadata.name), columnTypeMetadata);
      if (statsAvailable) {
        // Write stats only if minVal==maxVal. Also, we then store only maxVal
        Object mxValue = null;
        if (stats.genericGetMax() != null && stats.genericGetMin() != null &&
            stats.genericGetMax().equals(stats.genericGetMin())) {
          mxValue = stats.genericGetMax();
          if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION
              && columnTypeMetadata.originalType == OriginalType.DATE) {
            mxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) mxValue);
          }
        }
        columnMetadata =
            new ColumnMetadata(columnTypeMetadata.name, mxValue, stats.getNumNulls());
      } else {
        // log it under trace to avoid lot of log entries.
        logger.trace("Stats are not available for column {}, rowGroupIdx {}, file {}",
            columnSchemaName, rowGroupIdx, file.getPath());
        columnMetadata = new ColumnMetadata(columnTypeMetadata.name,null, null);
      }
      columnMetadataList.add(columnMetadata);
      length += col.getTotalSize();
    }

    RowGroupMetadata rowGroupMeta =
        new RowGroupMetadata(rowGroup.getStartingPos(), length, rowGroup.getRowCount(),
            getHostAffinity(fs, file, rowGroup.getStartingPos(), length), columnMetadataList);

    rowGroupMetadataList.add(rowGroupMeta);
    rowGroupIdx++;
  }

  return new ParquetFileMetadata(file, file.size(), rowGroupMetadataList, columnTypeInfo);
}