Java Code Examples for org.apache.parquet.schema.MessageType#getColumnDescription()

The following examples show how to use org.apache.parquet.schema.MessageType#getColumnDescription(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Prints a one-line description of a primitive field: its name prefixed with
 * {@code depth} dots, its repetition and primitive type, its original type when
 * one is set and, when a container schema is given, the column's maximum
 * repetition and definition levels.
 *
 * @param out       writer that receives the formatted line
 * @param type      primitive field being described
 * @param depth     number of "." characters placed in front of the field name
 * @param container schema used to look up the column descriptor; when null the
 *                  R/D levels are not printed
 * @param cpath     path of the enclosing fields; the field name is appended and
 *                  removed again while the full column path is built
 */
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath) {
  final String fieldName = type.getName();
  final String label = Strings.repeat(".", depth) + fieldName;
  final OriginalType originalType = type.getOriginalType();
  final Repetition repetition = type.getRepetition();
  final PrimitiveTypeName primitiveName = type.getPrimitiveTypeName();

  out.format("%s: %s %s", label, repetition, primitiveName);
  if (originalType != null) {
    out.format(" O:%s", originalType);
  }

  if (container == null) {
    out.println();
    return;
  }

  // Push this field onto the shared path just long enough to snapshot the full column path.
  final int pushedAt = cpath.size();
  cpath.add(fieldName);
  final String[] columnPath = cpath.toArray(new String[0]);
  cpath.remove(pushedAt);

  final ColumnDescriptor descriptor = container.getColumnDescription(columnPath);
  out.format(" R:%d D:%d", descriptor.getMaxRepetitionLevel(), descriptor.getMaxDefinitionLevel());
  out.println();
}
 
Example 2
Source File: TestMemColumn.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Writes one required int64 value (42) for column foo.bar through a
 * {@code MemPageStore}-backed write store, then reads it back and checks the
 * value together with its repetition and definition levels.
 */
@Test
public void testMemColumn() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message msg { required group foo { required int64 bar; } }");
  ColumnDescriptor path = schema.getColumnDescription(new String[] {"foo", "bar"});
  MemPageStore memPageStore = new MemPageStore(10);
  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
  columnWriter.write(42L, 0, 0);
  memColumnsStore.endRecord();
  memColumnsStore.flush();

  ColumnReader columnReader = getColumnReader(memPageStore, path, schema);
  for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
    // JUnit's contract is assertEquals(expected, actual); keeping that order makes failure messages accurate.
    assertEquals(0, columnReader.getCurrentRepetitionLevel());
    assertEquals(0, columnReader.getCurrentDefinitionLevel());
    assertEquals(42L, columnReader.getLong());
    columnReader.consume();
  }
}
 
Example 3
Source File: TestMemColumn.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Writes one required binary value ("42") for column foo.bar through a
 * {@code MemPageStore}-backed write store, then reads it back and checks the
 * decoded string together with its repetition and definition levels.
 */
@Test
public void testMemColumnBinary() throws Exception {
  MessageType mt = MessageTypeParser.parseMessageType("message msg { required group foo { required binary bar; } }");
  String[] col = new String[]{"foo", "bar"};
  MemPageStore memPageStore = new MemPageStore(10);

  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  ColumnDescriptor path = mt.getColumnDescription(col);

  ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
  columnWriter.write(Binary.fromString("42"), 0, 0);
  memColumnsStore.endRecord();
  memColumnsStore.flush();

  ColumnReader columnReader = getColumnReader(memPageStore, path, mt);
  for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
    // JUnit's contract is assertEquals(expected, actual); keeping that order makes failure messages accurate.
    assertEquals(0, columnReader.getCurrentRepetitionLevel());
    assertEquals(0, columnReader.getCurrentDefinitionLevel());
    assertEquals("42", columnReader.getBinary().toStringUsingUTF8());
    columnReader.consume();
  }
}
 
Example 4
Source File: TestMemColumn.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Writes 2000 records, each holding the required int64 value 42 for column
 * foo.bar, through a {@code MemPageStore}-backed write store, then reads every
 * value back and checks it together with its repetition and definition levels.
 */
@Test
public void testMemColumnSeveralPages() throws Exception {
  MessageType mt = MessageTypeParser.parseMessageType("message msg { required group foo { required int64 bar; } }");
  String[] col = new String[]{"foo", "bar"};
  MemPageStore memPageStore = new MemPageStore(10);
  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  ColumnDescriptor path = mt.getColumnDescription(col);

  ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
  for (int i = 0; i < 2000; i++) {
    columnWriter.write(42L, 0, 0);
    memColumnsStore.endRecord();
  }
  memColumnsStore.flush();

  ColumnReader columnReader = getColumnReader(memPageStore, path, mt);
  for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
    // JUnit's contract is assertEquals(expected, actual); keeping that order makes failure messages accurate.
    assertEquals(0, columnReader.getCurrentRepetitionLevel());
    assertEquals(0, columnReader.getCurrentDefinitionLevel());
    assertEquals(42L, columnReader.getLong());
    columnReader.consume();
  }
}
 
Example 5
Source File: PrintFooter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Accumulates one file footer into the running totals: bumps {@code blockCount}
 * once per block, adds each block's row count to {@code recordCount}, and hands
 * every column chunk (with its resolved column descriptor) to the per-column
 * {@code add} overload.
 *
 * @param footer footer whose blocks and column chunks are accumulated
 */
private static void add(ParquetMetadata footer) {
  for (BlockMetaData rowGroup : footer.getBlocks()) {
    ++blockCount;
    // Looked up per block, so a footer without blocks never touches the file metadata.
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    recordCount += rowGroup.getRowCount();
    for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
      String[] columnPath = chunk.getPath().toArray();
      ColumnDescriptor descriptor = fileSchema.getColumnDescription(columnPath);
      add(
          descriptor,
          chunk.getValueCount(),
          chunk.getTotalSize(),
          chunk.getTotalUncompressedSize(),
          chunk.getEncodings(),
          chunk.getStatistics());
    }
  }
}
 
Example 6
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Prints a one-line description of a primitive field: its name prefixed with
 * {@code depth} dots, its repetition and primitive type, then either the
 * original type (" O:…") or the logical type annotation (" L:…"), and finally,
 * when a container schema is given, the column's maximum repetition and
 * definition levels.
 *
 * @param out               writer that receives the formatted line
 * @param type              primitive field being described
 * @param depth             number of "." characters placed in front of the field name
 * @param container         schema used to look up the column descriptor; when null
 *                          the R/D levels are not printed
 * @param cpath             path of the enclosing fields; the field name is appended
 *                          and removed again while the full column path is built
 * @param showOriginalTypes true to print the original type, false to print the
 *                          logical type annotation
 */
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath, boolean showOriginalTypes) {
  final String fieldName = type.getName();
  final String label = Strings.repeat(".", depth) + fieldName;
  final Repetition repetition = type.getRepetition();
  final PrimitiveTypeName primitiveName = type.getPrimitiveTypeName();

  out.format("%s: %s %s", label, repetition, primitiveName);
  if (!showOriginalTypes) {
    LogicalTypeAnnotation logicalType = type.getLogicalTypeAnnotation();
    if (logicalType != null) {
      out.format(" L:%s", logicalType);
    }
  } else {
    OriginalType originalType;
    try {
      originalType = type.getOriginalType();
    } catch (Exception e) {
      // Best effort: any failure to resolve the original type is treated as "none set".
      originalType = null;
    }
    if (originalType != null) {
      out.format(" O:%s", originalType);
    }
  }

  if (container == null) {
    out.println();
    return;
  }

  // Push this field onto the shared path just long enough to snapshot the full column path.
  final int pushedAt = cpath.size();
  cpath.add(fieldName);
  final String[] columnPath = cpath.toArray(new String[0]);
  cpath.remove(pushedAt);

  final ColumnDescriptor descriptor = container.getColumnDescription(columnPath);
  out.format(" R:%d D:%d", descriptor.getMaxRepetitionLevel(), descriptor.getMaxDefinitionLevel());
  out.println();
}
 
Example 7
Source File: TestInputFormat.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a Parquet file at the location of {@code file} containing two blocks
 * for the single binary column a.b: the first block holds two data pages, the
 * second block holds one. The file is finished with empty extra metadata.
 *
 * @param file destination; its URI is used as the output path
 * @throws IOException declared for the file-writer calls
 */
private void createParquetFile(File file) throws IOException {
  final Path target = new Path(file.toURI());
  final Configuration conf = new Configuration();

  final MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;}}");
  final ColumnDescriptor column = schema.getColumnDescription(new String[] {"a", "b"});

  final byte[] firstPayload = { 0, 1, 2, 3};
  final byte[] secondPayload = { 2, 3, 4, 5};
  final CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  final BinaryStatistics stats = new BinaryStatistics();

  final ParquetFileWriter writer = new ParquetFileWriter(conf, schema, target);
  writer.start();

  // First block: two pages, both carrying the first payload.
  writer.startBlock(3);
  writer.startColumn(column, 5, codec);
  writer.writeDataPage(2, 4, BytesInput.from(firstPayload), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.writeDataPage(3, 4, BytesInput.from(firstPayload), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.endBlock();

  // Second block: a single page carrying the second payload.
  writer.startBlock(4);
  writer.startColumn(column, 7, codec);
  writer.writeDataPage(7, 4, BytesInput.from(secondPayload), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.endBlock();

  writer.end(new HashMap<String, String>());
}
 
Example 8
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a file with one block for the binary column "foo", attaches a bloom
 * filter holding the hashes of "hello" and "world", then reopens the file and
 * checks that the bloom filter read back for that column chunk contains both
 * hashes.
 */
@Test
public void testBloomFilterWriteRead() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
  File testFile = temp.newFile();
  testFile.delete();
  Path path = new Path(testFile.toURI());
  Configuration configuration = new Configuration();
  configuration.set("parquet.bloom.filter.column.names", "foo");
  String[] colPath = {"foo"};
  ColumnDescriptor col = schema.getColumnDescription(colPath);
  BinaryStatistics stats1 = new BinaryStatistics();
  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  w.startBlock(3);
  w.startColumn(col, 5, CODEC);
  w.writeDataPage(2, 4, BytesInput.from(BYTES1),stats1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES1),stats1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  BloomFilter blockSplitBloomFilter = new BlockSplitBloomFilter(0);
  blockSplitBloomFilter.insertHash(blockSplitBloomFilter.hash(Binary.fromString("hello")));
  blockSplitBloomFilter.insertHash(blockSplitBloomFilter.hash(Binary.fromString("world")));
  w.addBloomFilter("foo", blockSplitBloomFilter);
  w.endBlock();
  w.end(new HashMap<>());
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
  // try-with-resources so the reader is closed even when an assertion fails.
  try (ParquetFileReader r = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path,
      Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(schema.getColumnDescription(colPath)))) {
    BloomFilterReader bloomFilterReader = r.getBloomFilterDataReader(readFooter.getBlocks().get(0));
    BloomFilter bloomFilter = bloomFilterReader.readBloomFilter(readFooter.getBlocks().get(0).getColumns().get(0));
    assertTrue(bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("hello"))));
    assertTrue(bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("world"))));
  }
}
 
Example 9
Source File: ParquetMetadataCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Logs one formatted summary line for a column chunk: dotted column name, type
 * (with the type length for FIXED_LEN_BYTE_ARRAY), codec, encoding summary,
 * value count, size per value, null count (blank when statistics are missing
 * or the null count is not set) and min/max.
 *
 * @param console logger that receives the line at info level
 * @param width   width of the left-aligned column-name field
 * @param column  column chunk metadata to summarise
 * @param schema  file schema used to resolve the column's primitive type and descriptor
 */
private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) {
  String[] path = column.getPath().toArray();
  PrimitiveType type = primitive(schema, path);
  Preconditions.checkNotNull(type);

  ColumnDescriptor descriptor = schema.getColumnDescription(path);
  long totalSize = column.getTotalSize();
  long valueCount = column.getValueCount();
  float bytesPerValue = ((float) totalSize) / valueCount;
  CompressionCodecName codec = column.getCodec();

  // Prefer the encoding stats when the chunk carries them; otherwise fall back to the plain encoding set.
  Set<Encoding> encodings = column.getEncodings();
  EncodingStats encodingStats = column.getEncodingStats();
  String encodingSummary;
  if (encodingStats != null) {
    encodingSummary = encodingStatsAsString(encodingStats);
  } else {
    encodingSummary = encodingsAsString(encodings, descriptor);
  }

  Statistics stats = column.getStatistics();
  String nullCount = "";
  if (stats != null && stats.isNumNullsSet()) {
    nullCount = String.valueOf(stats.getNumNulls());
  }

  String name = column.getPath().toDotString();
  PrimitiveType.PrimitiveTypeName typeName = type.getPrimitiveTypeName();

  String line;
  if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    line = String.format("%-" + width + "s  FIXED[%d] %s %-7s %-9d %-8s %-7s %s",
        name, type.getTypeLength(), shortCodec(codec), encodingSummary, valueCount,
        humanReadable(bytesPerValue), nullCount, minMaxAsString(stats));
  } else {
    line = String.format("%-" + width + "s  %-9s %s %-7s %-9d %-10s %-7s %s",
        name, typeName, shortCodec(codec), encodingSummary, valueCount,
        humanReadable(bytesPerValue), nullCount, minMaxAsString(stats));
  }
  console.info(line);
}
 
Example 10
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Writes a file with two blocks and two columns (binary a.b and int64 c.d),
 * attaching min/max statistics to every data page, then reads the footer back
 * and checks that each column chunk's statistics equal the combined min/max of
 * the pages written for it.
 */
@Test
public void testWriteReadStatistics() throws Exception {
  // this test assumes statistics will be read
  Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY));

  File testFile = temp.newFile();
  testFile.delete();

  Path path = new Path(testFile.toURI());
  Configuration configuration = new Configuration();
  configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);

  MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b (UTF8);} required group c { required int64 d; }}");
  String[] path1 = {"a", "b"};
  ColumnDescriptor c1 = schema.getColumnDescription(path1);
  String[] path2 = {"c", "d"};
  ColumnDescriptor c2 = schema.getColumnDescription(path2);

  byte[] bytes1 = { 0, 1, 2, 3};
  byte[] bytes2 = { 1, 2, 3, 4};
  byte[] bytes3 = { 2, 3, 4, 5};
  byte[] bytes4 = { 3, 4, 5, 6};
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;

  // Naming: B<block> C<column> P<page>.
  BinaryStatistics statsB1C1P1 = new BinaryStatistics();
  BinaryStatistics statsB1C1P2 = new BinaryStatistics();
  LongStatistics statsB1C2P1 = new LongStatistics();
  LongStatistics statsB1C2P2 = new LongStatistics();
  BinaryStatistics statsB2C1P1 = new BinaryStatistics();
  LongStatistics statsB2C2P1 = new LongStatistics();
  statsB1C1P1.setMinMax(Binary.fromString("s"), Binary.fromString("z"));
  statsB1C1P2.setMinMax(Binary.fromString("a"), Binary.fromString("b"));
  statsB1C2P1.setMinMax(2L, 10L);
  statsB1C2P2.setMinMax(-6L, 4L);
  statsB2C1P1.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
  statsB2C2P1.setMinMax(11L, 122L);

  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  w.startBlock(3);
  w.startColumn(c1, 5, codec);
  w.writeDataPage(2, 4, BytesInput.from(bytes1), statsB1C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(bytes1), statsB1C1P2, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(c2, 6, codec);
  w.writeDataPage(3, 4, BytesInput.from(bytes2), statsB1C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(1, 4, BytesInput.from(bytes2), statsB1C2P2, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();

  w.startBlock(4);
  w.startColumn(c1, 7, codec);
  w.writeDataPage(7, 4, BytesInput.from(bytes3), statsB2C1P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.startColumn(c2, 8, codec);
  w.writeDataPage(8, 4, BytesInput.from(bytes4), statsB2C2P1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.end(new HashMap<String, String>());

  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
  // NOTE(review): the path result is discarded; presumably a smoke check that every
  // column chunk in the footer is reachable — confirm before removing.
  for (BlockMetaData block : readFooter.getBlocks()) {
    for (ColumnChunkMetaData col : block.getColumns()) {
      col.getPath();
    }
  }
  // correct statistics
  BinaryStatistics bs1 = new BinaryStatistics();
  bs1.setMinMax(Binary.fromString("a"), Binary.fromString("z"));
  LongStatistics ls1 = new LongStatistics();
  ls1.setMinMax(-6L, 10L);

  BinaryStatistics bs2 = new BinaryStatistics();
  bs2.setMinMax(Binary.fromString("d"), Binary.fromString("e"));
  LongStatistics ls2 = new LongStatistics();
  ls2.setMinMax(11L, 122L);

  { // assert stats are correct for the first block
    TestUtils.assertStatsValuesEqual(bs1, readFooter.getBlocks().get(0).getColumns().get(0).getStatistics());
    TestUtils.assertStatsValuesEqual(ls1, readFooter.getBlocks().get(0).getColumns().get(1).getStatistics());
  }
  { // assert stats are correct for the second block
    TestUtils.assertStatsValuesEqual(bs2, readFooter.getBlocks().get(1).getColumns().get(0).getStatistics());
    TestUtils.assertStatsValuesEqual(ls2, readFooter.getBlocks().get(1).getColumns().get(1).getStatistics());
  }
}
 
Example 11
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Writes a two-block Parquet file for columns a.b and c.d of the given schema.
 * The first block has two pages for a.b and three for c.d; the second block
 * has one page per column. The footer's extra metadata holds "foo" -> "bar"
 * and the file name mapped to itself.
 *
 * @param configuration configuration passed to the file writer
 * @param path          output location; its name is also stored in the extra metadata
 * @param schema        schema that must contain the columns a.b and c.d
 * @throws IOException declared for the file-writer calls
 */
private void createFile(Configuration configuration, Path path, MessageType schema) throws IOException {
  final ColumnDescriptor columnAB = schema.getColumnDescription(new String[] {"a", "b"});
  final ColumnDescriptor columnCD = schema.getColumnDescription(new String[] {"c", "d"});

  final byte[] payload1 = { 0, 1, 2, 3};
  final byte[] payload2 = { 1, 2, 3, 4};
  final byte[] payload3 = { 2, 3, 4, 5};
  final byte[] payload4 = { 3, 4, 5, 6};
  final CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;

  final BinaryStatistics statsAB = new BinaryStatistics();
  final BinaryStatistics statsCD = new BinaryStatistics();

  final ParquetFileWriter writer = new ParquetFileWriter(configuration, schema, path);
  writer.start();

  // Block 1: two pages for a.b, three pages for c.d.
  writer.startBlock(3);
  writer.startColumn(columnAB, 5, codec);
  writer.writeDataPage(2, 4, BytesInput.from(payload1), statsAB, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.writeDataPage(3, 4, BytesInput.from(payload1), statsAB, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.startColumn(columnCD, 6, codec);
  writer.writeDataPage(2, 4, BytesInput.from(payload2), statsCD, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.writeDataPage(3, 4, BytesInput.from(payload2), statsCD, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.writeDataPage(1, 4, BytesInput.from(payload2), statsCD, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.endBlock();

  // Block 2: one page per column.
  writer.startBlock(4);
  writer.startColumn(columnAB, 7, codec);
  writer.writeDataPage(7, 4, BytesInput.from(payload3), statsAB, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.startColumn(columnCD, 8, codec);
  writer.writeDataPage(8, 4, BytesInput.from(payload4), statsCD, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.endBlock();

  final HashMap<String, String> footerKeyValues = new HashMap<String, String>();
  footerKeyValues.put("foo", "bar");
  final String fileName = path.getName();
  footerKeyValues.put(fileName, fileName);
  writer.end(footerKeyValues);
}
 
Example 12
Source File: Util.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Resolves a column name to its descriptor in the given schema.
 *
 * @param column column name; it is split with {@code DOT} into path segments
 * @param schema schema expected to contain the column
 * @return the schema's descriptor for the resulting path
 * @throws IllegalArgumentException presumably, via {@code Preconditions.checkArgument},
 *         when the schema does not contain the path — confirm against the
 *         {@code Preconditions} class in use
 */
public static ColumnDescriptor descriptor(String column, MessageType schema) {
  final String[] segments = Iterables.toArray(DOT.split(column), String.class);
  final boolean present = schema.containsPath(segments);
  Preconditions.checkArgument(present, "Schema doesn't have column: " + column);
  return schema.getColumnDescription(segments);
}