Java Code Examples for org.apache.parquet.schema.MessageTypeParser#parseMessageType()

The following examples show how to use org.apache.parquet.schema.MessageTypeParser#parseMessageType(). They are drawn from open-source projects; the source file, originating project, and license are noted above each example.
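Before the project examples, here is a minimal, self-contained sketch of the method itself; the schema text is illustrative and not taken from any project below:

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ParseMessageTypeSketch {
  public static void main(String[] args) {
    // Parse a Parquet schema from its textual representation.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example {\n"
      + "  required int64 id;\n"
      + "  optional binary name (UTF8);\n"
      + "}");
    // MessageType.toString() produces the same textual form, so the
    // result round-trips through parseMessageType() (see Example 10).
    System.out.println(schema);
  }
}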
Example 1
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0
@Test
public void testOneOfEach() {
  // "oneOfEach" is a schema string defined elsewhere in TestColumnIO
  MessageType oneOfEachSchema = MessageTypeParser.parseMessageType(oneOfEach);
  GroupFactory gf = new SimpleGroupFactory(oneOfEachSchema);
  Group g1 = gf.newGroup()
      .append("a", 1l)
      .append("b", 2)
      .append("c", 3.0f)
      .append("d", 4.0d)
      .append("e", true)
      .append("f", Binary.fromString("6"))
      .append("g", new NanoTime(1234, System.currentTimeMillis() * 1000))
      .append("h", Binary.fromString("abc"));

  testSchema(oneOfEachSchema, Arrays.asList(g1));
}
 
Example 2
Source File: SqlInterpreterTest.java    From zeppelin with Apache License 2.0
public File createParquetFile(int[] values,
                              ParquetProperties.WriterVersion version) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".par");
  file.delete(); // delete the empty placeholder so the Parquet writer can create the file itself
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType(
          "message test { "
                  + "required int32 int32_field; "
                  + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  ParquetWriter<Group> writer = new ParquetWriter<Group>(
          path,
          new GroupWriteSupport(),
          // codec, block size, page size, dictionary page size, enable dictionary, disable validation
          CompressionCodecName.UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
  for (int i = 0; i < values.length; i++) {
    writer.write(f.newGroup()
            .append("int32_field", values[i]));
  }
  writer.close();
  return file;
}
 
Example 3
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0
@Test
public void testWriteMode() throws Exception {
  File testFile = temp.newFile();
  MessageType schema = MessageTypeParser.parseMessageType(
      "message m { required group a {required binary b;} required group "
      + "c { required int64 d; }}");
  Configuration conf = new Configuration();

  ParquetFileWriter writer = null;
  boolean exceptionThrown = false;
  Path path = new Path(testFile.toURI());
  try {
    // temp.newFile() has already created the file, so CREATE mode must fail
    writer = new ParquetFileWriter(conf, schema, path,
        ParquetFileWriter.Mode.CREATE);
  } catch(IOException ioe1) {
    exceptionThrown = true;
  }
  assertTrue(exceptionThrown);
  exceptionThrown = false;
  try {
    // OVERWRITE mode succeeds even though the file already exists
    writer = new ParquetFileWriter(conf, schema, path,
        OVERWRITE);
  } catch(IOException ioe2) {
    exceptionThrown = true;
  }
  assertFalse(exceptionThrown);
  testFile.delete();
}
 
Example 4
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0
@Test
public void testRequiredOfRequired() {
  MessageType reqreqSchema = MessageTypeParser.parseMessageType(
        "message Document {\n"
      + "  required group foo {\n"
      + "    required int64 bar;\n"
      + "  }\n"
      + "}\n");

  GroupFactory gf = new SimpleGroupFactory(reqreqSchema);
  Group g1 = gf.newGroup();
  g1.addGroup("foo").append("bar", 2l);

  testSchema(reqreqSchema, Arrays.asList(g1));
}
 
Example 5
Source File: TestColumnReaderImpl.java    From parquet-mr with Apache License 2.0
@Test
public void testOptional() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message test { optional binary foo; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  MemPageWriter pageWriter = new MemPageWriter();
  ColumnWriterV2 columnWriterV2 = new ColumnWriterV2(col, pageWriter,
      ParquetProperties.builder()
          .withDictionaryPageSize(1024).withWriterVersion(PARQUET_2_0)
          .withPageSize(2048).build());
  for (int i = 0; i < rows; i++) { // "rows" is a constant defined elsewhere in the test class
    columnWriterV2.writeNull(0, 0);
    if ((i + 1) % 1000 == 0) {
      columnWriterV2.writePage();
    }
  }
  columnWriterV2.writePage();
  columnWriterV2.finalizeColumnChunk();
  List<DataPage> pages = pageWriter.getPages();
  int valueCount = 0;
  int rowCount = 0;
  for (DataPage dataPage : pages) {
    valueCount += dataPage.getValueCount();
    rowCount += ((DataPageV2)dataPage).getRowCount();
  }
  assertEquals(rows, rowCount);
  assertEquals(rows, valueCount);
  MemPageReader pageReader = new MemPageReader(rows, pages.iterator(), pageWriter.getDictionaryPage());
  ValidatingConverter converter = new ValidatingConverter();
  ColumnReader columnReader = new ColumnReaderImpl(col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
  for (int i = 0; i < rows; i++) {
    assertEquals(0, columnReader.getCurrentRepetitionLevel());
    assertEquals(0, columnReader.getCurrentDefinitionLevel());
    columnReader.consume();
  }
  assertEquals(0, converter.count);
}
 
Example 6
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0
@Test
public void testWriteReadStatisticsAllNulls() throws Exception {
  // this test assumes statistics will be read
  Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY));

  File testFile = temp.newFile();
  testFile.delete();

  writeSchema = "message example {\n" +
          "required binary content (UTF8);\n" +
          "}";

  Path path = new Path(testFile.toURI());

  MessageType schema = MessageTypeParser.parseMessageType(writeSchema);
  Configuration configuration = new Configuration();
  configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
  GroupWriteSupport.setSchema(schema, configuration);

  ParquetWriter<Group> writer = new ParquetWriter<Group>(path, configuration, new GroupWriteSupport());

  Group r1 = new SimpleGroup(schema);
  writer.write(r1);
  writer.close();

  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);

  // assert the statistics object is not empty
  org.apache.parquet.column.statistics.Statistics stats = readFooter.getBlocks().get(0).getColumns().get(0).getStatistics();
  assertFalse("is empty: " + stats, stats.isEmpty());
  // assert the number of nulls are correct for the first block
  assertEquals("nulls: " + stats, 1, stats.getNumNulls());
}
 
Example 7
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0
@Test
public void testBloomFilterWriteRead() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
  File testFile = temp.newFile();
  testFile.delete();
  Path path = new Path(testFile.toURI());
  Configuration configuration = new Configuration();
  configuration.set("parquet.bloom.filter.column.names", "foo");
  String[] colPath = {"foo"};
  ColumnDescriptor col = schema.getColumnDescription(colPath);
  BinaryStatistics stats1 = new BinaryStatistics();
  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  w.startBlock(3);
  // CODEC and BYTES1 are constants defined elsewhere in the test class
  w.startColumn(col, 5, CODEC);
  w.writeDataPage(2, 4, BytesInput.from(BYTES1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  BloomFilter blockSplitBloomFilter = new BlockSplitBloomFilter(0);
  blockSplitBloomFilter.insertHash(blockSplitBloomFilter.hash(Binary.fromString("hello")));
  blockSplitBloomFilter.insertHash(blockSplitBloomFilter.hash(Binary.fromString("world")));
  w.addBloomFilter("foo", blockSplitBloomFilter);
  w.endBlock();
  w.end(new HashMap<>());
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
  ParquetFileReader r = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path,
    Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(schema.getColumnDescription(colPath)));
  BloomFilterReader bloomFilterReader = r.getBloomFilterDataReader(readFooter.getBlocks().get(0));
  BloomFilter bloomFilter = bloomFilterReader.readBloomFilter(readFooter.getBlocks().get(0).getColumns().get(0));
  assertTrue(bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("hello"))));
  assertTrue(bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("world"))));
}
 
Example 8
Source File: TestInputFormat.java    From parquet-mr with Apache License 2.0
private void createParquetFile(File file) throws IOException {
  Path path = new Path(file.toURI());
  Configuration configuration = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;}}");
  String[] columnPath = {"a", "b"};
  ColumnDescriptor c1 = schema.getColumnDescription(columnPath);

  byte[] bytes1 = { 0, 1, 2, 3};
  byte[] bytes2 = { 2, 3, 4, 5};
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;

  BinaryStatistics stats = new BinaryStatistics();

  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  w.startBlock(3);
  w.startColumn(c1, 5, codec);
  w.writeDataPage(2, 4, BytesInput.from(bytes1), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(bytes1), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.startBlock(4);
  w.startColumn(c1, 7, codec);
  w.writeDataPage(7, 4, BytesInput.from(bytes2), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.end(new HashMap<String, String>());
}
 
Example 9
Source File: TestInputFormat.java    From parquet-mr with Apache License 2.0
@Before
public void setUp() {
  blocks = new ArrayList<BlockMetaData>();
  for (int i = 0; i < 10; i++) {
    blocks.add(newBlock(i * 10, 10));
  }
  schema = MessageTypeParser.parseMessageType("message doc { required binary foo; }");
  fileMetaData = new FileMetaData(schema, new HashMap<String, String>(), "parquet-mr");
}
 
Example 10
Source File: TestParquetParser.java    From parquet-mr with Apache License 2.0
@Test
public void testIntAnnotations() {
  String message = "message IntMessage {" +
      "  required int32 i8 (INT_8);" +
      "  required int32 i16 (INT_16);" +
      "  required int32 i32 (INT_32);" +
      "  required int64 i64 (INT_64);" +
      "  required int32 u8 (UINT_8);" +
      "  required int32 u16 (UINT_16);" +
      "  required int32 u32 (UINT_32);" +
      "  required int64 u64 (UINT_64);" +
      "}\n";

  MessageType parsed = MessageTypeParser.parseMessageType(message);
  MessageType expected = Types.buildMessage()
      .required(INT32).as(INT_8).named("i8")
      .required(INT32).as(INT_16).named("i16")
      .required(INT32).as(INT_32).named("i32")
      .required(INT64).as(INT_64).named("i64")
      .required(INT32).as(UINT_8).named("u8")
      .required(INT32).as(UINT_16).named("u16")
      .required(INT32).as(UINT_32).named("u32")
      .required(INT64).as(UINT_64).named("u64")
      .named("IntMessage");

  assertEquals(expected, parsed);
  MessageType reparsed = MessageTypeParser.parseMessageType(parsed.toString());
  assertEquals(expected, reparsed);
}
 
Example 11
Source File: TestParquetParser.java    From parquet-mr with Apache License 2.0
@Test
public void testTimeAnnotations() {
  String message = "message TimeMessage {" +
      "  required int32 date (DATE);" +
      "  required int32 time (TIME_MILLIS);" +
      "  required int64 timestamp (TIMESTAMP_MILLIS);" +
      "  required FIXED_LEN_BYTE_ARRAY(12) interval (INTERVAL);" +
      "  required int32 newTime (TIME(MILLIS,true));" +
      "  required int64 nanoTime (TIME(NANOS,true));" +
      "  required int64 newTimestamp (TIMESTAMP(MILLIS,false));" +
      "  required int64 nanoTimestamp (TIMESTAMP(NANOS,false));" +
      "}\n";

  MessageType parsed = MessageTypeParser.parseMessageType(message);
  MessageType expected = Types.buildMessage()
      .required(INT32).as(DATE).named("date")
      .required(INT32).as(TIME_MILLIS).named("time")
      .required(INT64).as(TIMESTAMP_MILLIS).named("timestamp")
      .required(FIXED_LEN_BYTE_ARRAY).length(12).as(INTERVAL).named("interval")
      .required(INT32).as(timeType(true, MILLIS)).named("newTime")
      .required(INT64).as(timeType(true, NANOS)).named("nanoTime")
      .required(INT64).as(timestampType(false, MILLIS)).named("newTimestamp")
      .required(INT64).as(timestampType(false, NANOS)).named("nanoTimestamp")
    .named("TimeMessage");

  assertEquals(expected, parsed);
  MessageType reparsed = MessageTypeParser.parseMessageType(parsed.toString());
  assertEquals(expected, reparsed);
}
 
Example 12
Source File: TestSchemaConverter.java    From tajo with Apache License 2.0
private void testTajoToParquetConversion(
    Schema tajoSchema, String schemaString) throws Exception {
  TajoSchemaConverter converter = new TajoSchemaConverter();
  MessageType schema = converter.convert(tajoSchema);
  MessageType expected = MessageTypeParser.parseMessageType(schemaString);
  assertEquals("converting " + schema + " to " + schemaString,
               expected.toString(), schema.toString());
}
 
Example 13
Source File: ParquetFilePOJOReaderTest.java    From attic-apex-malhar with Apache License 2.0
private static void writeParquetFile(String rawSchema, File outputParquetFile, List<EventRecord> data)
  throws IOException
{
  Path path = new Path(outputParquetFile.toURI());
  MessageType schema = MessageTypeParser.parseMessageType(rawSchema);
  ParquetPOJOWriter writer = new ParquetPOJOWriter(path, schema, EventRecord.class, true);
  for (EventRecord eventRecord : data) {
    writer.write(eventRecord);
  }
  writer.close();
}
 
Example 14
Source File: GroupReadSupportTest.java    From parquet-mr with Apache License 2.0
@Test
public void testInitWithoutSpecifyingRequestSchema() throws Exception {
  GroupReadSupport s = new GroupReadSupport();
  Configuration configuration = new Configuration();
  Map<String, String> keyValueMetaData = new HashMap<String, String>();
  // fullSchemaStr is a schema string defined elsewhere in the test class
  MessageType fileSchema = MessageTypeParser.parseMessageType(fullSchemaStr);

  ReadSupport.ReadContext context = s.init(configuration, keyValueMetaData, fileSchema);
  assertEquals(context.getRequestedSchema(), fileSchema);
}
 
Example 15
Source File: HDFSFactorys.java    From sylph with Apache License 2.0
@Override
public HDFSFactory getOrCreate()
{
    requireNonNull(schema, "schema is null");
    requireNonNull(tableName, "tableName must be provided, e.g. xxx_log");
    requireNonNull(sinkConfig.getWriteDir(), "writeTableDir must be provided, e.g. hdfs:///tmp/hive/xxx_log");

    String schemaString = buildSchema(schema.getFields());
    MessageType type = MessageTypeParser.parseMessageType(schemaString);
    return new ParquetFactory(sinkConfig.getWriteDir(), tableName, parquetVersion, type);
}
 
Example 16
Source File: TestAvroSchemaConverter.java    From parquet-mr with Apache License 2.0
private void testRoundTripConversion(
    Configuration conf, Schema avroSchema, String schemaString)
    throws Exception {
  AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter(conf);
  MessageType schema = avroSchemaConverter.convert(avroSchema);
  MessageType expectedMT = MessageTypeParser.parseMessageType(schemaString);
  assertEquals("converting " + schema + " to " + schemaString, expectedMT.toString(),
      schema.toString());
  Schema convertedAvroSchema = avroSchemaConverter.convert(expectedMT);
  assertEquals("converting " + expectedMT + " to " + avroSchema.toString(true),
      avroSchema.toString(), convertedAvroSchema.toString());
}
 
Example 17
Source File: TestColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0
@Test
public void test() throws Exception {
  Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
  Path root = file.getParent();
  FileSystem fs = file.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  Encoding dataEncoding = PLAIN;
  int valueCount = 10;
  int d = 1;
  int r = 2;
  int v = 3;
  BytesInput definitionLevels = BytesInput.fromInt(d);
  BytesInput repetitionLevels = BytesInput.fromInt(r);
  Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary"))
      .build();
  BytesInput data = BytesInput.fromInt(v);
  int rowCount = 5;
  int nullCount = 1;
  statistics.incrementNumNulls(nullCount);
  statistics.setMinMaxFromBytes(new byte[] {0, 1, 2}, new byte[] {0, 1, 2, 3});
  long pageOffset;
  long pageSize;

  {
    OutputFileForTesting outputFile = new OutputFileForTesting(file, conf);
    ParquetFileWriter writer = new ParquetFileWriter(outputFile, schema, Mode.CREATE,
        ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.MAX_PADDING_SIZE_DEFAULT);
    writer.start();
    writer.startBlock(rowCount);
    pageOffset = outputFile.out().getPos();
    {
      ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema,
          new HeapByteBufferAllocator(), Integer.MAX_VALUE);
      PageWriter pageWriter = store.getPageWriter(col);
      pageWriter.writePageV2(
          rowCount, nullCount, valueCount,
          repetitionLevels, definitionLevels,
          dataEncoding, data,
          statistics);
      store.flushToFileWriter(writer);
      pageSize = outputFile.out().getPos() - pageOffset;
    }
    writer.endBlock();
    writer.end(new HashMap<String, String>());
  }

  {
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(
        conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
    PageReadStore rowGroup = reader.readNextRowGroup();
    PageReader pageReader = rowGroup.getPageReader(col);
    DataPageV2 page = (DataPageV2)pageReader.readPage();
    assertEquals(rowCount, page.getRowCount());
    assertEquals(nullCount, page.getNullCount());
    assertEquals(valueCount, page.getValueCount());
    assertEquals(d, intValue(page.getDefinitionLevels()));
    assertEquals(r, intValue(page.getRepetitionLevels()));
    assertEquals(dataEncoding, page.getDataEncoding());
    assertEquals(v, intValue(page.getData()));

    // Checking column/offset indexes for the one page
    ColumnChunkMetaData column = footer.getBlocks().get(0).getColumns().get(0);
    ColumnIndex columnIndex = reader.readColumnIndex(column);
    assertArrayEquals(statistics.getMinBytes(), columnIndex.getMinValues().get(0).array());
    assertArrayEquals(statistics.getMaxBytes(), columnIndex.getMaxValues().get(0).array());
    assertEquals(statistics.getNumNulls(), columnIndex.getNullCounts().get(0).longValue());
    assertFalse(columnIndex.getNullPages().get(0));
    OffsetIndex offsetIndex = reader.readOffsetIndex(column);
    assertEquals(1, offsetIndex.getPageCount());
    assertEquals(pageSize, offsetIndex.getCompressedPageSize(0));
    assertEquals(0, offsetIndex.getFirstRowIndex(0));
    assertEquals(pageOffset, offsetIndex.getOffset(0));

    reader.close();
  }
}
 
Example 18
Source File: GroupWriteSupport.java    From iow-hadoop-streaming with Apache License 2.0
public static MessageType getSchema(Configuration configuration) {
  return MessageTypeParser.parseMessageType(configuration.get(PARQUET_EXAMPLE_SCHEMA));
}
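
Helpers like getSchema above assume the schema's textual form was stored in the Configuration earlier, typically by a matching setter. A minimal sketch of that counterpart, reusing the PARQUET_EXAMPLE_SCHEMA key read above (the setter itself is an assumption, not shown in the source file):

public static void setSchema(MessageType schema, Configuration configuration) {
  // Store schema.toString() under the same key that getSchema reads,
  // so parseMessageType() can recover the MessageType later.
  configuration.set(PARQUET_EXAMPLE_SCHEMA, schema.toString());
}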
 
Example 19
Source File: TupleWriteSupport.java    From hadoop-etl-udfs with MIT License
public static MessageType getSchema(Configuration configuration) {
    return MessageTypeParser.parseMessageType(configuration.get(PARQUET_SCHEMA_PROPERTY_NAME));
}
 
Example 20
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0
@Test
public void testMetaDataFile() throws Exception {

  File testDir = temp.newFolder();

  Path testDirPath = new Path(testDir.toURI());
  Configuration configuration = new Configuration();

  final FileSystem fs = testDirPath.getFileSystem(configuration);
  enforceEmptyDir(configuration, testDirPath);

  MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;} required group c { required int64 d; }}");
  createFile(configuration, new Path(testDirPath, "part0"), schema);
  createFile(configuration, new Path(testDirPath, "part1"), schema);
  createFile(configuration, new Path(testDirPath, "part2"), schema);

  FileStatus outputStatus = fs.getFileStatus(testDirPath);
  List<Footer> footers = ParquetFileReader.readFooters(configuration, outputStatus, false);
  validateFooters(footers);
  ParquetFileWriter.writeMetadataFile(configuration, testDirPath, footers, JobSummaryLevel.ALL);

  footers = ParquetFileReader.readFooters(configuration, outputStatus, false);
  validateFooters(footers);
  footers = ParquetFileReader.readFooters(configuration, fs.getFileStatus(new Path(testDirPath, "part0")), false);
  assertEquals(1, footers.size());

  final FileStatus metadataFile = fs.getFileStatus(new Path(testDirPath, ParquetFileWriter.PARQUET_METADATA_FILE));
  final FileStatus metadataFileLight = fs.getFileStatus(new Path(testDirPath, ParquetFileWriter.PARQUET_COMMON_METADATA_FILE));
  final List<Footer> metadata = ParquetFileReader.readSummaryFile(configuration, metadataFile);

  validateFooters(metadata);

  footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(configuration, Arrays.asList(fs.listStatus(testDirPath, HiddenFileFilter.INSTANCE)), false);
  validateFooters(footers);

  fs.delete(metadataFile.getPath(), false);
  fs.delete(metadataFileLight.getPath(), false);

  footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(configuration, Arrays.asList(fs.listStatus(testDirPath)), false);
  validateFooters(footers);

}