org.apache.parquet.hadoop.example.GroupReadSupport Java Examples

The following examples show how to use org.apache.parquet.hadoop.example.GroupReadSupport. Each example is taken from an open source project; the originating project and source file are noted in the header above the code.
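GroupReadSupport is the ReadSupport implementation for Parquet's example object model: it materializes each record as a Group. As a quick orientation before the individual examples, here is a minimal sketch of the most common pattern, building a ParquetReader with GroupReadSupport and reading until read() returns null. The file path below is a placeholder, not taken from any example on this page.

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class GroupReadSupportSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder path; point this at an existing Parquet file.
    Path path = new Path("/tmp/data.parquet");

    // GroupReadSupport materializes each record as a Group from the example object model.
    try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path).build()) {
      Group group;
      while ((group = reader.read()) != null) { // read() returns null once the file is exhausted
        System.out.println(group); // Group.toString() prints field names and values
      }
    }
  }
}

The builder also accepts configuration and filtering options such as withConf(...) and withFilter(...), as the examples below demonstrate.
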
Example #1
Source File: TestFiltersWithMissingColumns.java    From parquet-mr with Apache License 2.0
public static long countFilteredRecords(Path path, FilterPredicate pred) throws IOException {
  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), path)
      .withFilter(FilterCompat.get(pred))
      .build();

  long count = 0;
  try {
    while (reader.read() != null) {
      count += 1;
    }
  } finally {
    reader.close();
  }
  return count;
}
 
Example #2
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0
private void validateColumns(String inputFile, List<String> prunePaths) throws IOException {
  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), new Path(inputFile)).withConf(conf).build();
  for (int i = 0; i < numRecord; i++) {
    Group group = reader.read();
    if (!prunePaths.contains("DocId")) {
      assertEquals(1l, group.getLong("DocId", 0));
    }
    if (!prunePaths.contains("Name")) {
      assertEquals("foo", group.getBinary("Name", 0).toStringUsingUTF8());
    }
    if (!prunePaths.contains("Gender")) {
      assertEquals("male", group.getBinary("Gender", 0).toStringUsingUTF8());
    }
    if (!prunePaths.contains("Links")) {
      Group subGroup = group.getGroup("Links", 0);
      if (!prunePaths.contains("Links.Backward")) {
        assertEquals(2l, subGroup.getLong("Backward", 0));
      }
      if (!prunePaths.contains("Links.Forward")) {
        assertEquals(3l, subGroup.getLong("Forward", 0));
      }
    }
  }
  reader.close();
}
 
Example #3
Source File: PageChecksumReadBenchmarks.java    From parquet-mr with Apache License 2.0
private void readFile(Path file, int nRows, boolean verifyChecksums, Blackhole blackhole)
  throws IOException {
  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
      .withConf(configuration)
      .usePageChecksumVerification(verifyChecksums)
      .build()) {
    for (int i = 0; i < nRows; i++) {
      Group group = reader.read();
      blackhole.consume(group.getLong("long_field", 0));
      blackhole.consume(group.getBinary("binary_field", 0));
      Group subgroup = group.getGroup("group", 0);
      blackhole.consume(subgroup.getInteger("int_field", 0));
      blackhole.consume(subgroup.getInteger("int_field", 1));
      blackhole.consume(subgroup.getInteger("int_field", 2));
      blackhole.consume(subgroup.getInteger("int_field", 3));
    }
  }
}
 
Example #4
Source File: ReadBenchmarks.java    From parquet-mr with Apache License 2.0
private void read(Path parquetFile, int nRows, Blackhole blackhole) throws IOException
{
  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), parquetFile).withConf(configuration).build();
  for (int i = 0; i < nRows; i++) {
    Group group = reader.read();
    blackhole.consume(group.getBinary("binary_field", 0));
    blackhole.consume(group.getInteger("int32_field", 0));
    blackhole.consume(group.getLong("int64_field", 0));
    blackhole.consume(group.getBoolean("boolean_field", 0));
    blackhole.consume(group.getFloat("float_field", 0));
    blackhole.consume(group.getDouble("double_field", 0));
    blackhole.consume(group.getBinary("flba_field", 0));
    blackhole.consume(group.getInt96("int96_field", 0));
  }
  reader.close();
}
 
Example #5
Source File: TestThriftToParquetFileWriter.java    From parquet-mr with Apache License 2.0
private ParquetReader<Group> createRecordReader(Path parquetFilePath) throws IOException {
  Configuration configuration = new Configuration(true);

  GroupReadSupport readSupport = new GroupReadSupport();
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, parquetFilePath);
  MessageType schema = readFooter.getFileMetaData().getSchema();

  readSupport.init(configuration, null, schema);
  return new ParquetReader<Group>(parquetFilePath, readSupport);
}
 
Example #6
Source File: DeprecatedInputFormatTest.java    From parquet-mr with Apache License 2.0
private void runMapReduceJob(CompressionCodecName codec) throws IOException, ClassNotFoundException, InterruptedException {

    final FileSystem fileSystem = parquetPath.getFileSystem(conf);
    fileSystem.delete(parquetPath, true);
    fileSystem.delete(outputPath, true);
    {
      writeJob = new Job(conf, "write");
      TextInputFormat.addInputPath(writeJob, inputPath);
      writeJob.setInputFormatClass(TextInputFormat.class);
      writeJob.setNumReduceTasks(0);
      ExampleOutputFormat.setCompression(writeJob, codec);
      ExampleOutputFormat.setOutputPath(writeJob, parquetPath);
      writeJob.setOutputFormatClass(ExampleOutputFormat.class);
      writeJob.setMapperClass(ReadMapper.class);
      ExampleOutputFormat.setSchema(
              writeJob,
              MessageTypeParser.parseMessageType(
                      writeSchema));
      writeJob.submit();
      waitForJob(writeJob);
    }
    {
      jobConf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
      jobConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, GroupReadSupport.class.getCanonicalName());
      jobConf.setInputFormat(MyDeprecatedInputFormat.class);
      MyDeprecatedInputFormat.setInputPaths(jobConf, parquetPath);
      jobConf.setOutputFormat(org.apache.hadoop.mapred.TextOutputFormat.class);
      org.apache.hadoop.mapred.TextOutputFormat.setOutputPath(jobConf, outputPath);
      jobConf.setMapperClass(DeprecatedWriteMapper.class);
      jobConf.setNumReduceTasks(0);
      mapRedJob = JobClient.runJob(jobConf);
    }
  }
 
Example #7
Source File: TestColumnIndexFiltering.java    From parquet-mr with Apache License 2.0
private List<User> readUsersWithProjection(Filter filter, MessageType schema, boolean useOtherFiltering, boolean useColumnIndexFilter) throws IOException {
  return PhoneBookWriter.readUsers(ParquetReader.builder(new GroupReadSupport(), file)
      .withFilter(filter)
      .useDictionaryFilter(useOtherFiltering)
      .useStatsFilter(useOtherFiltering)
      .useRecordFilter(useOtherFiltering)
      .useColumnIndexFilter(useColumnIndexFilter)
      .set(ReadSupport.PARQUET_READ_SCHEMA, schema.toString()));
}
 
Example #8
Source File: TestColumnIndexFiltering.java    From parquet-mr with Apache License 2.0
private List<User> readUsers(Filter filter, boolean useOtherFiltering, boolean useColumnIndexFilter)
    throws IOException {
  return PhoneBookWriter.readUsers(ParquetReader.builder(new GroupReadSupport(), file)
      .withFilter(filter)
      .useDictionaryFilter(useOtherFiltering)
      .useStatsFilter(useOtherFiltering)
      .useRecordFilter(useOtherFiltering)
      .useColumnIndexFilter(useColumnIndexFilter));
}
 
Example #9
Source File: CompressionConveterTest.java    From parquet-mr with Apache License 2.0
private void validateColumns(String file, int numRecord, TestDocs testDocs) throws IOException {
  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), new Path(file)).withConf(conf).build();
  for (int i = 0; i < numRecord; i++) {
    Group group = reader.read();
    assertTrue(group.getLong("DocId", 0) == testDocs.docId[i]);
    assertArrayEquals(group.getBinary("Name", 0).getBytes(), testDocs.name[i].getBytes());
    assertArrayEquals(group.getBinary("Gender", 0).getBytes(), testDocs.gender[i].getBytes());
    Group subGroup = group.getGroup("Links", 0);
    assertArrayEquals(subGroup.getBinary("Backward", 0).getBytes(), testDocs.linkBackward[i].getBytes());
    assertArrayEquals(subGroup.getBinary("Forward", 0).getBytes(), testDocs.linkForward[i].getBytes());
  }
  reader.close();
}
 
Example #10
Source File: TestMultipleWriteRead.java    From parquet-mr with Apache License 2.0
private void validateFile(Path file, Filter filter, Stream<Group> data) throws IOException {
  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
      .withFilter(filter)
      .build()) {
    for (Iterator<Group> it = data.iterator(); it.hasNext();) {
      assertEquals(it.next().toString(), reader.read().toString());
    }
  }
}
 
Example #11
Source File: TestMultipleWriteRead.java    From parquet-mr with Apache License 2.0
private void validateFile(Path file, List<Group> data) throws IOException {
  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).build()) {
    for (Group group : data) {
      assertEquals(group.toString(), reader.read().toString());
    }
  }
}
 
Example #12
Source File: TestParquetWriter.java    From parquet-mr with Apache License 2.0
@Test
public void testNullValuesWithPageRowLimit() throws IOException {
  MessageType schema = Types.buildMessage().optionalList().optionalElement(BINARY).as(stringType()).named("str_list")
      .named("msg");
  final int recordCount = 100;
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);

  GroupFactory factory = new SimpleGroupFactory(schema);
  Group listNull = factory.newGroup();

  File file = temp.newFile();
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
      .withPageRowCountLimit(10)
      .withConf(conf)
      .build()) {
    for (int i = 0; i < recordCount; ++i) {
      writer.write(listNull);
    }
  }

  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path).build()) {
    int readRecordCount = 0;
    for (Group group = reader.read(); group != null; group = reader.read()) {
      assertEquals(listNull.toString(), group.toString());
      ++readRecordCount;
    }
    assertEquals("Number of written records should be equal to the read one", recordCount, readRecordCount);
  }
}
 
Example #13
Source File: PhoneBookWriter.java    From parquet-mr with Apache License 2.0
private static ParquetReader<Group> createReader(Path file, Filter filter) throws IOException {
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);

  return ParquetReader.builder(new GroupReadSupport(), file)
      .withConf(conf)
      .withFilter(filter)
      .build();
}
 
Example #14
Source File: TestBloomFiltering.java    From parquet-mr with Apache License 2.0
private List<PhoneBookWriter.User> readUsers(FilterPredicate filter, boolean useOtherFiltering,
                                             boolean useBloomFilter) throws IOException {
  return PhoneBookWriter.readUsers(ParquetReader.builder(new GroupReadSupport(), file)
    .withFilter(FilterCompat.get(filter))
    .useDictionaryFilter(useOtherFiltering)
    .useStatsFilter(useOtherFiltering)
    .useRecordFilter(useOtherFiltering)
    .useBloomFilter(useBloomFilter)
    .useColumnIndexFilter(useOtherFiltering));
}
 
Example #15
Source File: ParquetFileAccessor.java    From pxf with Apache License 2.0
/**
 * Opens the resource for read.
 *
 * @throws IOException if opening the resource failed
 */
@Override
public boolean openForRead() throws IOException {
    file = new Path(context.getDataSource());
    FileSplit fileSplit = HdfsUtilities.parseFileSplit(context);

    // Read the original schema from the parquet file
    MessageType originalSchema = getSchema(file, fileSplit);
    // Get a map of the column name to Types for the given schema
    Map<String, Type> originalFieldsMap = getOriginalFieldsMap(originalSchema);
    // Get the read schema. This is either the full set or a subset (in
    // case of column projection) of the greenplum schema.
    MessageType readSchema = buildReadSchema(originalFieldsMap, originalSchema);
    // Get the record filter in case of predicate push-down
    FilterCompat.Filter recordFilter = getRecordFilter(context.getFilterString(), originalFieldsMap, readSchema);

    // add column projection
    configuration.set(PARQUET_READ_SCHEMA, readSchema.toString());

    fileReader = ParquetReader.builder(new GroupReadSupport(), file)
            .withConf(configuration)
            // Create reader for a given split, read a range in file
            .withFileRange(fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength())
            .withFilter(recordFilter)
            .build();
    context.setMetadata(readSchema);
    return true;
}
 
Example #16
Source File: TestParquetWriterAppendBlocks.java    From parquet-mr with Apache License 2.0
@Test
public void testAllowDroppingColumns() throws IOException {
  MessageType droppedColumnSchema = Types.buildMessage()
      .required(BINARY).as(UTF8).named("string")
      .named("AppendTest");

  Path droppedColumnFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(
      CONF, droppedColumnSchema, droppedColumnFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  LinkedList<Group> expected = new LinkedList<Group>();
  expected.addAll(file1content);
  expected.addAll(file2content);

  ParquetMetadata footer = ParquetFileReader.readFooter(
      CONF, droppedColumnFile, NO_FILTER);
  for (BlockMetaData rowGroup : footer.getBlocks()) {
    Assert.assertEquals("Should have only the string column",
        1, rowGroup.getColumns().size());
  }

  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), droppedColumnFile)
      .build();

  Group next;
  while ((next = reader.read()) != null) {
    Group expectedNext = expected.removeFirst();
    Assert.assertEquals("Each string should match",
        expectedNext.getString("string", 0), next.getString("string", 0));
  }

  Assert.assertEquals("All records should be present", 0, expected.size());
}
 
Example #17
Source File: TestParquetWriterAppendBlocks.java    From parquet-mr with Apache License 2.0
@Test
public void testBasicBehavior() throws IOException {
  Path combinedFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(
      CONF, FILE_SCHEMA, combinedFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  LinkedList<Group> expected = new LinkedList<Group>();
  expected.addAll(file1content);
  expected.addAll(file2content);

  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), combinedFile)
      .build();

  Group next;
  while ((next = reader.read()) != null) {
    Group expectedNext = expected.removeFirst();
    // check each value; equals is not supported for simple records
    Assert.assertEquals("Each id should match",
        expectedNext.getInteger("id", 0), next.getInteger("id", 0));
    Assert.assertEquals("Each string should match",
        expectedNext.getString("string", 0), next.getString("string", 0));
  }

  Assert.assertEquals("All records should be present", 0, expected.size());
}
 
Example #18
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0
public ParquetReader.Builder<Group> createReaderBuilder() throws IOException {
  ReadConfigurator readConfigurator = getReadConfigurator();
  return readConfigurator.configureBuilder(
      new ParquetReader.Builder<Group>(HadoopInputFile.fromPath(file, new Configuration())) {
        @Override
        protected ReadSupport<Group> getReadSupport() {
          return new GroupReadSupport();
        }
      }.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, SCHEMA.toString()));
}
 
Example #19
Source File: AbstractParquetFileReader.java    From attic-apex-malhar with Apache License 2.0
/**
 * Opens the file to read using GroupReadSupport
 */
@Override
protected InputStream openFile(Path path) throws IOException
{
  InputStream is = super.openFile(path);
  GroupReadSupport readSupport = new GroupReadSupport();
  readSupport.init(configuration, null, schema);
  reader = new ParquetReader<>(path, readSupport);
  return is;
}
 
Example #20
Source File: TestConvertAvroToParquet.java    From nifi with Apache License 2.0
@Test
public void test_Data() throws Exception {


    FileInputStream fileInputStream = new FileInputStream(tmpAvro);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    int readedBytes;
    byte[] buf = new byte[1024];
    while ((readedBytes = fileInputStream.read(buf)) > 0) {
        out.write(buf, 0, readedBytes);
    }
    out.close();

    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test.avro");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToParquet.SUCCESS).get(0);

    // Save the flowfile
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream(tmpParquet);
    fos.write(resultContents);
    fos.flush();
    fos.close();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), new Path(tmpParquet.getAbsolutePath()))
                    .withConf(conf)
                    .build();

    List<Group> parquetRecords = new ArrayList<Group>();

    Group current;
    current = reader.read();
    while (current != null) {
        assertTrue(current instanceof Group);
        parquetRecords.add(current);
        current = reader.read();
    }

    Group firstRecord = parquetRecords.get(0);

    // Primitive
    assertEquals(firstRecord.getInteger("myint", 0), 1);
    assertEquals(firstRecord.getLong("mylong", 0), 2);
    assertEquals(firstRecord.getBoolean("myboolean", 0), true);
    assertEquals(firstRecord.getFloat("myfloat", 0), 3.1, 0.0001);
    assertEquals(firstRecord.getDouble("mydouble", 0), 4.1, 0.001);
    assertEquals(firstRecord.getString("mybytes", 0), "hello");
    assertEquals(firstRecord.getString("mystring", 0), "hello");

    // Nested
    assertEquals(firstRecord.getGroup("mynestedrecord",0).getInteger("mynestedint",0), 1);

    // Array
    assertEquals(firstRecord.getGroup("myarray",0).getGroup("list",0).getInteger("element", 0), 1);
    assertEquals(firstRecord.getGroup("myarray",0).getGroup("list",1).getInteger("element", 0), 2);

    // Map
    assertEquals(firstRecord.getGroup("mymap",0).getGroup("map",0).getInteger("value", 0), 1);
    assertEquals(firstRecord.getGroup("mymap",0).getGroup("map",1).getInteger("value", 0), 2);

    // Fixed
    assertEquals(firstRecord.getString("myfixed",0), "A");

}
 
Example #21
Source File: TestParquetWriter.java    From parquet-mr with Apache License 2.0
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  Path root = new Path("target/tests/TestParquetWriter/");
  enforceEmptyDir(conf, root);
  MessageType schema = parseMessageType(
      "message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(3) flba_field; "
      + "required int96 int96_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  Map<String, Encoding> expected = new HashMap<String, Encoding>();
  expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
  expected.put("1000-" + PARQUET_1_0, PLAIN);
  expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
  expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
  for (int modulo : asList(10, 1000)) {
    for (WriterVersion version : WriterVersion.values()) {
      Path file = new Path(root, version.name() + "_" + modulo);
      ParquetWriter<Group> writer = new ParquetWriter<Group>(
          file,
          new GroupWriteSupport(),
          UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
      for (int i = 0; i < 1000; i++) {
        writer.write(
            f.newGroup()
            .append("binary_field", "test" + (i % modulo))
            .append("int32_field", 32)
            .append("int64_field", 64l)
            .append("boolean_field", true)
            .append("float_field", 1.0f)
            .append("double_field", 2.0d)
            .append("flba_field", "foo")
            .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
      }
      writer.close();
      ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
      for (int i = 0; i < 1000; i++) {
        Group group = reader.read();
        assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
        assertEquals(32, group.getInteger("int32_field", 0));
        assertEquals(64l, group.getLong("int64_field", 0));
        assertEquals(true, group.getBoolean("boolean_field", 0));
        assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
        assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
        assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
        assertEquals(Binary.fromConstantByteArray(new byte[12]),
            group.getInt96("int96_field",0));
      }
      reader.close();
      ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
      for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
          if (column.getPath().toDotString().equals("binary_field")) {
            String key = modulo + "-" + version;
            Encoding expectedEncoding = expected.get(key);
            assertTrue(
                key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                column.getEncodings().contains(expectedEncoding));
          }
        }
      }
      assertEquals("Object model property should be example",
          "example", footer.getFileMetaData().getKeyValueMetaData()
              .get(ParquetWriter.OBJECT_MODEL_NAME_PROP));
    }
  }
}
 
Example #22
Source File: TestParquetWriterNewPage.java    From parquet-mr with Apache License 2.0
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  Path root = new Path("target/tests/TestParquetWriter/");
  FileSystem fs = root.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = parseMessageType(
      "message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(3) flba_field; "
      + "required int96 int96_field; "
      + "optional binary null_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  Map<String, Encoding> expected = new HashMap<String, Encoding>();
  expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
  expected.put("1000-" + PARQUET_1_0, PLAIN);
  expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
  expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
  for (int modulo : asList(10, 1000)) {
    for (WriterVersion version : WriterVersion.values()) {
      Path file = new Path(root, version.name() + "_" + modulo);
      ParquetWriter<Group> writer = new ParquetWriter<Group>(
          file,
          new GroupWriteSupport(),
          UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
      for (int i = 0; i < 1000; i++) {
        writer.write(
            f.newGroup()
            .append("binary_field", "test" + (i % modulo))
            .append("int32_field", 32)
            .append("int64_field", 64l)
            .append("boolean_field", true)
            .append("float_field", 1.0f)
            .append("double_field", 2.0d)
            .append("flba_field", "foo")
            .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
      }
      writer.close();

      ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build();
      for (int i = 0; i < 1000; i++) {
        Group group = reader.read();
        assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
        assertEquals(32, group.getInteger("int32_field", 0));
        assertEquals(64l, group.getLong("int64_field", 0));
        assertEquals(true, group.getBoolean("boolean_field", 0));
        assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
        assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
        assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
        assertEquals(Binary.fromConstantByteArray(new byte[12]), group.getInt96("int96_field",
            0));
        assertEquals(0, group.getFieldRepetitionCount("null_field"));
      }
      reader.close();
      ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
      for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
          if (column.getPath().toDotString().equals("binary_field")) {
            String key = modulo + "-" + version;
            Encoding expectedEncoding = expected.get(key);
            assertTrue(
                key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                column.getEncodings().contains(expectedEncoding));
          }
        }
      }
    }
  }
}
 
Example #23
Source File: TestInputOutputFormatWithPadding.java    From parquet-mr with Apache License 2.0
@Test
public void testBasicBehaviorWithPadding() throws Exception {
  HadoopOutputFile.getBlockFileSystems().add("file");

  File inputFile = temp.newFile();
  FileOutputStream out = new FileOutputStream(inputFile);
  out.write(FILE_CONTENT.getBytes("UTF-8"));
  out.close();

  File tempFolder = temp.newFolder();
  tempFolder.delete();
  Path tempPath = new Path(tempFolder.toURI());

  File outputFolder = temp.newFile();
  outputFolder.delete();

  Configuration conf = new Configuration();
  // May test against multiple hadoop versions
  conf.set("dfs.block.size", "1024");
  conf.set("dfs.blocksize", "1024");
  conf.set("dfs.blockSize", "1024");
  conf.set("fs.local.block.size", "1024");

  // don't use a cached FS with a different block size
  conf.set("fs.file.impl.disable.cache", "true");

  // disable summary metadata, it isn't needed
  conf.set("parquet.enable.summary-metadata", "false");
  conf.set("parquet.example.schema", PARQUET_TYPE.toString());

  {
    Job writeJob = new Job(conf, "write");
    writeJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(writeJob, new Path(inputFile.toString()));

    writeJob.setOutputFormatClass(ParquetOutputFormat.class);
    writeJob.setMapperClass(Writer.class);
    writeJob.setNumReduceTasks(0); // write directly to Parquet without reduce
    ParquetOutputFormat.setWriteSupportClass(writeJob, GroupWriteSupport.class);
    ParquetOutputFormat.setBlockSize(writeJob, 1024);
    ParquetOutputFormat.setPageSize(writeJob, 512);
    ParquetOutputFormat.setDictionaryPageSize(writeJob, 512);
    ParquetOutputFormat.setEnableDictionary(writeJob, true);
    ParquetOutputFormat.setMaxPaddingSize(writeJob, 1023); // always pad
    ParquetOutputFormat.setOutputPath(writeJob, tempPath);

    waitForJob(writeJob);
  }

  // make sure padding was added
  File parquetFile = getDataFile(tempFolder);
  ParquetMetadata footer = ParquetFileReader.readFooter(conf,
      new Path(parquetFile.toString()), ParquetMetadataConverter.NO_FILTER);
  for (BlockMetaData block : footer.getBlocks()) {
    Assert.assertTrue("Block should start at a multiple of the block size",
        block.getStartingPos() % 1024 == 0);
  }

  {
    Job readJob = new Job(conf, "read");
    readJob.setInputFormatClass(NoSplits.class);
    ParquetInputFormat.setReadSupportClass(readJob, GroupReadSupport.class);
    TextInputFormat.addInputPath(readJob, tempPath);

    readJob.setOutputFormatClass(TextOutputFormat.class);
    readJob.setMapperClass(Reader.class);
    readJob.setNumReduceTasks(0); // write directly to text without reduce
    TextOutputFormat.setOutputPath(readJob, new Path(outputFolder.toString()));

    waitForJob(readJob);
  }

  File dataFile = getDataFile(outputFolder);
  Assert.assertNotNull("Should find a data file", dataFile);

  StringBuilder contentBuilder = new StringBuilder();
  for (String line : Files.readAllLines(dataFile.toPath(), StandardCharsets.UTF_8)) {
    contentBuilder.append(line);
  }
  String reconstructed = contentBuilder.toString();
  Assert.assertEquals("Should match written file content",
      FILE_CONTENT, reconstructed);

  HadoopOutputFile.getBlockFileSystems().remove("file");
}