org.apache.parquet.hadoop.example.GroupWriteSupport Java Examples

The following examples show how to use org.apache.parquet.hadoop.example.GroupWriteSupport. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ApacheParquet.java    From sylph with Apache License 2.0 7 votes vote down vote up
/**
 * Builds a Parquet writer for {@code outputPath} together with a group factory for the
 * same schema.
 *
 * @param outputPath    location passed to the writer as a Hadoop {@code Path}
 * @param schema        message schema used by the writer and by the group factory
 * @param writerVersion writer version handed to the builder
 * @throws IOException if the builder fails to create the writer
 */
private ApacheParquet(String outputPath, MessageType schema, WriterVersion writerVersion)
        throws IOException
{
    this.outputPath = outputPath;
    this.schema = schema;
    this.groupFactory = new SimpleGroupFactory(this.schema);

    // The schema is also registered on the configuration given to the builder.
    Configuration configuration = new Configuration();
    GroupWriteSupport.setSchema(schema, configuration);

    final Path target = new Path(outputPath);
    this.writer = ExampleParquetWriter.builder(target)
            .withType(schema)
            .withConf(configuration)
            // page size and dictionary page size share the same default
            .withPageSize(DEFAULT_PAGE_SIZE)
            .withDictionaryPageSize(DEFAULT_PAGE_SIZE)
            .withDictionaryEncoding(DEFAULT_IS_DICTIONARY_ENABLED)
            .withValidation(DEFAULT_IS_VALIDATING_ENABLED)
            .withWriterVersion(writerVersion)
            // Parquet file block (row group) size
            .withRowGroupSize(DEFAULT_BLOCK_SIZE)
            // compression type
            .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
            .build();
}
 
Example #2
Source File: DeprecatedOutputFormatTest.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Configures {@code jobConf} as a map-only job with text input and
 * {@code DeprecatedParquetOutputFormat} output, runs it and stores the result in
 * {@code mapRedJob}.
 *
 * @param codec compression codec applied to the Parquet output
 */
private void runMapReduceJob(CompressionCodecName codec) throws IOException, ClassNotFoundException, InterruptedException {

    // Remove whatever is already present at both target locations.
    final FileSystem fs = parquetPath.getFileSystem(conf);
    fs.delete(parquetPath, true);
    fs.delete(outputPath, true);

    // Input side: text files, no reducers.
    jobConf.setInputFormat(TextInputFormat.class);
    TextInputFormat.addInputPath(jobConf, inputPath);
    jobConf.setNumReduceTasks(0);

    // Output side: Parquet written through GroupWriteSupport using writeSchema.
    jobConf.setOutputFormat(DeprecatedParquetOutputFormat.class);
    DeprecatedParquetOutputFormat.setCompression(jobConf, codec);
    DeprecatedParquetOutputFormat.setOutputPath(jobConf, parquetPath);
    DeprecatedParquetOutputFormat.setWriteSupportClass(jobConf, GroupWriteSupport.class);
    GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(writeSchema), jobConf);

    jobConf.setMapperClass(DeprecatedMapper.class);
    mapRedJob = JobClient.runJob(jobConf);
  }
 
Example #3
Source File: ColumnSizeCommandTest.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Writes {@code numRecord} rows of random {@code DocId}/{@code Num} values to the file
 * returned by {@code randomParquetFile()}.
 *
 * @return absolute path of the written file
 * @throws IOException if building the writer or writing a row fails
 */
private String createParquetFile() throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, INT32, "Num"));
  // The writer picks the schema up from the shared configuration.
  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = randomParquetFile().getAbsolutePath();
  ExampleParquetWriter.Builder writerBuilder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  Random random = new Random();
  try (ParquetWriter writer = writerBuilder.build()) {
    for (int row = 0; row < numRecord; row++) {
      long docId = random.nextLong();
      int num = random.nextInt();

      SimpleGroup record = new SimpleGroup(schema);
      record.add("DocId", docId);
      record.add("Num", num);
      writer.write(record);
    }
  }

  return file;
}
 
Example #4
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Writes the data produced by {@code context} into a Parquet file, runs the context's
 * checks and finally deletes {@code context.path}.
 *
 * @param context supplies the schema, the writer settings, the data and the checks
 * @throws IOException if creating, writing or closing the writer fails
 */
public static void writeAndTest(WriteContext context) throws IOException {
  // Create the configuration, and then apply the schema to our configuration.
  Configuration configuration = new Configuration();
  GroupWriteSupport.setSchema(context.schema, configuration);
  GroupWriteSupport groupWriteSupport = new GroupWriteSupport();

  // Create the writer properties
  final int blockSize = context.blockSize;
  final int pageSize = context.pageSize;
  final int dictionaryPageSize = pageSize;
  final boolean enableDictionary = context.enableDictionary;
  final boolean enableValidation = context.enableValidation;
  ParquetProperties.WriterVersion writerVersion = context.version;
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;

  // try-with-resources so the writer is closed even when context.write throws.
  try (ParquetWriter<Group> writer = new ParquetWriter<Group>(context.fsPath,
      groupWriteSupport, codec, blockSize, pageSize, dictionaryPageSize,
      enableDictionary, enableValidation, writerVersion, configuration)) {
    context.write(writer);
  }

  context.test();

  context.path.delete();
}
 
Example #5
Source File: TestZstandardCodec.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Configures {@code jobConf} as a map-only job that reads text from {@code inputPath},
 * writes Parquet to {@code parquetPath} with the given codec, and runs it.
 *
 * @param codec       compression codec applied to the Parquet output
 * @param jobConf     job configuration that is filled in and submitted
 * @param conf        configuration used to resolve the file system of {@code parquetPath}
 * @param parquetPath output location; deleted recursively before the job starts
 * @return the job handle returned by {@code JobClient.runJob}
 */
private RunningJob runMapReduceJob(CompressionCodecName codec, JobConf jobConf, Configuration conf, Path parquetPath) throws IOException, ClassNotFoundException, InterruptedException {
  final String schemaText = "message example {\n"
    + "required int32 line;\n"
    + "required binary content;\n"
    + "}";

  // Clear the output location first.
  parquetPath.getFileSystem(conf).delete(parquetPath, true);

  // Input side: text files, no reducers.
  jobConf.setInputFormat(TextInputFormat.class);
  TextInputFormat.addInputPath(jobConf, inputPath);
  jobConf.setNumReduceTasks(0);

  // Output side: Parquet through GroupWriteSupport.
  jobConf.setOutputFormat(DeprecatedParquetOutputFormat.class);
  DeprecatedParquetOutputFormat.setCompression(jobConf, codec);
  DeprecatedParquetOutputFormat.setOutputPath(jobConf, parquetPath);
  DeprecatedParquetOutputFormat.setWriteSupportClass(jobConf, GroupWriteSupport.class);
  GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(schemaText), jobConf);

  jobConf.setMapperClass(TestZstandardCodec.DumpMapper.class);
  return JobClient.runJob(jobConf);
}
 
Example #6
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a temporary Parquet file and fills it with the generated benchmark data:
 * column 0 holds the arranged long values, columns 1 to 5 hold generated strings.
 *
 * @throws IOException if the temporary file cannot be created or written
 */
@Setup
public void writeFile() throws IOException {
  WriteConfigurator writeConfigurator = getWriteConfigurator();
  String prefix = "benchmark-filtering_" + characteristic + '_' + writeConfigurator + '_';
  file = new Path(Files.createTempFile(prefix, ".parquet").toAbsolutePath().toString());

  long[] data = generateData();
  characteristic.arrangeData(data);

  try (ParquetWriter<Group> writer = writeConfigurator.configureBuilder(ExampleParquetWriter.builder(file)
      .config(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, SCHEMA.toString())
      .withRowGroupSize(Integer.MAX_VALUE) // Ensure to have one row-group per file only
      .withWriteMode(OVERWRITE))
      .build()) {
    for (long value : data) {
      Group row = new SimpleGroup(SCHEMA);
      row.add(0, value);
      // Columns 1..5 each receive their own freshly generated string.
      for (int column = 1; column <= 5; column++) {
        row.add(column, Binary.fromString(dummyGenerator.nextString()));
      }
      writer.write(row);
    }
  }
}
 
Example #7
Source File: SqlInterpreterTest.java    From zeppelin with Apache License 2.0 6 votes vote down vote up
/**
 * Writes one row per entry of {@code values} into a Parquet file with a single
 * required {@code int32_field} column.
 *
 * @param values  values to write, one row each
 * @param version Parquet writer version to use
 * @return the written file
 * @throws IOException if the temp file cannot be created or the write fails
 */
public File createParquetFile(int[] values,
                              ParquetProperties.WriterVersion version) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".par");
  // Only the unique name is needed; the writer creates the file itself.
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType(
          "message test { "
                  + "required int32 int32_field; "
                  + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  // try-with-resources so the writer is closed even when a write fails.
  try (ParquetWriter<Group> writer = new ParquetWriter<Group>(
          path,
          new GroupWriteSupport(),
          CompressionCodecName.UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf)) {
    for (int value : values) {
      writer.write(f.newGroup()
              .append("int32_field", value));
    }
  }
  return file;
}
 
Example #8
Source File: TestColumnSizeCommand.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a Parquet file with {@code numRecord} rows; each row carries a random
 * {@code DocId} (int64) and a random {@code Num} (int32).
 *
 * @return absolute path of the file obtained from {@code randomParquetFile()}
 * @throws IOException if building the writer or writing a row fails
 */
private String createParquetFile() throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, INT32, "Num"));

  // Publish the schema through the configuration used by the writer.
  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = randomParquetFile().getAbsolutePath();
  ExampleParquetWriter.Builder parquetBuilder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  Random generator = new Random();
  try (ParquetWriter writer = parquetBuilder.build()) {
    int index = 0;
    while (index < numRecord) {
      SimpleGroup group = new SimpleGroup(schema);
      group.add("DocId", generator.nextLong());
      group.add("Num", generator.nextInt());
      writer.write(group);
      index++;
    }
  }

  return file;
}
 
Example #9
Source File: CompressionConveterTest.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes the first {@code numRecord} documents of {@code testDocs} into a Parquet file
 * with columns DocId, Name, Gender and an optional Links group (Backward, Forward).
 *
 * @param conf          configuration that receives the schema and is passed to the writer
 * @param extraMeta     extra key/value metadata stored by the writer
 * @param numRecord     number of rows to write
 * @param prefix        prefix passed to {@code createTempFile}
 * @param codec         name of a {@code CompressionCodecName} constant
 * @param writerVersion Parquet writer version
 * @param pageSize      page size handed to the writer
 * @param testDocs      source of the column values
 * @return the path returned by {@code createTempFile(prefix)}
 * @throws IOException if building the writer or writing a row fails
 */
private String createParquetFile(Configuration conf, Map<String, String> extraMeta, int numRecord, String prefix, String codec,
                                       ParquetProperties.WriterVersion writerVersion, int pageSize, TestDocs testDocs) throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, BINARY, "Backward"),
      new PrimitiveType(REPEATED, BINARY, "Forward")));
  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder writerBuilder = ExampleParquetWriter.builder(new Path(file))
    .withConf(conf)
    .withWriterVersion(writerVersion)
    .withExtraMetaData(extraMeta)
    .withDictionaryEncoding("DocId", true)
    .withValidation(true)
    .enablePageWriteChecksum()
    .withPageSize(pageSize)
    .withCompressionCodec(CompressionCodecName.valueOf(codec));

  try (ParquetWriter writer = writerBuilder.build()) {
    for (int row = 0; row < numRecord; row++) {
      SimpleGroup doc = new SimpleGroup(schema);
      doc.add("DocId", testDocs.docId[row]);
      doc.add("Name", testDocs.name[row]);
      doc.add("Gender", testDocs.gender[row]);

      // Field 0 of Links is Backward, field 1 is Forward.
      Group links = doc.addGroup("Links");
      links.add(0, testDocs.linkBackward[row]);
      links.add(1, testDocs.linkForward[row]);

      writer.write(doc);
    }
  }

  return file;
}
 
Example #10
Source File: ParquetFileTest.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes ten rows to the file returned by {@code parquetFile()}, one value per field
 * of the schema from {@code createSchema()}; numeric fields grow with the row index.
 *
 * @throws IOException if creating the writer or writing a row fails
 */
private void createTestParquetFile() throws IOException {
  Configuration conf = new Configuration();
  MessageType schema = createSchema();
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory groups = new SimpleGroupFactory(schema);

  Path fsPath = new Path(parquetFile().getPath());

  try (
    ParquetWriter<Group> writer = new ParquetWriter<>(
      fsPath,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      conf)) {
    for (int row = 0; row < 10; row++) {
      // 12 random bytes for the fixed-length field of this row.
      final byte[] fixed = new byte[12];
      ThreadLocalRandom.current().nextBytes(fixed);

      Group record = groups.newGroup()
       .append(INT32_FIELD, 32 + row)
       .append(INT64_FIELD, 64L + row)
       .append(FLOAT_FIELD, 1.0f + row)
       .append(DOUBLE_FIELD, 2.0d + row)
       .append(BINARY_FIELD, Binary.fromString(COLORS[row % COLORS.length]))
       .append(FIXED_LEN_BYTE_ARRAY_FIELD, Binary.fromConstantByteArray(fixed));
      writer.write(record);
    }
  }
}
 
Example #11
Source File: PhoneBookWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Reads every record available from the reader built by {@code builder} and converts
 * each one with {@code userFromGroup}.
 *
 * @param builder reader builder; the example schema is set on it before building
 * @return the users in the order they were read
 * @throws IOException if building, reading or closing the reader fails
 */
public static List<User> readUsers(ParquetReader.Builder<Group> builder) throws IOException {
  List<User> users = new ArrayList<>();
  // try-with-resources: the reader was previously never closed.
  try (ParquetReader<Group> reader = builder.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString()).build()) {
    for (Group group = reader.read(); group != null; group = reader.read()) {
      users.add(userFromGroup(group));
    }
  }
  return users;
}
 
Example #12
Source File: PhoneBookWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a group reader for {@code file} that applies {@code filter}.
 *
 * @param file   Parquet file to read
 * @param filter filter passed to the reader builder
 * @return the reader; the caller is responsible for closing it
 * @throws IOException if the reader cannot be built
 */
private static ParquetReader<Group> createReader(Path file, Filter filter) throws IOException {
  Configuration readConf = new Configuration();
  GroupWriteSupport.setSchema(schema, readConf);

  ParquetReader.Builder<Group> readerBuilder = ParquetReader.builder(new GroupReadSupport(), file);
  readerBuilder = readerBuilder.withConf(readConf);
  readerBuilder = readerBuilder.withFilter(filter);
  return readerBuilder.build();
}
 
Example #13
Source File: PhoneBookWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes {@code users} through a writer built from {@code builder}, converting each
 * user with {@code groupFromUser}. The writer is closed before returning.
 *
 * @param builder writer builder; the example schema is set on it before building
 * @param users   users to write, in iteration order
 * @throws IOException if building, writing or closing the writer fails
 */
public static void write(ParquetWriter.Builder<Group, ?> builder, List<User> users) throws IOException {
  final String schemaText = schema.toString();
  builder.config(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schemaText);

  try (ParquetWriter<Group> writer = builder.build()) {
    for (User user : users) {
      Group record = groupFromUser(user);
      writer.write(record);
    }
  }
}
 
Example #14
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a GZIP-compressed, dictionary-enabled writer for {@code file} and hands it to
 * {@code writeData} together with a group factory for the shared schema.
 *
 * @param version Parquet writer version
 * @param file    target file
 * @throws IOException if the writer cannot be built or {@code writeData} fails
 */
private static void prepareFile(WriterVersion version, Path file) throws IOException {
  GroupWriteSupport.setSchema(schema, conf);

  ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
      .withWriterVersion(version)
      .withCompressionCodec(GZIP)
      .withRowGroupSize(1024*1024)   // 1 MiB row groups
      .withPageSize(1024)            // 1 KiB pages
      .enableDictionaryEncoding()
      .withDictionaryPageSize(2*1024)
      .withConf(conf)
      .build();

  // NOTE(review): the writer is not closed here; presumably writeData closes it - confirm.
  writeData(new SimpleGroupFactory(schema), writer);
}
 
Example #15
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a set of values to a parquet file as a single required column named
 * {@code field}. Dictionary encoding is switched on or off through
 * {@code enableDictionary}.
 *
 * @param file             target file
 * @param type             primitive type of the column; FIXED_LEN_BYTE_ARRAY uses {@code FIXED_LENGTH}
 * @param values           values to write; each must be castable to the Java type matching {@code type}
 * @param rowGroupSize     row group size handed to the writer
 * @param pageSize         page size handed to the writer
 * @param enableDictionary whether dictionary encoding is enabled
 * @param version          Parquet writer version
 * @throws IOException if building, writing or closing the writer fails
 * @throws IllegalArgumentException if {@code type} is not handled by the switch
 */
private void writeValuesToFile(Path file, PrimitiveTypeName type, List<?> values, int rowGroupSize, int pageSize, boolean enableDictionary, WriterVersion version) throws IOException {
  MessageType schema;
  if (type == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    schema = Types.buildMessage().required(type).length(FIXED_LENGTH).named("field").named("test");
  } else {
    schema = Types.buildMessage().required(type).named("field").named("test");
  }

  SimpleGroupFactory message = new SimpleGroupFactory(schema);
  GroupWriteSupport.setSchema(schema, configuration);

  // try-with-resources so the writer is closed even when a cast or write fails.
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
      .withCompressionCodec(compression)
      .withRowGroupSize(rowGroupSize)
      .withPageSize(pageSize)
      .withDictionaryPageSize(TEST_DICT_PAGE_SIZE)
      .withDictionaryEncoding(enableDictionary)
      .withWriterVersion(version)
      .withConf(configuration)
      .build()) {

    for (Object o: values) {
      switch (type) {
        case BOOLEAN:
          writer.write(message.newGroup().append("field", (Boolean)o));
        break;
        case INT32:
          writer.write(message.newGroup().append("field", (Integer)o));
        break;
        case INT64:
          writer.write(message.newGroup().append("field", (Long)o));
        break;
        case FLOAT:
          writer.write(message.newGroup().append("field", (Float)o));
        break;
        case DOUBLE:
          writer.write(message.newGroup().append("field", (Double)o));
        break;
        // All three binary-backed types are supplied as Binary values.
        case INT96:
        case BINARY:
        case FIXED_LEN_BYTE_ARRAY:
          writer.write(message.newGroup().append("field", (Binary)o));
        break;
        default:
          throw new IllegalArgumentException("Unknown type name: " + type);
      }
    }
  }
}
 
Example #16
Source File: TestMemoryManager.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Prepares the output format and configuration for each test: the expected pool size
 * is the maximum heap size times {@code MemoryManager.DEFAULT_MEMORY_POOL_RATIO}, and
 * the configured block size is half of that.
 */
@Before
public void setUp() throws Exception {
  parquetOutputFormat = new ParquetOutputFormat(new GroupWriteSupport());
  GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(writeSchema), conf);

  long maxHeap = ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getMax();
  expectedPoolSize = Math.round((double) maxHeap * MemoryManager.DEFAULT_MEMORY_POOL_RATIO);

  long halfPool = expectedPoolSize / 2;
  conf.setLong(ParquetOutputFormat.BLOCK_SIZE, halfPool);

  // the memory manager is not initialized until a writer is created
  createWriter(0).close(null);
}
 
Example #17
Source File: TestMergeMetadataFiles.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes 1000 rows to {@code out}. The schema registered on {@code conf} is
 * {@code schema2} when {@code useSchema2} is set, otherwise {@code schema}; the extra
 * metadata key {@code schema_num} records which one was used.
 *
 * @param out        target file
 * @param conf       configuration that receives the schema and is passed to the writer
 * @param useSchema2 selects {@code schema2}; in that case no {@code int96_field} is appended
 * @throws IOException if building, writing or closing the writer fails
 */
private static void writeFile(File out, Configuration conf, boolean useSchema2) throws IOException {
  if (!useSchema2) {
    GroupWriteSupport.setSchema(schema, conf);
  } else {
    GroupWriteSupport.setSchema(schema2, conf);
  }
  // NOTE(review): groups are always created from `schema`, even when schema2 is written - confirm intended.
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  Map<String, String> extraMetaData = new HashMap<String, String>();
  extraMetaData.put("schema_num", useSchema2 ? "2" : "1" );

  // try-with-resources so the writer is closed even when a write fails.
  try (ParquetWriter<Group> writer = ExampleParquetWriter
      .builder(new Path(out.getAbsolutePath()))
      .withConf(conf)
      .withExtraMetaData(extraMetaData)
      .build()) {

    for (int i = 0; i < 1000; i++) {
      Group g = f.newGroup()
          .append("binary_field", "test" + i)
          .append("int32_field", i)
          .append("int64_field", (long) i)
          .append("boolean_field", i % 2 == 0)
          .append("float_field", (float) i)
          .append("double_field", (double)i)
          .append("flba_field", "foo");

      if (!useSchema2) {
        g = g.append("int96_field", Binary.fromConstantByteArray(new byte[12]));
      }

      writer.write(g);
    }
  }
}
 
Example #18
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a single empty group and checks that the statistics read back from the
 * footer are non-empty and report exactly one null.
 */
@Test
public void testWriteReadStatisticsAllNulls() throws Exception {
  // this test assumes statistics will be read
  Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY));

  File testFile = temp.newFile();
  // Only the unique name is needed; the writer creates the file itself.
  testFile.delete();

  writeSchema = "message example {\n" +
          "required binary content (UTF8);\n" +
          "}";

  Path path = new Path(testFile.toURI());

  MessageType schema = MessageTypeParser.parseMessageType(writeSchema);
  Configuration configuration = new Configuration();
  configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
  GroupWriteSupport.setSchema(schema, configuration);

  // try-with-resources so the writer is closed even when the write fails.
  try (ParquetWriter<Group> writer = new ParquetWriter<Group>(path, configuration, new GroupWriteSupport())) {
    Group r1 = new SimpleGroup(schema);
    writer.write(r1);
  }

  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);

  // assert the statistics object is not empty
  org.apache.parquet.column.statistics.Statistics stats = readFooter.getBlocks().get(0).getColumns().get(0).getStatistics();
  assertFalse("is empty: " + stats, stats.isEmpty());
  // assert the number of nulls are correct for the first block
  assertEquals("nulls: " + stats, 1, stats.getNumNulls());
}
 
Example #19
Source File: TestSimpleRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a single row, with one fixed value per field of the schema from
 * {@code createSchema()}, to the file returned by {@code testFile()}.
 *
 * @throws IOException if creating the writer or writing the row fails
 */
private void createTestParquetFile() throws IOException {
  Configuration conf = new Configuration();
  MessageType schema = createSchema();
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory groups = new SimpleGroupFactory(schema);

  Path fsPath = new Path(testFile().getPath());

  try (
    ParquetWriter<Group> writer = new ParquetWriter<>(
      fsPath,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      conf)) {
    // Twelve bytes, 1 through 12, for the fixed-length field.
    Binary fixed = Binary.fromConstantByteArray(new byte[]{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 });
    Group record = groups.newGroup()
     .append(INT32_FIELD, 32)
     .append(INT64_FIELD, 64L)
     .append(FLOAT_FIELD, 1.0f)
     .append(DOUBLE_FIELD, 2.0d)
     .append(BINARY_FIELD, Binary.fromString("foobar"))
     .append(FIXED_LEN_BYTE_ARRAY_FIELD, fixed);
    writer.write(record);
  }
}
 
Example #20
Source File: TestMultipleWriteRead.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes {@code data} to a new, uniquely named Parquet file under {@code tmpDir}.
 *
 * @param data groups to write, in iteration order
 * @return path of the written file
 * @throws IOException if building, writing or closing the writer fails
 */
private Path writeFile(Iterable<Group> data) throws IOException {
  String fileName = "testMultipleReadWrite_" + UUID.randomUUID() + ".parquet";
  Path target = new Path(tmpDir, fileName);

  try (ParquetWriter<Group> out = ExampleParquetWriter.builder(target)
      .config(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, SCHEMA.toString())
      .build()) {
    for (Group record : data) {
      out.write(record);
    }
  }
  return target;
}
 
Example #21
Source File: ParquetFileAccessor.java    From pxf with Apache License 2.0 5 votes vote down vote up
/**
 * Opens the resource for write.
 * The compression codec comes from the COMPRESSION_CODEC option and falls back to
 * {@code DEFAULT_COMPRESSION}; page, row group and dictionary sizes and the Parquet
 * version are likewise read from the request options with their defaults.
 *
 * @return true if the resource is successfully opened
 * @throws IOException if opening the resource failed
 */
@Override
public boolean openForWrite() throws IOException {

    // skip codec extension in filePrefix, because we add it in this accessor
    filePrefix = HcfsType.getHcfsType(configuration, context)
            .getUriForWrite(configuration, context, true);
    codecName = codecFactory.getCodec(context.getOption("COMPRESSION_CODEC"), DEFAULT_COMPRESSION);

    // Options for parquet write
    pageSize = context.getOption("PAGE_SIZE", DEFAULT_PAGE_SIZE);
    rowGroupSize = context.getOption("ROWGROUP_SIZE", DEFAULT_ROWGROUP_SIZE);
    dictionarySize = context.getOption("DICTIONARY_PAGE_SIZE", DEFAULT_DICTIONARY_PAGE_SIZE);

    String requestedVersion = context.getOption("PARQUET_VERSION");
    if (requestedVersion == null) {
        parquetVersion = DEFAULT_PARQUET_VERSION;
    } else {
        parquetVersion = WriterVersion.fromString(requestedVersion.toLowerCase());
    }
    LOG.debug("{}-{}: Parquet options: PAGE_SIZE = {}, ROWGROUP_SIZE = {}, DICTIONARY_PAGE_SIZE = {}, PARQUET_VERSION = {}",
            context.getTransactionId(), context.getSegmentId(), pageSize, rowGroupSize, dictionarySize, parquetVersion);

    // Use the schema file when one is given, otherwise derive the schema from the tuple description
    String schemaFile = context.getOption("SCHEMA");
    MessageType schema;
    if (schemaFile == null) {
        schema = generateParquetSchema(context.getTupleDescription());
    } else {
        schema = readSchemaFile(schemaFile);
    }
    LOG.debug("{}-{}: Schema fields = {}", context.getTransactionId(),
            context.getSegmentId(), schema.getFields());
    GroupWriteSupport.setSchema(schema, configuration);
    groupWriteSupport = new GroupWriteSupport();

    // We get the parquet schema and set it to the metadata in the request context
    // to avoid computing the schema again in the Resolver
    context.setMetadata(schema);
    createParquetWriter();
    return true;
}
 
Example #22
Source File: TestParquetWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes four names with the bloom filter enabled on column {@code name} and checks
 * that the bloom filter read back from the first row group contains each of them.
 */
@Test
public void testParquetFileWithBloomFilter() throws IOException {
  MessageType schema = Types.buildMessage().
    required(BINARY).as(stringType()).named("name").named("msg");

  String[] testNames = {"hello", "parquet", "bloom", "filter"};
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);

  GroupFactory factory = new SimpleGroupFactory(schema);
  File file = temp.newFile();
  // Only the unique name is needed; the writer creates the file itself.
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
    .withPageRowCountLimit(10)
    .withConf(conf)
    .withDictionaryEncoding(false)
    .withBloomFilterEnabled("name", true)
    .build()) {
    for (String testName : testNames) {
      writer.write(factory.newGroup().append("name", testName));
    }
  }

  // try-with-resources: the file reader was previously never closed.
  try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
    BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
    BloomFilter bloomFilter = reader.getBloomFilterDataReader(blockMetaData)
      .readBloomFilter(blockMetaData.getColumns().get(0));

    for (String name: testNames) {
      assertTrue(bloomFilter.findHash(
        LongHashFunction.xx(0).hashBytes(Binary.fromString(name).toByteBuffer())));
    }
  }
}
 
Example #23
Source File: TestParquetWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes 100 groups whose optional list is left unset, with a page row count limit of
 * 10, and checks that the same number of identical groups is read back.
 */
@Test
public void testNullValuesWithPageRowLimit() throws IOException {
  final int recordCount = 100;
  MessageType schema = Types.buildMessage().optionalList().optionalElement(BINARY).as(stringType()).named("str_list")
      .named("msg");
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);

  // A freshly created group: the optional list field is never populated.
  GroupFactory factory = new SimpleGroupFactory(schema);
  Group listNull = factory.newGroup();

  File file = temp.newFile();
  file.delete();
  Path path = new Path(file.getAbsolutePath());

  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
      .withPageRowCountLimit(10)
      .withConf(conf)
      .build()) {
    int written = 0;
    while (written < recordCount) {
      writer.write(listNull);
      ++written;
    }
  }

  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path).build()) {
    int readRecordCount = 0;
    Group group;
    while ((group = reader.read()) != null) {
      assertEquals(listNull.toString(), group.toString());
      ++readRecordCount;
    }
    assertEquals("Number of written records should be equal to the read one", recordCount, readRecordCount);
  }
}
 
Example #24
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes {@code numRecord} identical rows (DocId 1, Name "foo", Gender "male",
 * Links.Backward 2, Links.Forward 3) to a Parquet file.
 *
 * @param prefix prefix passed to {@code createTempFile}
 * @return the path returned by {@code createTempFile(prefix)}
 * @throws IOException if building the writer or writing a row fails
 */
private String createParquetFile(String prefix) throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, INT64, "Backward"),
      new PrimitiveType(REPEATED, INT64, "Forward")));
  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder writerBuilder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  try (ParquetWriter writer = writerBuilder.build()) {
    for (int row = 0; row < numRecord; row++) {
      SimpleGroup doc = new SimpleGroup(schema);
      doc.add("DocId", 1L);
      doc.add("Name", "foo");
      doc.add("Gender", "male");

      // Field 0 of Links is Backward, field 1 is Forward.
      Group links = doc.addGroup("Links");
      links.add(0, 2L);
      links.add(1, 3L);

      writer.write(doc);
    }
  }

  return file;
}
 
Example #25
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a reader builder for {@code file} that uses {@code GroupReadSupport}, sets
 * the example schema on it and passes it through the current read configurator.
 *
 * @return the builder returned by the read configurator
 * @throws IOException if the input file cannot be resolved
 */
public ParquetReader.Builder<Group> createReaderBuilder() throws IOException {
  ReadConfigurator readConfigurator = getReadConfigurator();

  // Anonymous subclass whose only job is to supply the read support.
  ParquetReader.Builder<Group> groupBuilder =
      new ParquetReader.Builder<Group>(HadoopInputFile.fromPath(file, new Configuration())) {
        @Override
        protected ReadSupport<Group> getReadSupport() {
          return new GroupReadSupport();
        }
      };

  return readConfigurator.configureBuilder(
      groupBuilder.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, SCHEMA.toString()));
}
 
Example #26
Source File: DeprecatedOutputFormatTest.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the group factory from the schema that {@code GroupWriteSupport} reads from
 * the job configuration.
 *
 * @param job job configuration carrying the schema
 */
public void configure(JobConf job) {
  SimpleGroupFactory fromJobSchema = new SimpleGroupFactory(GroupWriteSupport.getSchema(job));
  factory = fromJobSchema;
}
 
Example #27
Source File: DeprecatedInputFormatTest.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the group factory from the schema that {@code GroupWriteSupport} reads from
 * the configuration of {@code context}.
 *
 * @param context task context whose configuration carries the schema
 */
protected void setup(Context context) throws IOException, InterruptedException {
  SimpleGroupFactory fromContextSchema =
      new SimpleGroupFactory(GroupWriteSupport.getSchema(ContextUtil.getConfiguration(context)));
  factory = fromContextSchema;
}
 
Example #28
Source File: TestParquetWriter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * For every writer version and for 10 and 1000 distinct binary values, writes 1000
 * rows, reads them back and checks the values, the encoding recorded for
 * {@code binary_field} and the object model name stored in the footer.
 */
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  Path root = new Path("target/tests/TestParquetWriter/");
  enforceEmptyDir(conf, root);
  MessageType schema = parseMessageType(
      "message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(3) flba_field; "
      + "required int96 int96_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  // Expected binary_field encoding, keyed by "<modulo>-<writer version>".
  Map<String, Encoding> expected = new HashMap<String, Encoding>();
  expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
  expected.put("1000-" + PARQUET_1_0, PLAIN);
  expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
  expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
  for (int modulo : asList(10, 1000)) {
    for (WriterVersion version : WriterVersion.values()) {
      Path file = new Path(root, version.name() + "_" + modulo);
      // try-with-resources so the writer is closed even when a write fails.
      try (ParquetWriter<Group> writer = new ParquetWriter<Group>(
          file,
          new GroupWriteSupport(),
          UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf)) {
        for (int i = 0; i < 1000; i++) {
          writer.write(
              f.newGroup()
              .append("binary_field", "test" + (i % modulo))
              .append("int32_field", 32)
              .append("int64_field", 64l)
              .append("boolean_field", true)
              .append("float_field", 1.0f)
              .append("double_field", 2.0d)
              .append("flba_field", "foo")
              .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
        }
      }
      // try-with-resources so the reader is closed even when an assertion fails.
      try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build()) {
        for (int i = 0; i < 1000; i++) {
          Group group = reader.read();
          assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
          assertEquals(32, group.getInteger("int32_field", 0));
          assertEquals(64l, group.getLong("int64_field", 0));
          assertEquals(true, group.getBoolean("boolean_field", 0));
          assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
          assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
          assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
          assertEquals(Binary.fromConstantByteArray(new byte[12]),
              group.getInt96("int96_field",0));
        }
      }
      ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
      for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
          if (column.getPath().toDotString().equals("binary_field")) {
            String key = modulo + "-" + version;
            Encoding expectedEncoding = expected.get(key);
            assertTrue(
                key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                column.getEncodings().contains(expectedEncoding));
          }
        }
      }
      assertEquals("Object model property should be example",
          "example", footer.getFileMetaData().getKeyValueMetaData()
              .get(ParquetWriter.OBJECT_MODEL_NAME_PROP));
    }
  }
}
 
Example #29
Source File: TestParquetWriterNewPage.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * For every writer version and for 10 and 1000 distinct binary values, writes 1000
 * rows (leaving the optional {@code null_field} unset), reads them back and checks
 * the values and the encoding recorded for {@code binary_field}.
 */
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  Path root = new Path("target/tests/TestParquetWriter/");
  // Start from an empty directory.
  FileSystem fs = root.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = parseMessageType(
      "message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(3) flba_field; "
      + "required int96 int96_field; "
      + "optional binary null_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  // Expected binary_field encoding, keyed by "<modulo>-<writer version>".
  Map<String, Encoding> expected = new HashMap<String, Encoding>();
  expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
  expected.put("1000-" + PARQUET_1_0, PLAIN);
  expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
  expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
  for (int modulo : asList(10, 1000)) {
    for (WriterVersion version : WriterVersion.values()) {
      Path file = new Path(root, version.name() + "_" + modulo);
      // try-with-resources so the writer is closed even when a write fails.
      try (ParquetWriter<Group> writer = new ParquetWriter<Group>(
          file,
          new GroupWriteSupport(),
          UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf)) {
        for (int i = 0; i < 1000; i++) {
          writer.write(
              f.newGroup()
              .append("binary_field", "test" + (i % modulo))
              .append("int32_field", 32)
              .append("int64_field", 64l)
              .append("boolean_field", true)
              .append("float_field", 1.0f)
              .append("double_field", 2.0d)
              .append("flba_field", "foo")
              .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
        }
      }

      // try-with-resources so the reader is closed even when an assertion fails.
      try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build()) {
        for (int i = 0; i < 1000; i++) {
          Group group = reader.read();
          assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
          assertEquals(32, group.getInteger("int32_field", 0));
          assertEquals(64l, group.getLong("int64_field", 0));
          assertEquals(true, group.getBoolean("boolean_field", 0));
          assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
          assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
          assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
          assertEquals(Binary.fromConstantByteArray(new byte[12]), group.getInt96("int96_field",
              0));
          assertEquals(0, group.getFieldRepetitionCount("null_field"));
        }
      }
      ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
      for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
          if (column.getPath().toDotString().equals("binary_field")) {
            String key = modulo + "-" + version;
            Encoding expectedEncoding = expected.get(key);
            assertTrue(
                key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                column.getEncodings().contains(expectedEncoding));
          }
        }
      }
    }
  }
}
 
Example #30
Source File: TestZstandardCodec.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Initializes the group factory with the schema that {@code GroupWriteSupport} reads
 * from the job configuration.
 *
 * @param job job configuration carrying the schema
 */
public void configure(JobConf job) {
  SimpleGroupFactory schemaFactory = new SimpleGroupFactory(GroupWriteSupport.getSchema(job));
  factory = schemaFactory;
}