org.apache.parquet.example.data.simple.SimpleGroupFactory Java Examples

The following examples show how to use org.apache.parquet.example.data.simple.SimpleGroupFactory. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ApacheParquet.java    From sylph with Apache License 2.0 7 votes vote down vote up
/**
 * Builds an uncompressed Parquet writer for {@code outputPath} together with the
 * group factory used to create rows for it.
 *
 * @param outputPath    destination path of the Parquet file
 * @param schema        message type describing the rows
 * @param writerVersion writer version handed to the Parquet writer builder
 * @throws IOException declared by the writer builder's {@code build()}
 */
private ApacheParquet(String outputPath, MessageType schema, WriterVersion writerVersion)
        throws IOException
{
    this.schema = schema;
    this.outputPath = outputPath;

    // The schema is published through the Hadoop configuration as well as via withType().
    Configuration hadoopConf = new Configuration();
    GroupWriteSupport.setSchema(schema, hadoopConf);

    Path target = new Path(outputPath);
    this.writer = ExampleParquetWriter.builder(target)
            .withType(schema)
            .withConf(hadoopConf)
            // page and dictionary page share the same default size
            .withPageSize(DEFAULT_PAGE_SIZE)
            .withDictionaryPageSize(DEFAULT_PAGE_SIZE)
            .withDictionaryEncoding(DEFAULT_IS_DICTIONARY_ENABLED)
            .withValidation(DEFAULT_IS_VALIDATING_ENABLED)
            .withWriterVersion(writerVersion)
            // Parquet file block (row group) size
            .withRowGroupSize(DEFAULT_BLOCK_SIZE)
            // compression type
            .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
            .build();

    this.groupFactory = new SimpleGroupFactory(this.schema);
}
 
Example #2
Source File: SqlInterpreterTest.java    From zeppelin with Apache License 2.0 6 votes vote down vote up
/**
 * Writes {@code values} to a temporary Parquet file whose schema has a single
 * required {@code int32_field} column, one row per array element.
 *
 * @param values  integers to write
 * @param version Parquet writer version to use
 * @return the file that was written
 * @throws IOException if the temporary file cannot be created or the write fails
 */
public File createParquetFile(int[] values,
                              ParquetProperties.WriterVersion version) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".par");
  // Only the unique name is needed; presumably the writer must create the file itself.
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType(
          "message test { "
                  + "required int32 int32_field; "
                  + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  // try-with-resources closes the writer even when a write fails.
  try (ParquetWriter<Group> writer = new ParquetWriter<Group>(
          path,
          new GroupWriteSupport(),
          CompressionCodecName.UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf)) {
    for (int value : values) {
      writer.write(f.newGroup()
              .append("int32_field", value));
    }
  }
  return file;
}
 
Example #3
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Writes {@code nElements} generated rows to {@code writer} and closes it.
 *
 * <p>The writer is closed whether or not writing succeeds.
 *
 * @param f      factory used to create one group per row
 * @param writer destination writer; closed by this method
 * @throws IOException if writing or closing fails
 */
private static void writeData(SimpleGroupFactory f, ParquetWriter<Group> writer) throws IOException {
  // Binding the parameter as a resource guarantees close() on failure as well.
  try (ParquetWriter<Group> out = writer) {
    for (int i = 0; i < nElements; i++) {
      int index = i % ALPHABET.length();

      Group group = f.newGroup()
          .append("binary_field", ALPHABET.substring(index, index+1))
          .append("single_value_field", "sharp")
          .append("fixed_field", DECIMAL_VALUES[i % DECIMAL_VALUES.length])
          .append("int32_field", intValues[i % intValues.length])
          .append("int64_field", longValues[i % longValues.length])
          .append("double_field", toDouble(intValues[i % intValues.length]))
          .append("float_field", toFloat(intValues[i % intValues.length]))
          .append("plain_int32_field", i)
          // Second half of the rows gets random UUIDs instead of single letters.
          .append("fallback_binary_field", i < (nElements / 2) ?
              ALPHABET.substring(index, index+1) : UUID.randomUUID().toString())
          .append("int96_field", INT96_VALUES[i % INT96_VALUES.length]);

      // Leave the optional field unset whenever index is a multiple of 10.
      if (index % 10 > 0) {
        group.append("optional_single_value_field", "sharp");
      }

      out.write(group);
    }
  }
}
 
Example #4
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/** Reading an int32 field through a schema that declares it as binary must fail. */
@Test
public void testReadUsingRequestedSchemaWithIncompatibleField(){
  // Store one record whose field "e" is an optional int32.
  MessageType writtenSchema = new MessageType("schema",
          new PrimitiveType(OPTIONAL, INT32, "e"));
  MemPageStore pageStore = new MemPageStore(1);
  SimpleGroupFactory factory = new SimpleGroupFactory(writtenSchema);
  writeGroups(writtenSchema, pageStore, factory.newGroup().append("e", 4));

  String expectedMessage =
      "The requested schema is not compatible with the file schema. incompatible types: optional binary e != optional int32 e";
  try {
    // Same field name, different primitive type.
    MessageType requestedSchema = new MessageType("schema",
            new PrimitiveType(OPTIONAL, BINARY, "e"));
    readGroups(pageStore, writtenSchema, requestedSchema, 1);
    fail("should have thrown an incompatible schema exception");
  } catch (ParquetDecodingException decodingFailure) {
    assertEquals(expectedMessage, decodingFailure.getMessage());
  }
}
 
Example #5
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/** Reading an optional field through a schema that declares it required must fail. */
@Test
public void testReadUsingSchemaWithRequiredFieldThatWasOptional(){
  // Store one record whose field "e" is an optional int32.
  MessageType writtenSchema = new MessageType("schema",
          new PrimitiveType(OPTIONAL, INT32, "e"));
  MemPageStore pageStore = new MemPageStore(1);
  SimpleGroupFactory factory = new SimpleGroupFactory(writtenSchema);
  writeGroups(writtenSchema, pageStore, factory.newGroup().append("e", 4));

  String expectedMessage =
      "The requested schema is not compatible with the file schema. incompatible types: required int32 e != optional int32 e";
  try {
    // Same name and type, but the repetition is tightened from optional to required.
    MessageType requestedSchema = new MessageType("schema",
            new PrimitiveType(REQUIRED, INT32, "e"));
    readGroups(pageStore, writtenSchema, requestedSchema, 1);
    fail("should have thrown an incompatible schema exception");
  } catch (ParquetDecodingException decodingFailure) {
    assertEquals(expectedMessage, decodingFailure.getMessage());
  }
}
 
Example #6
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/** Reading with a schema that projects only column "b" yields just that column's value. */
@Test
public void testReadUsingProjectedSchema(){
  PrimitiveType requiredA = new PrimitiveType(REQUIRED, INT32, "a");
  PrimitiveType requiredB = new PrimitiveType(REQUIRED, INT32, "b");
  MessageType originalSchema = new MessageType("schema", requiredA, requiredB);

  PrimitiveType optionalB = new PrimitiveType(OPTIONAL, INT32, "b");
  MessageType projectedSchema = new MessageType("schema", optionalB);

  // Write a single record (a=1, b=2) using the full schema.
  MemPageStore pageStore = new MemPageStore(1);
  SimpleGroupFactory factory = new SimpleGroupFactory(originalSchema);
  writeGroups(originalSchema, pageStore, factory.newGroup().append("a", 1).append("b", 2));

  // Only the projected column is expected back.
  List<Group> readBack = new ArrayList<>(readGroups(pageStore, originalSchema, projectedSchema, 1));
  Object[][] expected = {
          {2},
  };
  validateGroups(readBack, expected);
}
 
Example #7
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/** Runs {@code testSchema} on one record that populates fields "a" through "h". */
@Test
public void testOneOfEach() {
  MessageType schema = MessageTypeParser.parseMessageType(oneOfEach);
  GroupFactory factory = new SimpleGroupFactory(schema);

  // append() mutates the group in place, so the fields can be filled one statement at a time.
  Group record = factory.newGroup();
  record.append("a", 1L);
  record.append("b", 2);
  record.append("c", 3.0f);
  record.append("d", 4.0d);
  record.append("e", true);
  record.append("f", Binary.fromString("6"));
  record.append("g", new NanoTime(1234, System.currentTimeMillis() * 1000));
  record.append("h", Binary.fromString("abc"));

  testSchema(schema, Arrays.asList(record));
}
 
Example #8
Source File: ParquetFileTest.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the test Parquet file with ten rows covering the int32, int64, float,
 * double, binary and fixed-length byte array fields.
 *
 * @throws IOException if the file cannot be written
 */
private void createTestParquetFile() throws IOException {
  Path target = new Path(parquetFile().getPath());
  Configuration hadoopConf = new Configuration();

  MessageType messageType = createSchema();
  SimpleGroupFactory groups = new SimpleGroupFactory(messageType);
  GroupWriteSupport.setSchema(messageType, hadoopConf);

  try (
    ParquetWriter<Group> out = new ParquetWriter<>(
      target,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      hadoopConf)) {
    for (int row = 0; row < 10; row++) {
      // 12 random bytes for the fixed-length field of this row.
      final byte[] fixedBytes = new byte[12];
      ThreadLocalRandom.current().nextBytes(fixedBytes);

      Group record = groups.newGroup();
      record.append(INT32_FIELD, 32 + row);
      record.append(INT64_FIELD, 64L + row);
      record.append(FLOAT_FIELD, 1.0f + row);
      record.append(DOUBLE_FIELD, 2.0d + row);
      record.append(BINARY_FIELD, Binary.fromString(COLORS[row % COLORS.length]));
      record.append(FIXED_LEN_BYTE_ARRAY_FIELD, Binary.fromConstantByteArray(fixedBytes));
      out.write(record);
    }
  }
}
 
Example #9
Source File: TestFiltersWithMissingColumns.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates {@code test.parquet} in the temporary folder with 1000 rows of
 * ({@code id}, {@code "data-" + id}) and stores its location in {@code path}.
 *
 * @throws Exception if the file cannot be created or written
 */
@Before
public void createDataFile() throws Exception {
  File file = temp.newFile("test.parquet");
  this.path = new Path(file.toString());

  MessageType type = Types.buildMessage()
      .required(INT64).named("id")
      .required(BINARY).as(UTF8).named("data")
      .named("test");

  SimpleGroupFactory factory = new SimpleGroupFactory(type);

  // try-with-resources keeps the primary exception if close() fails as well.
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
      // temp.newFile already created the file, hence OVERWRITE
      .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
      .withType(type)
      .build()) {
    for (long i = 0; i < 1000; i++) {
      Group g = factory.newGroup();
      g.add(0, i);            // field 0: id
      g.add(1, "data-" + i);  // field 1: data
      writer.write(g);
    }
  }
}
 
Example #10
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a GZIP-compressed, dictionary-enabled writer for {@code file} and hands it
 * to {@code writeData} together with a group factory for the shared schema.
 *
 * @param version writer version to use
 * @param file    destination of the Parquet file
 * @throws IOException if the writer cannot be built or the data cannot be written
 */
private static void prepareFile(WriterVersion version, Path file) throws IOException {
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);

  // The writer is passed straight on to writeData.
  writeData(
      groupFactory,
      ExampleParquetWriter.builder(file)
          .withWriterVersion(version)
          .withCompressionCodec(GZIP)
          .withRowGroupSize(1024*1024)
          .withPageSize(1024)
          .enableDictionaryEncoding()
          .withDictionaryPageSize(2*1024)
          .withConf(conf)
          .build());
}
 
Example #11
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a set of values to a parquet file, one single-field record per value.
 * Dictionary encoding is switched on or off through {@code enableDictionary}, so
 * callers can target specific encodings.
 *
 * @param file             destination of the Parquet file
 * @param type             primitive type of the single {@code field} column
 * @param values           values to write; each must match {@code type}
 * @param rowGroupSize     row group size passed to the writer
 * @param pageSize         page size passed to the writer
 * @param enableDictionary whether dictionary encoding is enabled
 * @param version          writer version to use
 * @throws IOException if writing fails
 * @throws IllegalArgumentException if {@code type} is not handled by the switch below
 */
private void writeValuesToFile(Path file, PrimitiveTypeName type, List<?> values, int rowGroupSize, int pageSize, boolean enableDictionary, WriterVersion version) throws IOException {
  MessageType schema;
  if (type == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    // Fixed-length columns additionally need their length declared.
    schema = Types.buildMessage().required(type).length(FIXED_LENGTH).named("field").named("test");
  } else {
    schema = Types.buildMessage().required(type).named("field").named("test");
  }

  SimpleGroupFactory message = new SimpleGroupFactory(schema);
  GroupWriteSupport.setSchema(schema, configuration);

  // try-with-resources closes the writer even if a cast or the default branch throws.
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
      .withCompressionCodec(compression)
      .withRowGroupSize(rowGroupSize)
      .withPageSize(pageSize)
      .withDictionaryPageSize(TEST_DICT_PAGE_SIZE)
      .withDictionaryEncoding(enableDictionary)
      .withWriterVersion(version)
      .withConf(configuration)
      .build()) {
    for (Object o: values) {
      switch (type) {
        case BOOLEAN:
          writer.write(message.newGroup().append("field", (Boolean)o));
          break;
        case INT32:
          writer.write(message.newGroup().append("field", (Integer)o));
          break;
        case INT64:
          writer.write(message.newGroup().append("field", (Long)o));
          break;
        case FLOAT:
          writer.write(message.newGroup().append("field", (Float)o));
          break;
        case DOUBLE:
          writer.write(message.newGroup().append("field", (Double)o));
          break;
        // All three byte-oriented types are supplied as Binary values.
        case INT96:
        case BINARY:
        case FIXED_LEN_BYTE_ARRAY:
          writer.write(message.newGroup().append("field", (Binary)o));
          break;
        default:
          throw new IllegalArgumentException("Unknown type name: " + type);
      }
    }
  }
}
 
Example #12
Source File: TestMergeMetadataFiles.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes 1000 rows to {@code out}, tagging the file with a {@code schema_num}
 * metadata entry of "1" or "2".
 *
 * @param out        destination file
 * @param conf       configuration that receives the write schema
 * @param useSchema2 when true, {@code schema2} is configured and {@code int96_field} is not appended
 * @throws IOException if writing fails
 */
private static void writeFile(File out, Configuration conf, boolean useSchema2) throws IOException {
  GroupWriteSupport.setSchema(useSchema2 ? schema2 : schema, conf);

  // NOTE(review): groups are always created from `schema`, even when `schema2` is
  // configured for writing; confirm this is intended.
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  Map<String, String> extraMetaData = new HashMap<String, String>();
  extraMetaData.put("schema_num", useSchema2 ? "2" : "1" );

  // try-with-resources closes the writer even when a write fails.
  try (ParquetWriter<Group> writer = ExampleParquetWriter
      .builder(new Path(out.getAbsolutePath()))
      .withConf(conf)
      .withExtraMetaData(extraMetaData)
      .build()) {
    for (int i = 0; i < 1000; i++) {
      Group g = f.newGroup()
          .append("binary_field", "test" + i)
          .append("int32_field", i)
          .append("int64_field", (long) i)
          .append("boolean_field", i % 2 == 0)
          .append("float_field", (float) i)
          .append("double_field", (double)i)
          .append("flba_field", "foo");

      if (!useSchema2) {
        g = g.append("int96_field", Binary.fromConstantByteArray(new byte[12]));
      }

      writer.write(g);
    }
  }
}
 
Example #13
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes {@code numRecordsNestedWithNullsFile} records of the nested schema to a
 * fresh temporary file; a record is left empty when the random draw does not
 * exceed {@code nullRatio}.
 *
 * @param conf               configuration for the writer; also decides whether page checksums are written
 * @param dictionaryEncoding whether dictionary encoding is enabled
 * @param compression        compression codec for the file
 * @return path of the written file
 * @throws IOException if the file cannot be created or written
 */
private Path writeNestedWithNullsSampleParquetFile(Configuration conf,
                                                   boolean dictionaryEncoding,
                                                   CompressionCodecName compression)
  throws IOException {
  File tempFile = tempFolder.newFile();
  tempFile.delete();
  Path target = new Path(tempFile.toURI());

  boolean writeChecksums = ParquetOutputFormat.getPageWriteChecksumEnabled(conf);
  try (ParquetWriter<Group> out = ExampleParquetWriter.builder(target)
    .withConf(conf)
    .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
    .withCompressionCodec(compression)
    .withDictionaryEncoding(dictionaryEncoding)
    .withType(schemaNestedWithNulls)
    .withPageWriteChecksumEnabled(writeChecksums)
    .build()) {
    GroupFactory factory = new SimpleGroupFactory(schemaNestedWithNulls);
    // Fixed seed keeps the generated file reproducible.
    Random random = new Random(42);

    for (int i = 0; i < numRecordsNestedWithNullsFile; i++) {
      Group record = factory.newGroup();
      if (random.nextDouble() > nullRatio) {
        // With equal probability, write either 1 or 3 values into the nested group "d".
        int valueCount = random.nextDouble() > 0.5 ? 1 : 3;
        Group inner = record.addGroup("c").append("id", (long) i).addGroup("d");
        for (int k = 0; k < valueCount; k++) {
          // The modulo keeps the value range small so dictionary encoding can apply.
          inner.append("val", random.nextInt() % 10);
        }
      }
      out.write(record);
    }
  }

  return target;
}
 
Example #14
Source File: TestParquetWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a few names with the bloom filter enabled on column {@code name} and checks
 * that the filter read back from the file contains the hash of every name.
 */
@Test
public void testParquetFileWithBloomFilter() throws IOException {
  MessageType schema = Types.buildMessage().
    required(BINARY).as(stringType()).named("name").named("msg");

  String[] testNames = {"hello", "parquet", "bloom", "filter"};
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);

  GroupFactory factory = new SimpleGroupFactory(schema);
  File file = temp.newFile();
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
    .withPageRowCountLimit(10)
    .withConf(conf)
    .withDictionaryEncoding(false)
    .withBloomFilterEnabled("name", true)
    .build()) {
    for (String testName : testNames) {
      writer.write(factory.newGroup().append("name", testName));
    }
  }

  // The reader holds an open input stream; close it even when an assertion fails.
  try (ParquetFileReader reader =
         ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
    BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
    BloomFilter bloomFilter = reader.getBloomFilterDataReader(blockMetaData)
      .readBloomFilter(blockMetaData.getColumns().get(0));

    for (String name: testNames) {
      assertTrue(bloomFilter.findHash(
        LongHashFunction.xx(0).hashBytes(Binary.fromString(name).toByteBuffer())));
    }
  }
}
 
Example #15
Source File: TestParquetWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes 100 records whose optional list is left unset, with a page row count
 * limit of 10, and checks that exactly the same records are read back.
 */
@Test
public void testNullValuesWithPageRowLimit() throws IOException {
  final int recordCount = 100;
  MessageType schema = Types.buildMessage().optionalList().optionalElement(BINARY).as(stringType()).named("str_list")
      .named("msg");
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);

  // A freshly created group has no value for the optional list.
  GroupFactory factory = new SimpleGroupFactory(schema);
  Group listNull = factory.newGroup();

  File tempFile = temp.newFile();
  tempFile.delete();
  Path path = new Path(tempFile.getAbsolutePath());
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
      .withPageRowCountLimit(10)
      .withConf(conf)
      .build()) {
    int written = 0;
    while (written < recordCount) {
      writer.write(listNull);
      ++written;
    }
  }

  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), path).build()) {
    int readRecordCount = 0;
    Group readBack;
    while ((readBack = reader.read()) != null) {
      assertEquals(listNull.toString(), readBack.toString());
      readRecordCount++;
    }
    assertEquals("Number of written records should be equal to the read one", recordCount, readRecordCount);
  }
}
 
Example #16
Source File: TestReadWriteEncodingStats.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Writes {@code NUM_RECORDS} generated rows to {@code writer}. The writer is not
 * closed here.
 *
 * @param writer destination writer
 * @throws IOException if a write fails
 */
private static void writeData(ParquetWriter<Group> writer) throws IOException {
  SimpleGroupFactory groups = new SimpleGroupFactory(SCHEMA);
  for (int i = 0; i < NUM_RECORDS; i++) {
    int index = i % ALPHABET.length();
    String letter = ALPHABET.substring(index, index+1);

    // First half of the rows reuses the letter; second half gets a random UUID.
    String fallbackValue;
    if (i < (NUM_RECORDS / 2)) {
      fallbackValue = ALPHABET.substring(index, index+1);
    } else {
      fallbackValue = UUID.randomUUID().toString();
    }

    Group record = groups.newGroup();
    record.append("dict_binary_field", letter);
    record.append("plain_int32_field", i);
    record.append("fallback_binary_field", fallbackValue);

    writer.write(record);
  }
}
 
Example #17
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/** Runs {@code testSchema} on a required int64 nested inside a required group. */
@Test
public void testRequiredOfRequired() {
  String schemaText =
        "message Document {\n"
      + "  required group foo {\n"
      + "    required int64 bar;\n"
      + "  }\n"
      + "}\n";
  MessageType schema = MessageTypeParser.parseMessageType(schemaText);

  GroupFactory factory = new SimpleGroupFactory(schema);
  Group document = factory.newGroup();
  Group foo = document.addGroup("foo");
  foo.append("bar", 2L);

  testSchema(schema, Arrays.asList(document));
}
 
Example #18
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Reads with a requested schema that reorders the columns and adds column "c":
 * data written without "c" yields null for it, data written with "c" yields its value.
 */
@Test
public void testReadUsingRequestedSchemaWithExtraFields(){
  MessageType originalSchema = new MessageType("schema",
      new PrimitiveType(REQUIRED, INT32, "a"),
      new PrimitiveType(OPTIONAL, INT32, "b")
  );
  MessageType extendedSchema = new MessageType("schema",
      new PrimitiveType(OPTIONAL, INT32, "b"),
      new PrimitiveType(OPTIONAL, INT32, "a"),
      new PrimitiveType(OPTIONAL, INT32, "c")
  );
  MemPageStore originalStore = new MemPageStore(1);
  MemPageStore extendedStore = new MemPageStore(1);

  // One record per store: (a=1, b=2) and (a=1, b=2, c=3).
  SimpleGroupFactory originalFactory = new SimpleGroupFactory(originalSchema);
  Group originalRecord = originalFactory.newGroup().append("a", 1).append("b", 2);
  writeGroups(originalSchema, originalStore, originalRecord);

  SimpleGroupFactory extendedFactory = new SimpleGroupFactory(extendedSchema);
  Group extendedRecord = extendedFactory.newGroup().append("a", 1).append("b", 2).append("c",3);
  writeGroups(extendedSchema, extendedStore, extendedRecord);

  List<Group> readBack = new ArrayList<>();
  readBack.addAll(readGroups(originalStore, originalSchema, extendedSchema, 1));
  readBack.addAll(readGroups(extendedStore, extendedSchema, extendedSchema, 1));
  // TODO: once empty projection is supported, also read a third store and
  // expect an additional { null, null } row.
  Object[][] expected = {
      { 2, 1, null},
      { 2, 1, 3},
  };
  validateGroups(readBack, expected);
}
 
Example #19
Source File: GroupRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a converter whose root starts every record with a new group obtained
 * from a {@link SimpleGroupFactory} for {@code schema}.
 *
 * @param schema message type of the records to convert
 */
public GroupRecordConverter(MessageType schema) {
  simpleGroupFactory = new SimpleGroupFactory(schema);
  root = new SimpleGroupConverter(null, 0, schema) {
    /** Begins a record by replacing the current group with a fresh one. */
    @Override
    public void start() {
      this.current = simpleGroupFactory.newGroup();
    }

    /** Nothing to do when the record ends. */
    @Override
    public void end() {
    }
  };
}
 
Example #20
Source File: PageChecksumDataGenerator.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Generates {@code nRows} rows into {@code outFile} unless the file already exists.
 *
 * @param outFile        destination of the Parquet file
 * @param nRows          number of rows to write
 * @param writeChecksums whether page checksums are written
 * @param compression    compression codec for the file
 * @throws IOException if the existence check or the write fails
 */
public void generateData(Path outFile, int nRows, boolean writeChecksums,
                         CompressionCodecName compression) throws IOException {
  if (exists(configuration, outFile)) {
    System.out.println("File already exists " + outFile);
    return;
  }

  // try-with-resources closes the writer even when a write fails.
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(outFile)
    .withConf(configuration)
    .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
    .withCompressionCodec(compression)
    .withDictionaryEncoding(true)
    .withType(SCHEMA)
    .withPageWriteChecksumEnabled(writeChecksums)
    .build()) {
    GroupFactory groupFactory = new SimpleGroupFactory(SCHEMA);
    // Fixed seed makes the int_field values reproducible.
    Random rand = new Random(42);
    for (int i = 0; i < nRows; i++) {
      Group group = groupFactory.newGroup();
      group
        .append("long_field", (long) i)
        .append("binary_field", randomUUID().toString())
        .addGroup("group")
        // Force dictionary encoding by performing modulo
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100);
      writer.write(group);
    }
  }
}
 
Example #21
Source File: TestSimpleRecordConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the test Parquet file containing a single row with fixed values for
 * the int32, int64, float, double, binary and fixed-length byte array fields.
 *
 * @throws IOException if the file cannot be written
 */
private void createTestParquetFile() throws IOException {
  Path target = new Path(testFile().getPath());
  Configuration hadoopConf = new Configuration();

  MessageType messageType = createSchema();
  SimpleGroupFactory groups = new SimpleGroupFactory(messageType);
  GroupWriteSupport.setSchema(messageType, hadoopConf);

  try (
    ParquetWriter<Group> out = new ParquetWriter<>(
      target,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      hadoopConf)) {
    Group record = groups.newGroup();
    record.append(INT32_FIELD, 32);
    record.append(INT64_FIELD, 64L);
    record.append(FLOAT_FIELD, 1.0f);
    record.append(DOUBLE_FIELD, 2.0d);
    record.append(BINARY_FIELD, Binary.fromString("foobar"));
    // Twelve bytes, 1 through 12, for the fixed-length field.
    byte[] fixedBytes = new byte[]{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
    record.append(FIXED_LEN_BYTE_ARRAY_FIELD, Binary.fromConstantByteArray(fixedBytes));
    out.write(record);
  }
}
 
Example #22
Source File: TestParquetWriterNewPage.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * For every writer version and for both a low (10) and a high (1000) number of
 * distinct binary values, writes 1000 rows, reads them back, and checks that the
 * {@code binary_field} column chunk reports the expected encoding.
 */
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  Path root = new Path("target/tests/TestParquetWriter/");
  // Start from an empty output directory.
  FileSystem fs = root.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = parseMessageType(
      "message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(3) flba_field; "
      + "required int96 int96_field; "
      + "optional binary null_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  // Expected binary_field encoding, keyed by "<modulo>-<writer version>".
  Map<String, Encoding> expected = new HashMap<>();
  expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
  expected.put("1000-" + PARQUET_1_0, PLAIN);
  expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
  expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
  for (int modulo : asList(10, 1000)) {
    for (WriterVersion version : WriterVersion.values()) {
      Path file = new Path(root, version.name() + "_" + modulo);
      // try-with-resources closes the writer even when a write fails.
      try (ParquetWriter<Group> writer = new ParquetWriter<Group>(
          file,
          new GroupWriteSupport(),
          UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf)) {
        for (int i = 0; i < 1000; i++) {
          // null_field is deliberately never appended.
          writer.write(
              f.newGroup()
              .append("binary_field", "test" + (i % modulo))
              .append("int32_field", 32)
              .append("int64_field", 64L)
              .append("boolean_field", true)
              .append("float_field", 1.0f)
              .append("double_field", 2.0d)
              .append("flba_field", "foo")
              .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
        }
      }

      // Likewise, the reader is closed even when an assertion fails.
      try (ParquetReader<Group> reader =
               ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build()) {
        for (int i = 0; i < 1000; i++) {
          Group group = reader.read();
          assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
          assertEquals(32, group.getInteger("int32_field", 0));
          assertEquals(64L, group.getLong("int64_field", 0));
          assertEquals(true, group.getBoolean("boolean_field", 0));
          assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
          assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
          assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
          assertEquals(Binary.fromConstantByteArray(new byte[12]), group.getInt96("int96_field",
              0));
          assertEquals(0, group.getFieldRepetitionCount("null_field"));
        }
      }
      ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
      for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
          if (column.getPath().toDotString().equals("binary_field")) {
            String key = modulo + "-" + version;
            Encoding expectedEncoding = expected.get(key);
            assertTrue(
                key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                column.getEncodings().contains(expectedEncoding));
          }
        }
      }
    }
  }
}
 
Example #23
Source File: TestParquetWriter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * For every writer version and for both a low (10) and a high (1000) number of
 * distinct binary values, writes 1000 rows, reads them back, and checks the
 * {@code binary_field} encoding as well as the object-model footer property.
 */
@Test
public void test() throws Exception {
  Configuration conf = new Configuration();
  Path root = new Path("target/tests/TestParquetWriter/");
  enforceEmptyDir(conf, root);
  MessageType schema = parseMessageType(
      "message test { "
      + "required binary binary_field; "
      + "required int32 int32_field; "
      + "required int64 int64_field; "
      + "required boolean boolean_field; "
      + "required float float_field; "
      + "required double double_field; "
      + "required fixed_len_byte_array(3) flba_field; "
      + "required int96 int96_field; "
      + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  // Expected binary_field encoding, keyed by "<modulo>-<writer version>".
  Map<String, Encoding> expected = new HashMap<>();
  expected.put("10-" + PARQUET_1_0, PLAIN_DICTIONARY);
  expected.put("1000-" + PARQUET_1_0, PLAIN);
  expected.put("10-" + PARQUET_2_0, RLE_DICTIONARY);
  expected.put("1000-" + PARQUET_2_0, DELTA_BYTE_ARRAY);
  for (int modulo : asList(10, 1000)) {
    for (WriterVersion version : WriterVersion.values()) {
      Path file = new Path(root, version.name() + "_" + modulo);
      // try-with-resources closes the writer even when a write fails.
      try (ParquetWriter<Group> writer = new ParquetWriter<Group>(
          file,
          new GroupWriteSupport(),
          UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf)) {
        for (int i = 0; i < 1000; i++) {
          writer.write(
              f.newGroup()
              .append("binary_field", "test" + (i % modulo))
              .append("int32_field", 32)
              .append("int64_field", 64L)
              .append("boolean_field", true)
              .append("float_field", 1.0f)
              .append("double_field", 2.0d)
              .append("flba_field", "foo")
              .append("int96_field", Binary.fromConstantByteArray(new byte[12])));
        }
      }
      // Likewise, the reader is closed even when an assertion fails.
      try (ParquetReader<Group> reader =
               ParquetReader.builder(new GroupReadSupport(), file).withConf(conf).build()) {
        for (int i = 0; i < 1000; i++) {
          Group group = reader.read();
          assertEquals("test" + (i % modulo), group.getBinary("binary_field", 0).toStringUsingUTF8());
          assertEquals(32, group.getInteger("int32_field", 0));
          assertEquals(64L, group.getLong("int64_field", 0));
          assertEquals(true, group.getBoolean("boolean_field", 0));
          assertEquals(1.0f, group.getFloat("float_field", 0), 0.001);
          assertEquals(2.0d, group.getDouble("double_field", 0), 0.001);
          assertEquals("foo", group.getBinary("flba_field", 0).toStringUsingUTF8());
          assertEquals(Binary.fromConstantByteArray(new byte[12]),
              group.getInt96("int96_field",0));
        }
      }
      ParquetMetadata footer = readFooter(conf, file, NO_FILTER);
      for (BlockMetaData blockMetaData : footer.getBlocks()) {
        for (ColumnChunkMetaData column : blockMetaData.getColumns()) {
          if (column.getPath().toDotString().equals("binary_field")) {
            String key = modulo + "-" + version;
            Encoding expectedEncoding = expected.get(key);
            assertTrue(
                key + ":" + column.getEncodings() + " should contain " + expectedEncoding,
                column.getEncodings().contains(expectedEncoding));
          }
        }
      }
      assertEquals("Object model property should be example",
          "example", footer.getFileMetaData().getKeyValueMetaData()
              .get(ParquetWriter.OBJECT_MODEL_NAME_PROP));
    }
  }
}
 
Example #24
Source File: TestZstandardCodec.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Initializes {@code factory} from the schema that {@link GroupWriteSupport}
 * reads out of the job configuration.
 *
 * @param job job configuration carrying the schema
 */
public void configure(JobConf job) {
  factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(job));
}
 
Example #25
Source File: DeprecatedOutputFormatTest.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Initializes {@code factory} from the schema that {@link GroupWriteSupport}
 * reads out of the job configuration.
 *
 * @param job job configuration carrying the schema
 */
public void configure(JobConf job) {
  factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(job));
}
 
Example #26
Source File: TestInputOutputFormat.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Initializes {@code factory} from the schema that {@link GroupWriteSupport}
 * reads out of the configuration obtained from the mapper context.
 *
 * @param context mapper context whose configuration carries the schema
 */
protected void setup(org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Void, Group>.Context context) throws java.io.IOException, InterruptedException {
  factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(ContextUtil.getConfiguration(context)));
}
 
Example #27
Source File: DeprecatedInputFormatTest.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Initializes {@code factory} from the schema that {@link GroupWriteSupport}
 * reads out of the configuration obtained from the context.
 *
 * @param context context whose configuration carries the schema
 */
protected void setup(Context context) throws IOException, InterruptedException {
  factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(ContextUtil.getConfiguration(context)));
}
 
Example #28
Source File: DataGenerator.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Generates {@code nRows} rows into {@code outFile} unless the file already exists.
 *
 * @param outFile               destination of the Parquet file
 * @param configuration         configuration that receives the schema and is passed to the writer
 * @param version               writer version to use
 * @param blockSize             block size passed to the writer
 * @param pageSize              page size passed to the writer
 * @param fixedLenByteArraySize length of the {@code flba_field} column
 * @param codec                 compression codec for the file
 * @param nRows                 number of rows to write
 * @throws IOException if the existence check or the write fails
 */
public void generateData(Path outFile, Configuration configuration, ParquetProperties.WriterVersion version,
                         int blockSize, int pageSize, int fixedLenByteArraySize, CompressionCodecName codec, int nRows)
        throws IOException
{
  if (exists(configuration, outFile)) {
    System.out.println("File already exists " + outFile);
    return;
  }

  System.out.println("Generating data @ " + outFile);

  MessageType schema = parseMessageType(
          "message test { "
                  + "required binary binary_field; "
                  + "required int32 int32_field; "
                  + "required int64 int64_field; "
                  + "required boolean boolean_field; "
                  + "required float float_field; "
                  + "required double double_field; "
                  + "required fixed_len_byte_array(" + fixedLenByteArraySize +") flba_field; "
                  + "required int96 int96_field; "
                  + "} ");

  GroupWriteSupport.setSchema(schema, configuration);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  // Filler for the fixed-length field: fixedLenByteArraySize '*' characters.
  char[] chars = new char[fixedLenByteArraySize];
  Arrays.fill(chars, '*');
  String flbaValue = new String(chars);

  // try-with-resources closes the writer even when a write fails.
  try (ParquetWriter<Group> writer = new ParquetWriter<Group>(outFile, new GroupWriteSupport(), codec, blockSize,
                                                              pageSize, DICT_PAGE_SIZE, true, false, version, configuration)) {
    for (int i = 0; i < nRows; i++) {
      writer.write(
        f.newGroup()
          .append("binary_field", randomUUID().toString())
          .append("int32_field", i)
          .append("int64_field", 64L)
          .append("boolean_field", true)
          .append("float_field", 1.0f)
          .append("double_field", 2.0d)
          .append("flba_field", flbaValue)
          .append("int96_field", Binary.fromConstantByteArray(new byte[12]))
      );
    }
  }
}