Java Code Examples for org.apache.parquet.hadoop.ParquetWriter#close()

The following examples show how to use org.apache.parquet.hadoop.ParquetWriter#close(). Each example is taken from an open-source project; the project and source file are named above the code.
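Before the project examples, here is a minimal, hypothetical sketch (not taken from any of the projects below) of the safest way to make sure close() always runs. ParquetWriter implements java.io.Closeable, and close() flushes the last buffered row group and writes the Parquet footer, so a writer that is never closed leaves an unreadable file. The path, schema, and record are illustrative placeholders.

import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;

public class ParquetCloseSketch {

  // try-with-resources calls writer.close() automatically, even when
  // write() throws; this is equivalent to the try/finally pattern used
  // in several of the examples below.
  static void writeSingleRecord(Path file, Schema schema, GenericData.Record record)
      throws IOException {
    try (ParquetWriter<GenericData.Record> writer =
        AvroParquetWriter.<GenericData.Record>builder(file)
            .withSchema(schema)
            .build()) {
      writer.write(record);
    } // close() runs here: it flushes buffered data and writes the file footer
  }
}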
Example 1
Source File: IntegrationTestHelper.java    From circus-train with Apache License 2.0
URI createData(
    URI tableUri,
    Schema schema,
    String hour,
    int id,
    String fieldName,
    Object data) throws IOException {
  GenericData.Record record = new GenericData.Record(schema);
  record.put("id", id);

  if (fieldName != null) {
    Schema.Field field = schema.getField(fieldName);
    Schema fieldSchema = field.schema();
    if (data instanceof Map) {
      GenericData.Record schemaRecord = new GenericData.Record(fieldSchema);
      ((Map<String, String>) data).forEach(schemaRecord::put);
      record.put(fieldName, schemaRecord);
    } else if (data != null) {
      record.put(fieldName, data);
    }
  }

  URI partition = URI.create(tableUri + "/hour=" + hour);
  String path = partition.getPath();
  File parentFolder = new File(path);
  parentFolder.mkdirs();
  File partitionFile = new File(parentFolder, "parquet0000");
  Path filePath = new Path(partitionFile.toURI());
  ParquetWriter<GenericData.Record> writer = AvroParquetWriter.<GenericData.Record>builder(filePath)
      .withSchema(schema)
      .withConf(new Configuration())
      .build();

  try {
    writer.write(record);
  } finally {
    writer.close();
  }
  return partition;
}
 
Example 2
Source File: HiveTestUtil.java    From hudi with Apache License 2.0
@SuppressWarnings({"unchecked", "deprecation"})
private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple)
    throws IOException, URISyntaxException {
  Schema schema = getTestDataSchema(isParquetSchemaSimple);
  org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
  BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1,
      BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter);
  ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024,
      ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
      ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf());

  List<IndexedRecord> testRecords = (isParquetSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100)
      : SchemaTestUtil.generateEvolvedTestRecords(100, 100));
  testRecords.forEach(s -> {
    try {
      writer.write(s);
    } catch (IOException e) {
      fail("IOException while writing test records as parquet" + e.toString());
    }
  });
  writer.close();
}
 
Example 3
Source File: TestHoodieAvroWriteSupport.java    From hudi with Apache License 2.0
@Test
public void testAddKey(@TempDir java.nio.file.Path tempDir) throws IOException {
  List<String> rowKeys = new ArrayList<>();
  for (int i = 0; i < 1000; i++) {
    rowKeys.add(UUID.randomUUID().toString());
  }
  String filePath = tempDir.resolve("test.parquet").toAbsolutePath().toString();
  Schema schema = HoodieAvroUtils.getRecordKeySchema();
  BloomFilter filter = BloomFilterFactory.createBloomFilter(
      1000, 0.0001, 10000,
      BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
      new AvroSchemaConverter().convert(schema), schema, filter);
  ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP,
      120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
  for (String rowKey : rowKeys) {
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
    writer.write(rec);
    writeSupport.add(rowKey);
  }
  writer.close();
}
 
Example 4
Source File: TestParquetUtils.java    From hudi with Apache License 2.0
private void writeParquetFile(String typeCode, String filePath, List<String> rowKeys, Schema schema, boolean addPartitionPathField, String partitionPath) throws Exception {
  // Write out a parquet file
  BloomFilter filter = BloomFilterFactory
      .createBloomFilter(1000, 0.0001, 10000, typeCode);
  HoodieAvroWriteSupport writeSupport =
      new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
  ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP,
      120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
  for (String rowKey : rowKeys) {
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
    if (addPartitionPathField) {
      rec.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, partitionPath);
    }
    writer.write(rec);
    writeSupport.add(rowKey);
  }
  writer.close();
}
 
Example 5
Source File: SqlInterpreterTest.java    From zeppelin with Apache License 2.0
public File createParquetFile(int[] values,
                              ParquetProperties.WriterVersion version) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".par");
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType(
          "message test { "
                  + "required int32 int32_field; "
                  + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  ParquetWriter<Group> writer = new ParquetWriter<Group>(
          path,
          new GroupWriteSupport(),
          CompressionCodecName.UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
  for (int i = 0; i < values.length; i++) {
    writer.write(f.newGroup()
            .append("int32_field", values[i]));
  }
  writer.close();
  return file;
}
 
Example 6
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0
public static void writeAndTest(WriteContext context) throws IOException {
  // Create the configuration, and then apply the schema to our configuration.
  Configuration configuration = new Configuration();
  GroupWriteSupport.setSchema(context.schema, configuration);
  GroupWriteSupport groupWriteSupport = new GroupWriteSupport();

  // Create the writer properties
  final int blockSize = context.blockSize;
  final int pageSize = context.pageSize;
  final int dictionaryPageSize = pageSize;
  final boolean enableDictionary = context.enableDictionary;
  final boolean enableValidation = context.enableValidation;
  ParquetProperties.WriterVersion writerVersion = context.version;
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;

  ParquetWriter<Group> writer = new ParquetWriter<Group>(context.fsPath,
      groupWriteSupport, codec, blockSize, pageSize, dictionaryPageSize,
      enableDictionary, enableValidation, writerVersion, configuration);

  context.write(writer);
  writer.close();

  context.test();

  context.path.delete();
}
 
Example 7
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0
private static void writeData(SimpleGroupFactory f, ParquetWriter<Group> writer) throws IOException {
  for (int i = 0; i < nElements; i++) {
    int index = i % ALPHABET.length();

    Group group = f.newGroup()
        .append("binary_field", ALPHABET.substring(index, index+1))
        .append("single_value_field", "sharp")
        .append("fixed_field", DECIMAL_VALUES[i % DECIMAL_VALUES.length])
        .append("int32_field", intValues[i % intValues.length])
        .append("int64_field", longValues[i % longValues.length])
        .append("double_field", toDouble(intValues[i % intValues.length]))
        .append("float_field", toFloat(intValues[i % intValues.length]))
        .append("plain_int32_field", i)
        .append("fallback_binary_field", i < (nElements / 2) ?
            ALPHABET.substring(index, index+1) : UUID.randomUUID().toString())
        .append("int96_field", INT96_VALUES[i % INT96_VALUES.length]);

    // 10% of the time, leave the field null
    if (index % 10 > 0) {
      group.append("optional_single_value_field", "sharp");
    }

    writer.write(group);
  }
  writer.close();
}
 
Example 8
Source File: TestReadWrite.java    From parquet-mr with Apache License 2.0
@Test
public void testNestedLists() throws Exception {
  Schema schema = new Schema.Parser().parse(
    Resources.getResource("nested_array.avsc").openStream());
  Path file = new Path(createTempFile().getPath());

  // Parquet writer
  ParquetWriter parquetWriter = AvroParquetWriter.builder(file).withSchema(schema)
    .withConf(testConf)
    .build();

  Schema innerRecordSchema = schema.getField("l1").schema().getTypes()
    .get(1).getElementType().getTypes().get(1);

  GenericRecord record = new GenericRecordBuilder(schema)
    .set("l1", Collections.singletonList(
      new GenericRecordBuilder(innerRecordSchema).set("l2", Collections.singletonList("hello")).build()
    ))
    .build();

  parquetWriter.write(record);
  parquetWriter.close();

  AvroParquetReader<GenericRecord> reader = new AvroParquetReader(testConf, file);
  GenericRecord nextRecord = reader.read();

  assertNotNull(nextRecord);
  assertNotNull(nextRecord.get("l1"));
  List l1List = (List) nextRecord.get("l1");
  assertNotNull(l1List.get(0));
  List l2List = (List) ((GenericRecord) l1List.get(0)).get("l2");
  assertEquals(str("hello"), l2List.get(0));
}
 
Example 9
Source File: TestUtil.java    From flink with Apache License 2.0
public static Path createTempParquetFile(File folder, Schema schema, List<IndexedRecord> records) throws IOException {
	Path path = new Path(folder.getPath(), UUID.randomUUID().toString());
	ParquetWriter<IndexedRecord> writer = AvroParquetWriter.<IndexedRecord>builder(
		new org.apache.hadoop.fs.Path(path.toUri())).withSchema(schema).withRowGroupSize(10).build();

	for (IndexedRecord record : records) {
		writer.write(record);
	}

	writer.close();
	return path;
}
 
Example 10
Source File: ScroogeBinaryTest.java    From parquet-mr with Apache License 2.0
@Test
@SuppressWarnings("unchecked")
public void testScroogeBinaryDecoding() throws Exception {
  StringAndBinary expected = new StringAndBinary.Immutable("test",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}));

  File temp = tempDir.newFile(UUID.randomUUID().toString());
  temp.deleteOnExit();
  temp.delete();

  Path path = new Path(temp.getPath());

  ParquetWriter<StringAndBinary> writer = new ParquetWriter<StringAndBinary>(
      path, new Configuration(), new ScroogeWriteSupport<StringAndBinary>(StringAndBinary.class));
  writer.write(expected);
  writer.close();

  Configuration conf = new Configuration();
  conf.set("parquet.thrift.converter.class", ScroogeRecordConverter.class.getName());
  ParquetReader<StringAndBinary> reader = ParquetReader.<StringAndBinary>
      builder(new ScroogeReadSupport(), path)
      .withConf(conf)
      .build();
  StringAndBinary record = reader.read();
  reader.close();

  Assert.assertEquals("String should match after serialization round trip",
      "test", record.s());
  Assert.assertEquals("ByteBuffer should match after serialization round trip",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}), record.b());
}
 
Example 11
Source File: ScroogeBinaryTest.java    From parquet-mr with Apache License 2.0
@Test
public void testScroogeBinaryEncoding() throws Exception {
  StringAndBinary expected = new StringAndBinary.Immutable("test",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}));

  File temp = tempDir.newFile(UUID.randomUUID().toString());
  temp.deleteOnExit();
  temp.delete();

  Path path = new Path(temp.getPath());

  ParquetWriter<StringAndBinary> writer = new ParquetWriter<StringAndBinary>(
      path, new Configuration(), new ScroogeWriteSupport<StringAndBinary>(StringAndBinary.class));
  writer.write(expected);
  writer.close();

  // read using the parquet-thrift version to isolate the write path
  ParquetReader<org.apache.parquet.thrift.test.binary.StringAndBinary> reader = ThriftParquetReader.<org.apache.parquet.thrift.test.binary.StringAndBinary>
      build(path)
      .withThriftClass(org.apache.parquet.thrift.test.binary.StringAndBinary.class)
      .build();
  org.apache.parquet.thrift.test.binary.StringAndBinary record = reader.read();
  reader.close();

  Assert.assertEquals("String should match after serialization round trip",
      "test", record.s);
  Assert.assertEquals("ByteBuffer should match after serialization round trip",
      ByteBuffer.wrap(new byte[] {-123, 20, 33}), record.b);
}
 
Example 12
Source File: TestFiltersWithMissingColumns.java    From parquet-mr with Apache License 2.0
@Before
public void createDataFile() throws Exception {
  File file = temp.newFile("test.parquet");
  this.path = new Path(file.toString());

  MessageType type = Types.buildMessage()
      .required(INT64).named("id")
      .required(BINARY).as(UTF8).named("data")
      .named("test");

  SimpleGroupFactory factory = new SimpleGroupFactory(type);

  ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
      .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
      .withType(type)
      .build();

  try {
    for (long i = 0; i < 1000; i += 1) {
      Group g = factory.newGroup();
      g.add(0, i);
      g.add(1, "data-" + i);
      writer.write(g);
    }
  } finally {
    writer.close();
  }
}
 
Example 13
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0
/**
 * Writes a set of values to a parquet file.
 * The ParquetWriter will write the values with dictionary encoding disabled so that we test specific encodings for
 * the values.
 */
private void writeValuesToFile(Path file, PrimitiveTypeName type, List<?> values, int rowGroupSize, int pageSize, boolean enableDictionary, WriterVersion version) throws IOException {
  MessageType schema;
  if (type == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    schema = Types.buildMessage().required(type).length(FIXED_LENGTH).named("field").named("test");
  } else {
    schema = Types.buildMessage().required(type).named("field").named("test");
  }

  SimpleGroupFactory message = new SimpleGroupFactory(schema);
  GroupWriteSupport.setSchema(schema, configuration);

  ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
      .withCompressionCodec(compression)
      .withRowGroupSize(rowGroupSize)
      .withPageSize(pageSize)
      .withDictionaryPageSize(TEST_DICT_PAGE_SIZE)
      .withDictionaryEncoding(enableDictionary)
      .withWriterVersion(version)
      .withConf(configuration)
      .build();

  for (Object o: values) {
    switch (type) {
      case BOOLEAN:
        writer.write(message.newGroup().append("field", (Boolean)o));
      break;
      case INT32:
        writer.write(message.newGroup().append("field", (Integer)o));
      break;
      case INT64:
        writer.write(message.newGroup().append("field", (Long)o));
      break;
      case FLOAT:
        writer.write(message.newGroup().append("field", (Float)o));
      break;
      case DOUBLE:
        writer.write(message.newGroup().append("field", (Double)o));
      break;
      case INT96:
      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
        writer.write(message.newGroup().append("field", (Binary)o));
      break;
      default:
        throw new IllegalArgumentException("Unknown type name: " + type);
    }
  }

  writer.close();
}
 
Example 14
Source File: DirectWriterTest.java    From parquet-mr with Apache License 2.0
protected Path writeDirect(MessageType type, DirectWriter writer,
                         Map<String, String> metadata) throws IOException {
  File temp = tempDir.newFile(UUID.randomUUID().toString());
  temp.deleteOnExit();
  temp.delete();

  Path path = new Path(temp.getPath());

  ParquetWriter<Void> parquetWriter = new ParquetWriter<Void>(
      path, new DirectWriteSupport(type, writer, metadata));
  parquetWriter.write(null);
  parquetWriter.close();

  return path;
}
 
Example 15
Source File: PageChecksumDataGenerator.java    From parquet-mr with Apache License 2.0
public void generateData(Path outFile, int nRows, boolean writeChecksums,
                         CompressionCodecName compression) throws IOException {
  if (exists(configuration, outFile)) {
    System.out.println("File already exists " + outFile);
    return;
  }

  ParquetWriter<Group> writer = ExampleParquetWriter.builder(outFile)
    .withConf(configuration)
    .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
    .withCompressionCodec(compression)
    .withDictionaryEncoding(true)
    .withType(SCHEMA)
    .withPageWriteChecksumEnabled(writeChecksums)
    .build();

  GroupFactory groupFactory = new SimpleGroupFactory(SCHEMA);
  Random rand = new Random(42);
  for (int i = 0; i < nRows; i++) {
    Group group = groupFactory.newGroup();
    group
      .append("long_field", (long) i)
      .append("binary_field", randomUUID().toString())
      .addGroup("group")
      // Force dictionary encoding by performing modulo
      .append("int_field", rand.nextInt() % 100)
      .append("int_field", rand.nextInt() % 100)
      .append("int_field", rand.nextInt() % 100)
      .append("int_field", rand.nextInt() % 100);
    writer.write(group);
  }

  writer.close();
}
 
Example 16
Source File: TestParquetInLining.java    From hudi with Apache License 2.0
@Test
public void testSimpleInlineFileSystem() throws IOException {
  Path outerInMemFSPath = getRandomOuterInMemPath();
  Path outerPath = new Path(FILE_SCHEME + outerInMemFSPath.toString().substring(outerInMemFSPath.toString().indexOf(':')));
  generatedPath = outerPath;
  ParquetWriter inlineWriter = new AvroParquetWriter(outerInMemFSPath, HoodieTestDataGenerator.AVRO_SCHEMA,
      CompressionCodecName.GZIP, 100 * 1024 * 1024, 1024 * 1024, true, inMemoryConf);
  // write few records
  List<GenericRecord> recordsToWrite = getParquetHoodieRecords();
  for (GenericRecord rec : recordsToWrite) {
    inlineWriter.write(rec);
  }
  inlineWriter.close();
  byte[] inlineBytes = getBytesToInline(outerInMemFSPath);
  long startOffset = generateOuterFile(outerPath, inlineBytes);

  long inlineLength = inlineBytes.length;

  // Generate phantom inline file
  Path inlinePath = getPhantomFile(outerPath, startOffset, inlineLength);

  // instantiate Parquet reader
  ParquetReader inLineReader = AvroParquetReader.builder(inlinePath).withConf(inlineConf).build();
  List<GenericRecord> records = readParquetGenericRecords(inLineReader);
  assertArrayEquals(recordsToWrite.toArray(), records.toArray());
  inLineReader.close();
}
 
Example 17
Source File: DataGenerator.java    From parquet-mr with Apache License 2.0
public void generateData(Path outFile, Configuration configuration, ParquetProperties.WriterVersion version,
                         int blockSize, int pageSize, int fixedLenByteArraySize, CompressionCodecName codec, int nRows)
        throws IOException
{
  if (exists(configuration, outFile)) {
    System.out.println("File already exists " + outFile);
    return;
  }

  System.out.println("Generating data @ " + outFile);

  MessageType schema = parseMessageType(
          "message test { "
                  + "required binary binary_field; "
                  + "required int32 int32_field; "
                  + "required int64 int64_field; "
                  + "required boolean boolean_field; "
                  + "required float float_field; "
                  + "required double double_field; "
                  + "required fixed_len_byte_array(" + fixedLenByteArraySize +") flba_field; "
                  + "required int96 int96_field; "
                  + "} ");

  GroupWriteSupport.setSchema(schema, configuration);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  ParquetWriter<Group> writer = new ParquetWriter<Group>(outFile, new GroupWriteSupport(), codec, blockSize,
                                                         pageSize, DICT_PAGE_SIZE, true, false, version, configuration);

  //generate some data for the fixed len byte array field
  char[] chars = new char[fixedLenByteArraySize];
  Arrays.fill(chars, '*');

  for (int i = 0; i < nRows; i++) {
    writer.write(
      f.newGroup()
        .append("binary_field", randomUUID().toString())
        .append("int32_field", i)
        .append("int64_field", 64l)
        .append("boolean_field", true)
        .append("float_field", 1.0f)
        .append("double_field", 2.0d)
        .append("flba_field", new String(chars))
        .append("int96_field", Binary.fromConstantByteArray(new byte[12]))
    );
  }
  writer.close();
}