org.apache.parquet.io.OutputFile Java Examples

The following examples show how to use org.apache.parquet.io.OutputFile. They are drawn from several open source projects; each example lists the source file and the project it was taken from.
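
Before the individual examples, here is a minimal, self-contained sketch of the typical flow: obtain an OutputFile (here via HadoopOutputFile, one of the built-in implementations) and hand it to a writer builder. The schema and output path below are hypothetical, chosen only for illustration.

import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.util.HadoopOutputFile;
import org.apache.parquet.io.OutputFile;

public class OutputFileDemo {
  public static void main(String[] args) throws IOException {
    // Hypothetical schema and path, for illustration only.
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"User\",\"fields\":"
        + "[{\"name\":\"id\",\"type\":\"long\"}]}");
    OutputFile out = HadoopOutputFile.fromPath(
        new Path("/tmp/users.parquet"), new Configuration());

    try (ParquetWriter<GenericRecord> writer =
             AvroParquetWriter.<GenericRecord>builder(out)
                 .withSchema(schema)
                 .build()) {
      GenericRecord record = new GenericData.Record(schema);
      record.put("id", 1L);
      writer.write(record);
    }
  }
}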
Example #1
Source File: ParquetAvroWriters.java    From flink with Apache License 2.0
private static <T> ParquetWriter<T> createAvroParquetWriter(
		String schemaString,
		GenericData dataModel,
		OutputFile out) throws IOException {

	final Schema schema = new Schema.Parser().parse(schemaString);

	return AvroParquetWriter.<T>builder(out)
			.withSchema(schema)
			.withDataModel(dataModel)
			.build();
}
 
Example #2
Source File: WriteParquetResult.java    From nifi with Apache License 2.0
public WriteParquetResult(final Schema schema, final OutputStream out, final ParquetConfig parquetConfig, final ComponentLog componentLogger) throws IOException {
    super(out);
    this.schema = schema;
    this.componentLogger = componentLogger;

    final Configuration conf = new Configuration();
    final OutputFile outputFile = new NifiParquetOutputFile(out);

    final AvroParquetWriter.Builder<GenericRecord> writerBuilder =
            AvroParquetWriter.<GenericRecord>builder(outputFile).withSchema(schema);
    applyCommonConfig(writerBuilder, conf, parquetConfig);
    parquetWriter = writerBuilder.build();
}
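
The NifiParquetOutputFile used above is not reproduced in this listing. As a rough, hypothetical sketch of what such a stream-backed OutputFile can look like (the class and field names are invented; a raw OutputStream has no natural block size, so supportsBlockSize() returns false):

import java.io.IOException;
import java.io.OutputStream;

import org.apache.parquet.io.OutputFile;
import org.apache.parquet.io.PositionOutputStream;

public class StreamBackedOutputFile implements OutputFile {

  private final OutputStream out;

  public StreamBackedOutputFile(OutputStream out) {
    this.out = out;
  }

  @Override
  public PositionOutputStream create(long blockSizeHint) {
    return new CountingPositionOutputStream(out);
  }

  @Override
  public PositionOutputStream createOrOverwrite(long blockSizeHint) {
    return new CountingPositionOutputStream(out);
  }

  @Override
  public boolean supportsBlockSize() {
    return false; // a raw stream has no block size
  }

  @Override
  public long defaultBlockSize() {
    return 0;
  }

  // Counts the bytes written so the Parquet writer can query its position.
  private static class CountingPositionOutputStream extends PositionOutputStream {
    private final OutputStream delegate;
    private long pos = 0;

    CountingPositionOutputStream(OutputStream delegate) {
      this.delegate = delegate;
    }

    @Override
    public long getPos() {
      return pos;
    }

    @Override
    public void write(int b) throws IOException {
      delegate.write(b);
      pos++;
    }
  }
}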
 
Example #3
Source File: ParquetWriter.java    From parquet-mr with Apache License 2.0
ParquetWriter(
    OutputFile file,
    ParquetFileWriter.Mode mode,
    WriteSupport<T> writeSupport,
    CompressionCodecName compressionCodecName,
    int rowGroupSize,
    boolean validating,
    Configuration conf,
    int maxPaddingSize,
    ParquetProperties encodingProps) throws IOException {

  WriteSupport.WriteContext writeContext = writeSupport.init(conf);
  MessageType schema = writeContext.getSchema();

  ParquetFileWriter fileWriter = new ParquetFileWriter(
    file, schema, mode, rowGroupSize, maxPaddingSize,
    encodingProps.getColumnIndexTruncateLength(), encodingProps.getStatisticsTruncateLength(),
    encodingProps.getPageWriteChecksumEnabled());
  fileWriter.start();

  this.codecFactory = new CodecFactory(conf, encodingProps.getPageSizeThreshold());
  CodecFactory.BytesCompressor compressor = codecFactory.getCompressor(compressionCodecName);
  this.writer = new InternalParquetRecordWriter<T>(
      fileWriter,
      writeSupport,
      schema,
      writeContext.getExtraMetaData(),
      rowGroupSize,
      compressor,
      validating,
      encodingProps);
}
 
Example #4
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * @param file OutputFile to create or overwrite
 * @param schema the schema of the data
 * @param mode file creation mode
 * @param rowGroupSize the row group size
 * @param maxPaddingSize the maximum padding
 * @param columnIndexTruncateLength the length which the min/max values in column indexes tried to be truncated to
 * @param statisticsTruncateLength the length which the min/max values in row groups tried to be truncated to
 * @param pageWriteChecksumEnabled whether to write out page level checksums
 * @throws IOException if the file can not be created
 */
public ParquetFileWriter(OutputFile file, MessageType schema, Mode mode,
                         long rowGroupSize, int maxPaddingSize, int columnIndexTruncateLength,
                         int statisticsTruncateLength, boolean pageWriteChecksumEnabled)
  throws IOException {
  TypeUtil.checkValidWriteSchema(schema);

  this.schema = schema;

  long blockSize = rowGroupSize;
  if (file.supportsBlockSize()) {
    blockSize = Math.max(file.defaultBlockSize(), rowGroupSize);
    this.alignment = PaddingAlignment.get(blockSize, rowGroupSize, maxPaddingSize);
  } else {
    this.alignment = NoAlignment.get(rowGroupSize);
  }

  if (mode == Mode.OVERWRITE) {
    this.out = file.createOrOverwrite(blockSize);
  } else {
    this.out = file.create(blockSize);
  }

  this.encodingStatsBuilder = new EncodingStats.Builder();
  this.columnIndexTruncateLength = columnIndexTruncateLength;
  this.pageWriteChecksumEnabled = pageWriteChecksumEnabled;
  this.crc = pageWriteChecksumEnabled ? new CRC32() : null;

  this.metadataConverter = new ParquetMetadataConverter(statisticsTruncateLength);
}
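
For context, a direct instantiation of this constructor might look like the sketch below. The path and schema are hypothetical, and most applications go through a ParquetWriter builder (as in Example #3) rather than driving ParquetFileWriter by hand.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.util.HadoopOutputFile;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

class FileWriterSketch {
  static ParquetFileWriter open() throws IOException {
    // Hypothetical output path and schema, for illustration only.
    OutputFile file = HadoopOutputFile.fromPath(
        new Path("/tmp/data.parquet"), new Configuration());
    MessageType schema = MessageTypeParser.parseMessageType(
        "message m { required int64 id; }");

    ParquetFileWriter writer = new ParquetFileWriter(
        file, schema, ParquetFileWriter.Mode.CREATE,
        128L * 1024 * 1024,   // rowGroupSize
        8 * 1024 * 1024,      // maxPaddingSize
        ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
        ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH,
        ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED);
    writer.start();  // must be called before writing any row group
    return writer;
  }
}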
 
Example #5
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * @param file OutputFile to create or overwrite
 * @param schema the schema of the data
 * @param mode file creation mode
 * @param rowGroupSize the row group size
 * @param maxPaddingSize the maximum padding
 * @throws IOException if the file can not be created
 * @deprecated will be removed in 2.0.0
 */
@Deprecated
public ParquetFileWriter(OutputFile file, MessageType schema, Mode mode,
                         long rowGroupSize, int maxPaddingSize)
    throws IOException {
  this(file, schema, mode, rowGroupSize, maxPaddingSize,
      ParquetProperties.DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
      ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH,
      ParquetProperties.DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED);
}
 
Example #6
Source File: ParquetRowDataBuilder.java    From flink with Apache License 2.0
@Override
public ParquetWriter<RowData> createWriter(OutputFile out) throws IOException {
	Configuration conf = configuration.conf();
	return new ParquetRowDataBuilder(out, rowType, utcTimestamp)
			.withCompressionCodec(getParquetCompressionCodec(conf))
			.withRowGroupSize(getBlockSize(conf))
			.withPageSize(getPageSize(conf))
			.withDictionaryPageSize(getDictionaryPageSize(conf))
			.withMaxPaddingSize(conf.getInt(
					MAX_PADDING_BYTES, ParquetWriter.MAX_PADDING_SIZE_DEFAULT))
			.withDictionaryEncoding(getEnableDictionary(conf))
			.withValidation(getValidation(conf))
			.withWriterVersion(getWriterVersion(conf))
			.withConf(conf).build();
}
 
Example #7
Source File: ParquetRowDataBuilder.java    From flink with Apache License 2.0
public ParquetRowDataBuilder(
		OutputFile path,
		RowType rowType,
		boolean utcTimestamp) {
	super(path);
	this.rowType = rowType;
	this.utcTimestamp = utcTimestamp;
}
 
Example #8
Source File: ParquetAvroWriters.java    From Flink-CEPplus with Apache License 2.0
private static <T> ParquetWriter<T> createAvroParquetWriter(
		String schemaString,
		GenericData dataModel,
		OutputFile out) throws IOException {

	final Schema schema = new Schema.Parser().parse(schemaString);

	return AvroParquetWriter.<T>builder(out)
			.withSchema(schema)
			.withDataModel(dataModel)
			.build();
}
 
Example #9
Source File: ParquetIO.java    From iceberg with Apache License 2.0
private ParquetOutputFile(com.netflix.iceberg.io.OutputFile file) {
  this.file = file;
}
 
Example #10
Source File: ParquetIO.java    From iceberg with Apache License 2.0
private ParquetOutputFile(org.apache.iceberg.io.OutputFile file) {
  this.file = file;
}
 
Example #11
Source File: ParquetWriterFactory.java    From flink with Apache License 2.0
@Override
public BulkWriter<T> create(FSDataOutputStream stream) throws IOException {
	final OutputFile out = new StreamOutputFile(stream);
	final ParquetWriter<T> writer = writerBuilder.createWriter(out);
	return new ParquetBulkWriter<>(writer);
}
 
Example #12
Source File: ExampleParquetWriter.java    From parquet-mr with Apache License 2.0
private Builder(OutputFile file) {
  super(file);
}
 
Example #13
Source File: ParquetWriterFactory.java    From Flink-CEPplus with Apache License 2.0
@Override
public BulkWriter<T> create(FSDataOutputStream stream) throws IOException {
	final OutputFile out = new StreamOutputFile(stream);
	final ParquetWriter<T> writer = writerBuilder.createWriter(out);
	return new ParquetBulkWriter<>(writer);
}
 
Example #14
Source File: ParquetWriter.java    From parquet-mr with Apache License 2.0
protected Builder(OutputFile path) {
  this.file = path;
}
 
Example #15
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0
/**
 * Test whether corruption in the page content is detected by checksum verification
 */
@Test
public void testCorruptedPage() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);

  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  InputFile inputFile = HadoopInputFile.fromPath(path, conf);
  try (SeekableInputStream inputStream = inputFile.newStream()) {
    int fileLen = (int) inputFile.getLength();
    byte[] fileBytes = new byte[fileLen];
    inputStream.readFully(fileBytes);
    inputStream.close();

    // There are 4 pages in total (2 per column), we corrupt the first page of the first column
    // and the second page of the second column. We do this by altering a byte roughly in the
    // middle of each page to be corrupted
    fileBytes[fileLen / 8]++;
    fileBytes[fileLen / 8 + ((fileLen / 4) * 3)]++;

    OutputFile outputFile = HadoopOutputFile.fromPath(path, conf);
    try (PositionOutputStream outputStream = outputFile.createOrOverwrite(1024 * 1024)) {
      outputStream.write(fileBytes);
      outputStream.close();

      // First we disable checksum verification, the corruption will go undetected as it is in the
      // data section of the page
      conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
      try (ParquetFileReader reader = getParquetFileReader(path, conf,
        Arrays.asList(colADesc, colBDesc))) {
        PageReadStore pageReadStore = reader.readNextRowGroup();

        DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
        assertFalse("Data in page was not corrupted",
          Arrays.equals(colAPage1.getBytes().toByteArray(), colAPage1Bytes));
        readNextPage(colADesc, pageReadStore);
        readNextPage(colBDesc, pageReadStore);
        DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
        assertFalse("Data in page was not corrupted",
          Arrays.equals(colBPage2.getBytes().toByteArray(), colBPage2Bytes));
      }

      // Now we enable checksum verification, the corruption should be detected
      conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
      try (ParquetFileReader reader =
             getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
        // We expect an exception on the first encountered corrupt page (in readAllPages)
        assertVerificationFailed(reader);
      }
    }
  }
}
 
Example #16
Source File: AvroParquetWriter.java    From parquet-mr with Apache License 2.0
public static <T> Builder<T> builder(OutputFile file) {
  return new Builder<T>(file);
}
 
Example #17
Source File: AvroParquetWriter.java    From parquet-mr with Apache License 2.0
private Builder(OutputFile file) {
  super(file);
}
 
Example #18
Source File: ParquetBuilder.java    From flink with Apache License 2.0
/**
 * Creates and configures a parquet writer to the given output file.
 */
ParquetWriter<T> createWriter(OutputFile out) throws IOException;
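
Since ParquetBuilder declares a single abstract method, an implementation can be written as a lambda. A minimal sketch, assuming a schema string schemaString is in scope; Flink requires the builder to be serializable, which is why Example #1 above ships the schema as a string and parses it inside the method instead of capturing a Schema object:

// schemaString is an assumed serializable field in the surrounding job.
ParquetBuilder<GenericRecord> builder = out ->
    AvroParquetWriter.<GenericRecord>builder(out)
        .withSchema(new Schema.Parser().parse(schemaString))
        .build();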
 
Example #19
Source File: ParquetBuilder.java    From Flink-CEPplus with Apache License 2.0
/**
 * Creates and configures a parquet writer to the given output file.
 */
ParquetWriter<T> createWriter(OutputFile out) throws IOException;
 
Example #20
Source File: ExampleParquetWriter.java    From parquet-mr with Apache License 2.0
/**
 * Creates a Builder for configuring ParquetWriter with the example object
 * model. THIS IS AN EXAMPLE ONLY AND NOT INTENDED FOR USE.
 *
 * @param file the output file to create
 * @return a {@link Builder} to create a {@link ParquetWriter}
 */
public static Builder builder(OutputFile file) {
  return new Builder(file);
}
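
As a usage illustration only (the MessageType schema and the file variable below are assumptions, not part of the original source), writing a single record through this builder could look like:

MessageType schema = MessageTypeParser.parseMessageType(
    "message example { required int64 id; }");
try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
    .withType(schema)
    .build()) {
  // SimpleGroupFactory creates Group records matching the schema
  writer.write(new SimpleGroupFactory(schema).newGroup().append("id", 1L));
}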
 