org.apache.parquet.io.PositionOutputStream Java Examples

The following examples show how to use org.apache.parquet.io.PositionOutputStream. They are taken from open-source projects; the originating project, source file, and license are noted above each example.
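PositionOutputStream itself is a small abstraction: an OutputStream that can also report its current write position via getPos(), which parquet-mr uses to record byte offsets for footers, indexes, and bloom filters. As a minimal sketch of the contract (this adapter is not part of parquet-mr; the class name is illustrative), an implementation can wrap any OutputStream and count the bytes written:

import java.io.IOException;
import java.io.OutputStream;

import org.apache.parquet.io.PositionOutputStream;

// Illustrative adapter: tracks the write position by counting bytes written.
public class CountingPositionOutputStream extends PositionOutputStream {
  private final OutputStream delegate;
  private long pos = 0L;

  public CountingPositionOutputStream(OutputStream delegate) {
    this.delegate = delegate;
  }

  @Override
  public long getPos() throws IOException {
    return pos; // number of bytes written so far
  }

  @Override
  public void write(int b) throws IOException {
    delegate.write(b);
    pos += 1;
  }

  @Override
  public void write(byte[] b, int off, int len) throws IOException {
    delegate.write(b, off, len);
    pos += len;
  }

  @Override
  public void flush() throws IOException {
    delegate.flush();
  }

  @Override
  public void close() throws IOException {
    delegate.close();
  }
}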
Example #1
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private static void serializeBloomFilters(
  List<Map<String, BloomFilter>> bloomFilters,
  List<BlockMetaData> blocks,
  PositionOutputStream out) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
    if (blockBloomFilters.isEmpty()) continue;
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
      if (bloomFilter == null) {
        continue;
      }

      long offset = out.getPos();
      column.setBloomFilterOffset(offset);
      Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out);
      bloomFilter.writeTo(out);
    }
  }
}
 
Example #2
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private static void serializeOffsetIndexes(
    List<List<OffsetIndex>> offsetIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: offset indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<OffsetIndex> blockOffsetIndexes = offsetIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex);
      if (offsetIndex == null) {
        continue;
      }
      ColumnChunkMetaData column = columns.get(cIndex);
      long offset = out.getPos();
      Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), out);
      column.setOffsetIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
 
Example #3
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private static void serializeColumnIndexes(
    List<List<ColumnIndex>> columnIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: column indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<ColumnIndex> blockColumnIndexes = columnIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      org.apache.parquet.format.ColumnIndex columnIndex = ParquetMetadataConverter
          .toParquetColumnIndex(column.getPrimitiveType(), blockColumnIndexes.get(cIndex));
      if (columnIndex == null) {
        continue;
      }
      long offset = out.getPos();
      Util.writeColumnIndex(columnIndex, out);
      column.setColumnIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
 
Example #4
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * Copy from a FS input stream to an output stream. Thread-safe
 *
 * @param from a {@link SeekableInputStream}
 * @param to any {@link PositionOutputStream}
 * @param start where in the from stream to start copying
 * @param length the number of bytes to copy
 * @throws IOException if there is an error while reading or writing
 */
private static void copy(SeekableInputStream from, PositionOutputStream to,
                         long start, long length) throws IOException {
  LOG.debug("Copying {} bytes at {} to {}", length, start, to.getPos());
  from.seek(start);
  long bytesCopied = 0;
  byte[] buffer = COPY_BUFFER.get(); // per-thread buffer, which is what makes concurrent copies safe
  while (bytesCopied < length) {
    long bytesLeft = length - bytesCopied;
    int bytesRead = from.read(buffer, 0,
        (buffer.length < bytesLeft ? buffer.length : (int) bytesLeft));
    if (bytesRead < 0) {
      throw new IllegalArgumentException(
          "Unexpected end of input file at " + start + bytesCopied);
    }
    to.write(buffer, 0, bytesRead);
    bytesCopied += bytesRead;
  }
}
 
Example #5
Source File: StreamOutputFile.java    From Flink-CEPplus with Apache License 2.0
@Override
public PositionOutputStream create(long blockSizeHint) {
	if (used.compareAndSet(false, true)) {
		return new PositionOutputStreamAdapter(stream);
	}
	else {
		throw new IllegalStateException("A stream against this file was already created.");
	}
}
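The compareAndSet on the atomic flag guarantees that the single wrapped stream is handed out at most once; a second call to create fails fast rather than giving two writers the same underlying stream.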
 
Example #6
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private static void serializeFooter(ParquetMetadata footer, PositionOutputStream out) throws IOException {
  long footerIndex = out.getPos();
  ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter();
  org.apache.parquet.format.FileMetaData parquetMetadata = metadataConverter.toParquetMetadata(CURRENT_VERSION, footer);
  writeFileMetaData(parquetMetadata, out);
  LOG.debug("{}: footer length = {}" , out.getPos(), (out.getPos() - footerIndex));
  BytesUtils.writeIntLittleEndian(out, (int) (out.getPos() - footerIndex));
  out.write(MAGIC);
}
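This matches the Parquet file layout: the serialized footer is followed by its length as a 4-byte little-endian integer and the 4-byte magic PAR1, so a reader can locate the footer by inspecting the last 8 bytes of the file.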
 
Example #7
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
private static void writeMetadataFile(Path outputPath, ParquetMetadata metadataFooter, FileSystem fs)
    throws IOException {
  PositionOutputStream metadata = HadoopStreams.wrap(fs.create(outputPath));
  metadata.write(MAGIC);
  serializeFooter(metadataFooter, metadata);
  metadata.close();
}
 
Example #8
Source File: StreamOutputFile.java    From flink with Apache License 2.0
@Override
public PositionOutputStream create(long blockSizeHint) {
	if (used.compareAndSet(false, true)) {
		return new PositionOutputStreamAdapter(stream);
	}
	else {
		throw new IllegalStateException("A stream against this file was already created.");
	}
}
 
Example #9
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
@Override
public void alignForRowGroup(PositionOutputStream out) throws IOException {
  long remaining = dfsBlockSize - (out.getPos() % dfsBlockSize);

  if (isPaddingNeeded(remaining)) {
    LOG.debug("Adding {} bytes of padding (row group size={}B, block size={}B)", remaining, rowGroupSize, dfsBlockSize);
    for (; remaining > 0; remaining -= zeros.length) {
      out.write(zeros, 0, (int) Math.min((long) zeros.length, remaining));
    }
  }
}
 
Example #10
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
@Override
public long nextRowGroupSize(PositionOutputStream out) throws IOException {
  if (maxPaddingSize <= 0) {
    return rowGroupSize;
  }

  long remaining = dfsBlockSize - (out.getPos() % dfsBlockSize);

  if (isPaddingNeeded(remaining)) {
    return rowGroupSize;
  }

  return Math.min(remaining, rowGroupSize);
}
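To make the padding strategy concrete, here is a small worked sketch of the arithmetic shared by alignForRowGroup and nextRowGroupSize (the values are illustrative, not from the source):

long dfsBlockSize = 128L * 1024 * 1024;               // 128 MiB HDFS block
long pos = 134_000_000L;                              // hypothetical stream position
long remaining = dfsBlockSize - (pos % dfsBlockSize); // 217_728 bytes to the boundary
// If remaining falls below the padding threshold, alignForRowGroup fills it
// with zeros so the next row group starts exactly on a block boundary;
// otherwise nextRowGroupSize caps the next row group at the remaining space
// so it does not straddle two HDFS blocks.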
 
Example #11
Source File: DataLoad.java    From arvo2parquet with MIT License
private static void extractMetaDataFooter(final Path parquetFilePath) throws IOException {
  try (final ParquetFileReader rdr = ParquetFileReader.open(nioPathToInputFile(parquetFilePath))) {
    final ParquetMetadata footer = rdr.getFooter();
    final Path metaDataOutPath = Paths.get(ParquetFileWriter.PARQUET_METADATA_FILE + "_dup.parquet");
    Files.deleteIfExists(metaDataOutPath);
    try (final PositionOutputStream out = nioPathToOutputFile(metaDataOutPath).createOrOverwrite(0)) {
      serializeFooter(footer, out);
    }
  }
}
 
Example #12
Source File: DataLoad.java    From arvo2parquet with MIT License
private static void writeToParquet(@Nonnull final Schema schema,
                                   @Nonnull final Path fileToWrite,
                                   @Nonnull final GenericDataRecordSink sink) throws IOException
{
  try (final ParquetWriter<GenericData.Record> writer = createParquetWriterInstance(schema, fileToWrite)) {
    // Drain the sink into the writer; the loop body is intentionally empty.
    //noinspection StatementWithEmptyBody
    do ; while (sink.accept(writer::write));
    // Close explicitly so the footer below reflects the fully written file.
    writer.close();
    final Path metaDataOutPath = Paths.get(ParquetFileWriter.PARQUET_METADATA_FILE);
    Files.deleteIfExists(metaDataOutPath);
    try (final PositionOutputStream out = nioPathToOutputFile(metaDataOutPath).createOrOverwrite(0)) {
      serializeFooter(writer.getFooter(), out);
    }
  }
}
 
Example #13
Source File: StreamOutputFile.java    From flink with Apache License 2.0
@Override
public PositionOutputStream create(long blockSizeHint) {
	if (used.compareAndSet(false, true)) {
		return new PositionOutputStreamAdapter(stream);
	}
	else {
		throw new IllegalStateException("A stream against this file was already created.");
	}
}
 
Example #14
Source File: HadoopOutputFile.java    From parquet-mr with Apache License 2.0
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException {
  return HadoopStreams.wrap(fs.create(path, true /* overwrite if exists */,
      DFS_BUFFER_SIZE_DEFAULT, fs.getDefaultReplication(path),
      Math.max(fs.getDefaultBlockSize(path), blockSizeHint)));
}
 
Example #15
Source File: HadoopOutputFile.java    From parquet-mr with Apache License 2.0
@Override
public PositionOutputStream create(long blockSizeHint) throws IOException {
  return HadoopStreams.wrap(fs.create(path, false /* do not overwrite */,
      DFS_BUFFER_SIZE_DEFAULT, fs.getDefaultReplication(path),
      Math.max(fs.getDefaultBlockSize(path), blockSizeHint)));
}
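A short usage sketch tying the two factory methods above together (the path, Configuration, and demo class are illustrative assumptions, not from the source):

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.util.HadoopOutputFile;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.io.PositionOutputStream;

public class HadoopOutputFileDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // createOrOverwrite replaces an existing file; create would throw instead.
    OutputFile file = HadoopOutputFile.fromPath(new Path("/tmp/example.parquet"), conf);
    try (PositionOutputStream out = file.createOrOverwrite(0 /* no block size hint */)) {
      out.write("PAR1".getBytes(StandardCharsets.US_ASCII)); // Parquet magic bytes
      System.out.println(out.getPos()); // prints 4: position tracks bytes written
    }
  }
}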
 
Example #16
Source File: TestColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0
PositionOutputStream out() {
  return out;
}
 
Example #17
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
@Override
public long nextRowGroupSize(PositionOutputStream out) {
  return rowGroupSize;
}
 
Example #18
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
@Override
public void alignForRowGroup(PositionOutputStream out) {
}
 
Example #19
Source File: TestColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0
@Override
public PositionOutputStream create(long blockSizeHint) throws IOException {
  return out = file.create(blockSizeHint);
}
 
Example #20
Source File: TestColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException {
  return out = file.createOrOverwrite(blockSizeHint);
}
 
Example #21
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0
/**
 * Test whether corruption in the page content is detected by checksum verification
 */
@Test
public void testCorruptedPage() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);

  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  InputFile inputFile = HadoopInputFile.fromPath(path, conf);
  try (SeekableInputStream inputStream = inputFile.newStream()) {
    int fileLen = (int) inputFile.getLength();
    byte[] fileBytes = new byte[fileLen];
    inputStream.readFully(fileBytes);
    inputStream.close(); // close early: the file is overwritten below

    // There are 4 pages in total (2 per column), we corrupt the first page of the first column
    // and the second page of the second column. We do this by altering a byte roughly in the
    // middle of each page to be corrupted
    fileBytes[fileLen / 8]++;
    fileBytes[fileLen / 8 + ((fileLen / 4) * 3)]++;

    OutputFile outputFile = HadoopOutputFile.fromPath(path, conf);
    try (PositionOutputStream outputStream = outputFile.createOrOverwrite(1024 * 1024)) {
      outputStream.write(fileBytes);
      outputStream.close(); // flush and close before reading the file back

      // First we disable checksum verification, the corruption will go undetected as it is in the
      // data section of the page
      conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
      try (ParquetFileReader reader = getParquetFileReader(path, conf,
        Arrays.asList(colADesc, colBDesc))) {
        PageReadStore pageReadStore = reader.readNextRowGroup();

        DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
        assertFalse("Data in page was not corrupted",
          Arrays.equals(colAPage1.getBytes().toByteArray(), colAPage1Bytes));
        readNextPage(colADesc, pageReadStore);
        readNextPage(colBDesc, pageReadStore);
        DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
        assertFalse("Data in page was not corrupted",
          Arrays.equals(colBPage2.getBytes().toByteArray(), colBPage2Bytes));
      }

      // Now we enable checksum verification, the corruption should be detected
      conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
      try (ParquetFileReader reader =
             getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
        // We expect an exception on the first encountered corrupt page (in readAllPages)
        assertVerificationFailed(reader);
      }
    }
  }
}
 
Example #22
Source File: NifiParquetOutputFile.java    From nifi with Apache License 2.0
@Override
public PositionOutputStream create(long blockSizeHint) {
  return new NifiOutputStream(outputStream);
}
 
Example #23
Source File: NifiParquetOutputFile.java    From nifi with Apache License 2.0
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) {
  return new NifiOutputStream(outputStream);
}
 
Example #24
Source File: StreamOutputFile.java    From flink with Apache License 2.0
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) {
	return create(blockSizeHint);
}
 
Example #25
Source File: ParquetIO.java    From beam with Apache License 2.0
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) {
  return new BeamOutputStream(outputStream);
}
 
Example #26
Source File: ParquetIO.java    From beam with Apache License 2.0
@Override
public PositionOutputStream create(long blockSizeHint) {
  return new BeamOutputStream(outputStream);
}
 
Example #27
Source File: OutputFile.java    From dremio-oss with Apache License 2.0
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException {
  return new PositionOutputStreamWrapper(fs.create(path, true));
}
 
Example #28
Source File: OutputFile.java    From dremio-oss with Apache License 2.0
@Override
public PositionOutputStream create(long blockSizeHint) throws IOException {
  return new PositionOutputStreamWrapper(fs.create(path));
}
 
Example #29
Source File: ParquetIO.java    From iceberg with Apache License 2.0
@Override
public PositionOutputStream createOrOverwrite(long ignored) throws IOException {
  return stream(file.createOrOverwrite());
}
 
Example #30
Source File: ParquetIO.java    From iceberg with Apache License 2.0
@Override
public PositionOutputStream create(long ignored) throws IOException {
  return stream(file.create());
}