org.apache.parquet.io.PositionOutputStream Java Examples

Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes the bloom filters of all blocks to {@code out}, block by block and, within a
 * block, in column order. For every column that has a bloom filter, the stream position
 * at which its header starts is stored on the column metadata before anything is written.
 * Blocks with an empty filter map and columns without a filter are skipped.
 *
 * @param bloomFilters per-block maps keyed by the column path in dot-string form;
 *                     indexed in parallel with {@code blocks}
 * @param blocks the blocks whose column metadata receive the bloom filter offsets
 * @param out the stream the headers and filters are written to
 * @throws IOException if writing to {@code out} or querying its position fails
 */
private static void serializeBloomFilters(
  List<Map<String, BloomFilter>> bloomFilters,
  List<BlockMetaData> blocks,
  PositionOutputStream out) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());
  final int blockCount = blocks.size();
  for (int blockIdx = 0; blockIdx < blockCount; blockIdx++) {
    List<ColumnChunkMetaData> chunks = blocks.get(blockIdx).getColumns();
    Map<String, BloomFilter> filtersByPath = bloomFilters.get(blockIdx);
    if (!filtersByPath.isEmpty()) {
      for (ColumnChunkMetaData chunk : chunks) {
        BloomFilter filter = filtersByPath.get(chunk.getPath().toDotString());
        if (filter != null) {
          // Capture the position first: it marks where this filter's header begins.
          chunk.setBloomFilterOffset(out.getPos());
          Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(filter), out);
          filter.writeTo(out);
        }
      }
    }
  }
}

Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes every non-null offset index to {@code out}, block by block and column by column,
 * and stores on the matching column metadata an {@link IndexReference} holding the start
 * position and the number of bytes written for that index.
 *
 * @param offsetIndexes per-block lists of offset indexes, indexed in parallel with the
 *                      columns of the corresponding block; entries may be {@code null}
 * @param blocks the blocks whose column metadata receive the index references
 * @param out the stream the indexes are written to
 * @throws IOException if writing to {@code out} or querying its position fails
 */
private static void serializeOffsetIndexes(
    List<List<OffsetIndex>> offsetIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: offset indexes", out.getPos());
  final int blockCount = blocks.size();
  for (int blockIdx = 0; blockIdx < blockCount; blockIdx++) {
    List<ColumnChunkMetaData> chunks = blocks.get(blockIdx).getColumns();
    List<OffsetIndex> indexesOfBlock = offsetIndexes.get(blockIdx);
    final int chunkCount = chunks.size();
    for (int chunkIdx = 0; chunkIdx < chunkCount; chunkIdx++) {
      OffsetIndex index = indexesOfBlock.get(chunkIdx);
      if (index != null) {
        ColumnChunkMetaData chunk = chunks.get(chunkIdx);
        final long startPos = out.getPos();
        Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(index), out);
        // Length is derived from how far the stream advanced while writing the index.
        final int writtenLength = (int) (out.getPos() - startPos);
        chunk.setOffsetIndexReference(new IndexReference(startPos, writtenLength));
      }
    }
  }
}

Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Converts each column index to its Parquet format representation and writes it to
 * {@code out}, block by block and column by column. For every index written, an
 * {@link IndexReference} with the start position and written length is stored on the
 * matching column metadata. Columns whose converted index is {@code null} are skipped.
 *
 * @param columnIndexes per-block lists of column indexes, indexed in parallel with the
 *                      columns of the corresponding block
 * @param blocks the blocks whose column metadata receive the index references
 * @param out the stream the indexes are written to
 * @throws IOException if writing to {@code out} or querying its position fails
 */
private static void serializeColumnIndexes(
    List<List<ColumnIndex>> columnIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: column indexes", out.getPos());
  final int blockCount = blocks.size();
  for (int blockIdx = 0; blockIdx < blockCount; blockIdx++) {
    List<ColumnChunkMetaData> chunks = blocks.get(blockIdx).getColumns();
    List<ColumnIndex> indexesOfBlock = columnIndexes.get(blockIdx);
    final int chunkCount = chunks.size();
    for (int chunkIdx = 0; chunkIdx < chunkCount; chunkIdx++) {
      ColumnChunkMetaData chunk = chunks.get(chunkIdx);
      org.apache.parquet.format.ColumnIndex converted =
          ParquetMetadataConverter.toParquetColumnIndex(
              chunk.getPrimitiveType(), indexesOfBlock.get(chunkIdx));
      if (converted != null) {
        final long startPos = out.getPos();
        Util.writeColumnIndex(converted, out);
        // Length is derived from how far the stream advanced while writing the index.
        final int writtenLength = (int) (out.getPos() - startPos);
        chunk.setColumnIndexReference(new IndexReference(startPos, writtenLength));
      }
    }
  }
}

Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Copy from a FS input stream to an output stream. Thread-safe
 *
 * @param from a {@link SeekableInputStream}
 * @param to any {@link PositionOutputStream}
 * @param start where in the from stream to start copying
 * @param length the number of bytes to copy
 * @throws IOException if there is an error while reading or writing
 * @throws IllegalArgumentException if the input ends before {@code length} bytes were read
 */
private static void copy(SeekableInputStream from, PositionOutputStream to,
                         long start, long length) throws IOException{
  LOG.debug("Copying {} bytes at {} to {}" ,length , start , to.getPos());
  from.seek(start);
  long bytesCopied = 0;
  byte[] buffer = COPY_BUFFER.get();
  while (bytesCopied < length) {
    long bytesLeft = length - bytesCopied;
    // Never request more than the buffer holds or than is still outstanding.
    int bytesRead = from.read(buffer, 0,
        (buffer.length < bytesLeft ? buffer.length : (int) bytesLeft));
    if (bytesRead < 0) {
      // Parenthesize the sum: without it the two numbers are appended as strings
      // (e.g. start=100, copied=20 would print "10020" instead of "120").
      throw new IllegalArgumentException(
          "Unexpected end of input file at " + (start + bytesCopied));
    }
    to.write(buffer, 0, bytesRead);
    bytesCopied += bytesRead;
  }
}

Source File: StreamOutputFile.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Hands out an adapter around the wrapped stream, but only once: the call succeeds only
 * when {@code used.compareAndSet(false, true)} succeeds, and fails otherwise.
 *
 * @param blockSizeHint not used by this implementation
 * @return a new {@link PositionOutputStreamAdapter} around {@code stream}
 * @throws IllegalStateException if a stream was already created for this file
 */
@Override
public PositionOutputStream create(long blockSizeHint) {
	if (!used.compareAndSet(false, true)) {
		throw new IllegalStateException("A stream against this file was already created.");
	}
	return new PositionOutputStreamAdapter(stream);
}

Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes the file footer to {@code out}: the converted file metadata, then the metadata
 * length as a little-endian int, then {@code MAGIC}.
 *
 * @param footer the metadata to convert and write
 * @param out the stream to write to
 * @throws IOException if writing to {@code out} or querying its position fails
 */
private static void serializeFooter(ParquetMetadata footer, PositionOutputStream out) throws IOException {
  final long footerStart = out.getPos();
  org.apache.parquet.format.FileMetaData converted =
      new ParquetMetadataConverter().toParquetMetadata(CURRENT_VERSION, footer);
  writeFileMetaData(converted, out);
  LOG.debug("{}: footer length = {}" , out.getPos(), (out.getPos() - footerStart));
  // The length is measured from the stream positions before and after the metadata.
  final int footerLength = (int) (out.getPos() - footerStart);
  BytesUtils.writeIntLittleEndian(out, footerLength);
  out.write(MAGIC);
}

Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes a metadata file at {@code outputPath}: {@code MAGIC} followed by the
 * serialized footer. The output stream is closed even when writing fails.
 *
 * @param outputPath the path of the file to create
 * @param metadataFooter the footer to serialize into the file
 * @param fs the file system used to create the file
 * @throws IOException if the file cannot be created or written
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
private static void writeMetadataFile(Path outputPath, ParquetMetadata metadataFooter, FileSystem fs)
    throws IOException {
  // try-with-resources so the stream is not leaked if a write throws.
  try (PositionOutputStream metadata = HadoopStreams.wrap(fs.create(outputPath))) {
    metadata.write(MAGIC);
    serializeFooter(metadataFooter, metadata);
  }
}

Source File: StreamOutputFile.java From flink with Apache License 2.0

5 votes

/**
 * Returns an adapter around the wrapped stream on the first call only; the call succeeds
 * only when {@code used.compareAndSet(false, true)} succeeds.
 *
 * @param blockSizeHint not used by this implementation
 * @return a new {@link PositionOutputStreamAdapter} around {@code stream}
 * @throws IllegalStateException if a stream was already created for this file
 */
@Override
public PositionOutputStream create(long blockSizeHint) {
	final boolean firstUse = used.compareAndSet(false, true);
	if (!firstUse) {
		throw new IllegalStateException("A stream against this file was already created.");
	}
	return new PositionOutputStreamAdapter(stream);
}

Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Pads {@code out} with zero bytes up to the next multiple of {@code dfsBlockSize},
 * but only when {@code isPaddingNeeded} says so for the bytes left in the current block.
 *
 * @param out the stream to pad
 * @throws IOException if writing to {@code out} or querying its position fails
 */
@Override
public void alignForRowGroup(PositionOutputStream out) throws IOException {
  long padding = dfsBlockSize - (out.getPos() % dfsBlockSize);
  if (!isPaddingNeeded(padding)) {
    return;
  }

  LOG.debug("Adding {} bytes of padding (row group size={}B, block size={}B)", padding, rowGroupSize, dfsBlockSize);
  // Emit the padding in chunks of at most zeros.length bytes.
  while (padding > 0) {
    final int chunk = (int) Math.min((long) zeros.length, padding);
    out.write(zeros, 0, chunk);
    padding -= zeros.length;
  }
}

Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Computes the size to use for the next row group. When padding is disabled
 * ({@code maxPaddingSize <= 0}) or padding is needed for the bytes left in the current
 * block, the configured {@code rowGroupSize} is returned; otherwise the result is capped
 * at the bytes left in the current block.
 *
 * @param out the stream whose current position is inspected
 * @return the size for the next row group
 * @throws IOException if querying the position of {@code out} fails
 */
@Override
public long nextRowGroupSize(PositionOutputStream out) throws IOException {
  if (maxPaddingSize <= 0) {
    return rowGroupSize;
  }

  final long leftInBlock = dfsBlockSize - (out.getPos() % dfsBlockSize);
  return isPaddingNeeded(leftInBlock)
      ? rowGroupSize
      : Math.min(leftInBlock, rowGroupSize);
}

Source File: DataLoad.java From arvo2parquet with MIT License

5 votes

/**
 * Reads the footer of the given Parquet file and serializes it into a fresh file named
 * {@code ParquetFileWriter.PARQUET_METADATA_FILE + "_dup.parquet"}; any existing file of
 * that name is deleted first.
 *
 * @param parquetFilePath the Parquet file whose footer is extracted
 * @throws IOException if reading the input or writing the output fails
 */
private static void extractMetaDataFooter(final Path parquetFilePath) throws IOException {
  try (final ParquetFileReader reader = ParquetFileReader.open(nioPathToInputFile(parquetFilePath))) {
    final ParquetMetadata fileFooter = reader.getFooter();
    final String dupFileName = ParquetFileWriter.PARQUET_METADATA_FILE + "_dup.parquet";
    final Path dupPath = Paths.get(dupFileName);
    Files.deleteIfExists(dupPath);
    try (final PositionOutputStream sinkStream = nioPathToOutputFile(dupPath).createOrOverwrite(0)) {
      serializeFooter(fileFooter, sinkStream);
    }
  }
}

Source File: DataLoad.java From arvo2parquet with MIT License

5 votes

/**
 * Drains {@code sink} into a Parquet writer for {@code fileToWrite}, then serializes the
 * writer's footer into a file named {@code ParquetFileWriter.PARQUET_METADATA_FILE};
 * any existing file of that name is deleted first.
 *
 * @param schema the schema used to create the writer
 * @param fileToWrite the Parquet file to write
 * @param sink the record source; polled until {@code accept} returns {@code false}
 * @throws IOException if writing the data or the metadata file fails
 */
private static void writeToParquet(@Nonnull final Schema schema,
                                   @Nonnull final Path fileToWrite,
                                   @Nonnull final GenericDataRecordSink sink) throws IOException
{
  try (final ParquetWriter<GenericData.Record> parquetWriter = createParquetWriterInstance(schema, fileToWrite)) {
    //noinspection StatementWithEmptyBody
    while (sink.accept(parquetWriter::write)) {
      // All work happens inside accept(); loop until it reports nothing more to write.
    }
    // Closed explicitly before the footer is requested (presumably the footer is only
    // available once the writer is closed — confirm against ParquetWriter).
    parquetWriter.close();
    final Path metaPath = Paths.get(ParquetFileWriter.PARQUET_METADATA_FILE);
    Files.deleteIfExists(metaPath);
    try (final PositionOutputStream metaStream = nioPathToOutputFile(metaPath).createOrOverwrite(0)) {
      serializeFooter(parquetWriter.getFooter(), metaStream);
    }
  }
}

Source File: StreamOutputFile.java From flink with Apache License 2.0

5 votes

/**
 * Creates the single stream allowed for this file. The call succeeds only when
 * {@code used.compareAndSet(false, true)} succeeds; every other call is rejected.
 *
 * @param blockSizeHint not used by this implementation
 * @return a new {@link PositionOutputStreamAdapter} around {@code stream}
 * @throws IllegalStateException if a stream was already created for this file
 */
@Override
public PositionOutputStream create(long blockSizeHint) {
	if (!used.compareAndSet(false, true)) {
		throw new IllegalStateException("A stream against this file was already created.");
	}

	return new PositionOutputStreamAdapter(stream);
}

Source File: HadoopOutputFile.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Creates the file at {@code path}, overwriting it if it exists, and wraps the resulting
 * stream. The block size is the larger of the file system default and the hint.
 *
 * @param blockSizeHint requested block size; used when larger than the default
 * @return the wrapped output stream
 * @throws IOException if the file cannot be created
 */
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException {
  final short replication = fs.getDefaultReplication(path);
  final long blockSize = Math.max(fs.getDefaultBlockSize(path), blockSizeHint);
  return HadoopStreams.wrap(
      fs.create(path, true /* overwrite if exists */, DFS_BUFFER_SIZE_DEFAULT, replication, blockSize));
}

Source File: HadoopOutputFile.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Creates the file at {@code path} without overwriting an existing one and wraps the
 * resulting stream. The block size is the larger of the file system default and the hint.
 *
 * @param blockSizeHint requested block size; used when larger than the default
 * @return the wrapped output stream
 * @throws IOException if the file cannot be created
 */
@Override
public PositionOutputStream create(long blockSizeHint) throws IOException {
  final short replication = fs.getDefaultReplication(path);
  final long blockSize = Math.max(fs.getDefaultBlockSize(path), blockSizeHint);
  return HadoopStreams.wrap(
      fs.create(path, false /* do not overwrite */, DFS_BUFFER_SIZE_DEFAULT, replication, blockSize));
}

Source File: TestColumnChunkPageWriteStore.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Returns the current value of {@code out}.
 *
 * @return the stream held in {@code out}
 */
PositionOutputStream out() {
  return out;
}

Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Always returns the configured {@code rowGroupSize}; the stream position is not consulted.
 *
 * @param out ignored by this implementation
 * @return {@code rowGroupSize}
 */
@Override
public long nextRowGroupSize(PositionOutputStream out) {
  return rowGroupSize;
}

Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Does nothing: this implementation writes no padding.
 *
 * @param out ignored by this implementation
 */
@Override
public void alignForRowGroup(PositionOutputStream out) {
  // Intentionally empty.
}

Source File: TestColumnChunkPageWriteStore.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Delegates to {@code file.create} and remembers the returned stream in {@code out}.
 *
 * @param blockSizeHint passed through to the delegate
 * @return the stream created by the delegate
 * @throws IOException if the delegate fails to create the stream
 */
@Override
public PositionOutputStream create(long blockSizeHint) throws IOException {
  out = file.create(blockSizeHint);
  return out;
}

Source File: TestColumnChunkPageWriteStore.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Delegates to {@code file.createOrOverwrite} and remembers the returned stream in
 * {@code out}.
 *
 * @param blockSizeHint passed through to the delegate
 * @return the stream created by the delegate
 * @throws IOException if the delegate fails to create the stream
 */
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException {
  out = file.createOrOverwrite(blockSizeHint);
  return out;
}

Source File: TestDataPageV1Checksums.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Test whether corruption in the page content is detected by checksum verification.
 *
 * <p>The test writes a file with page checksums enabled, reads all of its bytes, increments
 * two of them, writes the modified bytes back to the same path, and then reads the file
 * twice: once with checksum verification disabled (the altered data is returned) and once
 * with it enabled (verification is expected to fail).
 *
 * @throws IOException if reading or rewriting the test file fails
 */
@Test
public void testCorruptedPage() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);

  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  // Load the whole file into memory so individual bytes can be altered.
  InputFile inputFile = HadoopInputFile.fromPath(path, conf);
  try (SeekableInputStream inputStream = inputFile.newStream()) {
    int fileLen = (int) inputFile.getLength();
    byte[] fileBytes = new byte[fileLen];
    inputStream.readFully(fileBytes);
    inputStream.close();

    // There are 4 pages in total (2 per column), we corrupt the first page of the first column
    // and the second page of the second column. We do this by altering a byte roughly in the
    // middle of each page to be corrupted
    fileBytes[fileLen / 8]++;
    fileBytes[fileLen / 8 + ((fileLen / 4) * 3)]++;

    // Overwrite the original file with the altered bytes.
    OutputFile outputFile = HadoopOutputFile.fromPath(path, conf);
    try (PositionOutputStream outputStream = outputFile.createOrOverwrite(1024 * 1024)) {
      outputStream.write(fileBytes);
      // Closed explicitly here, before the file is read back below.
      outputStream.close();

      // First we disable checksum verification, the corruption will go undetected as it is in the
      // data section of the page
      conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
      try (ParquetFileReader reader = getParquetFileReader(path, conf,
        Arrays.asList(colADesc, colBDesc))) {
        PageReadStore pageReadStore = reader.readNextRowGroup();

        DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
        assertFalse("Data in page was not corrupted",
          Arrays.equals(colAPage1.getBytes().toByteArray(), colAPage1Bytes));
        // Skip the second page of column A and the first page of column B.
        readNextPage(colADesc, pageReadStore);
        readNextPage(colBDesc, pageReadStore);
        DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
        assertFalse("Data in page was not corrupted",
          Arrays.equals(colBPage2.getBytes().toByteArray(), colBPage2Bytes));
      }

      // Now we enable checksum verification, the corruption should be detected
      conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
      try (ParquetFileReader reader =
             getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
        // We expect an exception on the first encountered corrupt page (in readAllPages)
        assertVerificationFailed(reader);
      }
    }
  }
}

Source File: NifiParquetOutputFile.java From nifi with Apache License 2.0

4 votes

/**
 * Wraps the held {@code outputStream} in a new {@link NifiOutputStream}.
 *
 * @param blockSizeHint not used by this implementation
 * @return a new stream wrapping {@code outputStream}
 */
@Override
public PositionOutputStream create(long blockSizeHint) {
  final PositionOutputStream wrapped = new NifiOutputStream(outputStream);
  return wrapped;
}

Source File: NifiParquetOutputFile.java From nifi with Apache License 2.0

4 votes

/**
 * Wraps the held {@code outputStream} in a new {@link NifiOutputStream}.
 *
 * @param blockSizeHint not used by this implementation
 * @return a new stream wrapping {@code outputStream}
 */
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) {
  final PositionOutputStream wrapped = new NifiOutputStream(outputStream);
  return wrapped;
}

Source File: StreamOutputFile.java From flink with Apache License 2.0

4 votes

/**
 * Behaves exactly like {@link #create(long)}: the call is forwarded unchanged.
 *
 * @param blockSizeHint passed through to {@code create}
 * @return the stream returned by {@code create}
 */
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) {
	final PositionOutputStream created = create(blockSizeHint);
	return created;
}

Source File: ParquetIO.java From beam with Apache License 2.0

4 votes

/**
 * Wraps the held {@code outputStream} in a new {@link BeamOutputStream}.
 *
 * @param blockSizeHint not used by this implementation
 * @return a new stream wrapping {@code outputStream}
 */
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) {
  final PositionOutputStream wrapped = new BeamOutputStream(outputStream);
  return wrapped;
}

Source File: ParquetIO.java From beam with Apache License 2.0

4 votes

/**
 * Wraps the held {@code outputStream} in a new {@link BeamOutputStream}.
 *
 * @param blockSizeHint not used by this implementation
 * @return a new stream wrapping {@code outputStream}
 */
@Override
public PositionOutputStream create(long blockSizeHint) {
  final PositionOutputStream wrapped = new BeamOutputStream(outputStream);
  return wrapped;
}

Source File: OutputFile.java From dremio-oss with Apache License 2.0

4 votes

/**
 * Creates the file at {@code path} via {@code fs.create(path, true)} and wraps the
 * resulting stream in a {@link PositionOutputStreamWrapper}.
 *
 * @param blockSizeHint not used by this implementation
 * @return the wrapped output stream
 * @throws IOException if the file cannot be created
 */
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException {
  final PositionOutputStream wrapped = new PositionOutputStreamWrapper(fs.create(path, true));
  return wrapped;
}

Source File: OutputFile.java From dremio-oss with Apache License 2.0

4 votes

/**
 * Creates the file at {@code path} via {@code fs.create(path)} and wraps the resulting
 * stream in a {@link PositionOutputStreamWrapper}.
 *
 * @param blockSizeHint not used by this implementation
 * @return the wrapped output stream
 * @throws IOException if the file cannot be created
 */
@Override
public PositionOutputStream create(long blockSizeHint) throws IOException {
  final PositionOutputStream wrapped = new PositionOutputStreamWrapper(fs.create(path));
  return wrapped;
}

Source File: ParquetIO.java From iceberg with Apache License 2.0

4 votes

/**
 * Calls {@code file.createOrOverwrite()} and adapts the result with {@code stream(...)}.
 *
 * @param ignored the block size hint; not used
 * @return the adapted output stream
 * @throws IOException if the underlying file cannot be created
 */
@Override
public PositionOutputStream createOrOverwrite(long ignored) throws IOException {
  final PositionOutputStream adapted = stream(file.createOrOverwrite());
  return adapted;
}

Source File: ParquetIO.java From iceberg with Apache License 2.0

4 votes

/**
 * Calls {@code file.create()} and adapts the result with {@code stream(...)}.
 *
 * @param ignored the block size hint; not used
 * @return the adapted output stream
 * @throws IOException if the underlying file cannot be created
 */
@Override
public PositionOutputStream create(long ignored) throws IOException {
  final PositionOutputStream adapted = stream(file.create());
  return adapted;
}

org.apache.parquet.io.PositionOutputStream Java Examples