Java Code Examples for org.apache.parquet.io.PositionOutputStream

The following examples show how to use org.apache.parquet.io.PositionOutputStream. They are extracted from open source projects; the project, source file, and license are listed above each example.
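PositionOutputStream itself is a small abstraction in parquet-mr: an OutputStream that can also report how many bytes have been written so far via getPos(). Before the examples, here is a minimal sketch of an implementation backed by an in-memory buffer (the class name InMemoryPositionOutputStream is ours, for illustration only):

import java.io.ByteArrayOutputStream;
import org.apache.parquet.io.PositionOutputStream;

// Minimal in-memory PositionOutputStream, handy for tests: the current
// position is simply the number of bytes written so far.
public class InMemoryPositionOutputStream extends PositionOutputStream {
  private final ByteArrayOutputStream buffer = new ByteArrayOutputStream();

  @Override
  public long getPos() {
    return buffer.size();
  }

  @Override
  public void write(int b) {
    buffer.write(b);
  }

  @Override
  public void write(byte[] b, int off, int len) {
    buffer.write(b, off, len);
  }

  public byte[] toByteArray() {
    return buffer.toByteArray();
  }
}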
Example 1
Source Project: parquet-mr   Source File: ParquetFileWriter.java   License: Apache License 2.0
/**
 * Copy from a FS input stream to an output stream. Thread-safe
 *
 * @param from a {@link SeekableInputStream}
 * @param to any {@link PositionOutputStream}
 * @param start where in the from stream to start copying
 * @param length the number of bytes to copy
 * @throws IOException if there is an error while reading or writing
 */
private static void copy(SeekableInputStream from, PositionOutputStream to,
                         long start, long length) throws IOException {
  LOG.debug("Copying {} bytes at {} to {}", length, start, to.getPos());
  from.seek(start);
  long bytesCopied = 0;
  byte[] buffer = COPY_BUFFER.get();
  while (bytesCopied < length) {
    long bytesLeft = length - bytesCopied;
    int bytesRead = from.read(buffer, 0,
        (buffer.length < bytesLeft ? buffer.length : (int) bytesLeft));
    if (bytesRead < 0) {
      throw new IllegalArgumentException(
          "Unexpected end of input file at " + (start + bytesCopied));
    }
    to.write(buffer, 0, bytesRead);
    bytesCopied += bytesRead;
  }
}
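The COPY_BUFFER referenced above is not part of the snippet; in ParquetFileWriter it is a per-thread scratch buffer declared roughly as follows (the 8 KiB size here is illustrative):

// Not shown in the example above: a per-thread copy buffer, so that the
// copy method stays thread-safe without allocating on every call.
private static final int COPY_BUFFER_SIZE = 8192; // illustrative size
private static final ThreadLocal<byte[]> COPY_BUFFER =
    ThreadLocal.withInitial(() -> new byte[COPY_BUFFER_SIZE]);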
 
Example 2
Source Project: parquet-mr   Source File: ParquetFileWriter.java   License: Apache License 2.0
private static void serializeColumnIndexes(
    List<List<ColumnIndex>> columnIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: column indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<ColumnIndex> blockColumnIndexes = columnIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      org.apache.parquet.format.ColumnIndex columnIndex = ParquetMetadataConverter
          .toParquetColumnIndex(column.getPrimitiveType(), blockColumnIndexes.get(cIndex));
      if (columnIndex == null) {
        continue;
      }
      long offset = out.getPos();
      Util.writeColumnIndex(columnIndex, out);
      column.setColumnIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
 
Example 3
Source Project: parquet-mr   Source File: ParquetFileWriter.java   License: Apache License 2.0
private static void serializeOffsetIndexes(
    List<List<OffsetIndex>> offsetIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: offset indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<OffsetIndex> blockOffsetIndexes = offsetIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex);
      if (offsetIndex == null) {
        continue;
      }
      ColumnChunkMetaData column = columns.get(cIndex);
      long offset = out.getPos();
      Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), out);
      column.setOffsetIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
 
Example 4
Source Project: parquet-mr   Source File: ParquetFileWriter.java   License: Apache License 2.0
private static void serializeBloomFilters(
  List<Map<String, BloomFilter>> bloomFilters,
  List<BlockMetaData> blocks,
  PositionOutputStream out) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
    if (blockBloomFilters.isEmpty()) continue;
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
      if (bloomFilter == null) {
        continue;
      }

      long offset = out.getPos();
      column.setBloomFilterOffset(offset);
      Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out);
      bloomFilter.writeTo(out);
    }
  }
}
 
Example 5
Source Project: Flink-CEPplus   Source File: StreamOutputFile.java   License: Apache License 2.0
@Override
public PositionOutputStream create(long blockSizeHint) {
	if (used.compareAndSet(false, true)) {
		return new PositionOutputStreamAdapter(stream);
	}
	else {
		throw new IllegalStateException("A stream against this file was already created.");
	}
}
 
Example 6
Source Project: flink   Source File: StreamOutputFile.java   License: Apache License 2.0
@Override
public PositionOutputStream create(long blockSizeHint) {
	if (used.compareAndSet(false, true)) {
		return new PositionOutputStreamAdapter(stream);
	}
	else {
		throw new IllegalStateException("A stream against this file was already created.");
	}
}
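The PositionOutputStreamAdapter used in the two examples above is not shown. A sketch of what it does, modeled on Flink's flink-parquet module (details may differ between Flink versions):

import java.io.IOException;

import org.apache.flink.core.fs.FSDataOutputStream;
import org.apache.parquet.io.PositionOutputStream;

// Sketch: forwards writes to Flink's FSDataOutputStream and exposes that
// stream's position to Parquet. close() deliberately only flushes, because
// the lifecycle of the underlying stream is managed by Flink, not Parquet.
class PositionOutputStreamAdapter extends PositionOutputStream {

	private final FSDataOutputStream out;

	PositionOutputStreamAdapter(FSDataOutputStream out) {
		this.out = out;
	}

	@Override
	public long getPos() throws IOException {
		return out.getPos();
	}

	@Override
	public void write(int b) throws IOException {
		out.write(b);
	}

	@Override
	public void write(byte[] buffer, int off, int len) throws IOException {
		out.write(buffer, off, len);
	}

	@Override
	public void flush() throws IOException {
		out.flush();
	}

	@Override
	public void close() throws IOException {
		out.flush();
	}
}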
 
Example 7
Source Project: arvo2parquet   Source File: DataLoad.java   License: MIT License
private static void writeToParquet(@Nonnull final Schema schema,
                                   @Nonnull final Path fileToWrite,
                                   @Nonnull final GenericDataRecordSink sink) throws IOException
{
  try (final ParquetWriter<GenericData.Record> writer = createParquetWriterInstance(schema, fileToWrite)) {
    // Drain the sink until it reports no more records.
    //noinspection StatementWithEmptyBody
    do ; while (sink.accept(writer::write));
    // Close explicitly before getFooter(): the footer is only complete once
    // the writer is closed (the second close from try-with-resources is
    // harmless here).
    writer.close();
    final Path metaDataOutPath = Paths.get(ParquetFileWriter.PARQUET_METADATA_FILE);
    Files.deleteIfExists(metaDataOutPath);
    try (final PositionOutputStream out = nioPathToOutputFile(metaDataOutPath).createOrOverwrite(0)) {
      serializeFooter(writer.getFooter(), out);
    }
  }
}
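nioPathToOutputFile(...) is a project-local helper that is not shown in this example or the next one. A minimal sketch of what such a helper has to provide, wrapping a java.nio.file.Path as a parquet-mr OutputFile (this is our reconstruction, not the arvo2parquet code):

import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.io.PositionOutputStream;

static OutputFile nioPathToOutputFile(final Path path) {
  return new OutputFile() {
    @Override
    public PositionOutputStream create(long blockSizeHint) throws IOException {
      // fail if the file already exists
      return wrap(Files.newOutputStream(path,
          StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE));
    }

    @Override
    public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException {
      // default options: CREATE, TRUNCATE_EXISTING, WRITE
      return wrap(Files.newOutputStream(path));
    }

    @Override
    public boolean supportsBlockSize() { return false; }

    @Override
    public long defaultBlockSize() { return 0; }
  };
}

// Counts bytes as they pass through, so getPos() never touches the file system.
static PositionOutputStream wrap(final OutputStream out) {
  return new PositionOutputStream() {
    private long pos;

    @Override
    public long getPos() { return pos; }

    @Override
    public void write(int b) throws IOException {
      out.write(b);
      pos++;
    }

    @Override
    public void write(byte[] b, int off, int len) throws IOException {
      out.write(b, off, len);
      pos += len;
    }

    @Override
    public void close() throws IOException { out.close(); }
  };
}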
 
Example 8
Source Project: arvo2parquet   Source File: DataLoad.java   License: MIT License
private static void extractMetaDataFooter(final Path parquetFilePath) throws IOException {
  try (final ParquetFileReader rdr = ParquetFileReader.open(nioPathToInputFile(parquetFilePath))) {
    final ParquetMetadata footer = rdr.getFooter();
    final Path metaDataOutPath = Paths.get(ParquetFileWriter.PARQUET_METADATA_FILE + "_dup.parquet");
    Files.deleteIfExists(metaDataOutPath);
    try (final PositionOutputStream out = nioPathToOutputFile(metaDataOutPath).createOrOverwrite(0)) {
      serializeFooter(footer, out);
    }
  }
}
 
Example 9
Source Project: parquet-mr   Source File: ParquetFileWriter.java   License: Apache License 2.0
private static void serializeFooter(ParquetMetadata footer, PositionOutputStream out) throws IOException {
  long footerIndex = out.getPos();
  ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter();
  org.apache.parquet.format.FileMetaData parquetMetadata = metadataConverter.toParquetMetadata(CURRENT_VERSION, footer);
  writeFileMetaData(parquetMetadata, out);
  LOG.debug("{}: footer length = {}" , out.getPos(), (out.getPos() - footerIndex));
  BytesUtils.writeIntLittleEndian(out, (int) (out.getPos() - footerIndex));
  out.write(MAGIC);
}
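serializeFooter shows the Parquet trailer layout: the thrift-encoded file metadata, then the metadata length as a 4-byte little-endian int, then the magic bytes "PAR1". A reader can therefore locate the footer by working backwards from the end of the file; a minimal sketch (the method name footerStart is ours):

import java.io.IOException;
import org.apache.parquet.io.SeekableInputStream;

// Sketch: find where the footer begins in a finished Parquet file.
// Trailer layout: ... footer bytes | 4-byte LE footer length | "PAR1".
static long footerStart(SeekableInputStream in, long fileLength) throws IOException {
  byte[] trailer = new byte[8]; // footer length + magic
  in.seek(fileLength - 8);
  in.readFully(trailer);
  int footerLength = (trailer[0] & 0xFF)
      | (trailer[1] & 0xFF) << 8
      | (trailer[2] & 0xFF) << 16
      | (trailer[3] & 0xFF) << 24;
  return fileLength - 8 - footerLength;
}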
 
Example 10
Source Project: parquet-mr   Source File: ParquetFileWriter.java   License: Apache License 2.0
/**
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
private static void writeMetadataFile(Path outputPath, ParquetMetadata metadataFooter, FileSystem fs)
    throws IOException {
  PositionOutputStream metadata = HadoopStreams.wrap(fs.create(outputPath));
  metadata.write(MAGIC);
  serializeFooter(metadataFooter, metadata);
  metadata.close();
}
 
Example 11
Source Project: parquet-mr   Source File: ParquetFileWriter.java   License: Apache License 2.0
@Override
public void alignForRowGroup(PositionOutputStream out) throws IOException {
  long remaining = dfsBlockSize - (out.getPos() % dfsBlockSize);

  if (isPaddingNeeded(remaining)) {
    LOG.debug("Adding {} bytes of padding (row group size={}B, block size={}B)", remaining, rowGroupSize, dfsBlockSize);
    for (; remaining > 0; remaining -= zeros.length) {
      out.write(zeros, 0, (int) Math.min((long) zeros.length, remaining));
    }
  }
}
 
Example 12
Source Project: parquet-mr   Source File: ParquetFileWriter.java   License: Apache License 2.0
@Override
public long nextRowGroupSize(PositionOutputStream out) throws IOException {
  if (maxPaddingSize <= 0) {
    return rowGroupSize;
  }

  long remaining = dfsBlockSize - (out.getPos() % dfsBlockSize);

  if (isPaddingNeeded(remaining)) {
    return rowGroupSize;
  }

  return Math.min(remaining, rowGroupSize);
}
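Examples 11 and 12 come from parquet-mr's padding alignment strategy; both rest on isPaddingNeeded, which is essentially a threshold check (a sketch, using the field names from the examples):

// Pad only when the space left in the current DFS block is small enough
// to be cheaper to waste than to split a row group across block boundaries.
private boolean isPaddingNeeded(long remaining) {
  return remaining <= maxPaddingSize;
}

For instance, with a 128 MB DFS block and an 8 MB maxPaddingSize: if 5 MB remain in the block, the writer zero-pads to the boundary; if 40 MB remain, it instead caps the next row group at 40 MB so the row group still ends on the boundary.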
 
Example 13
Source Project: Flink-CEPplus   Source File: StreamOutputFile.java   License: Apache License 2.0
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) {
	return create(blockSizeHint);
}
 
Example 14
Source Project: flink   Source File: StreamOutputFile.java   License: Apache License 2.0
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) {
	return create(blockSizeHint);
}
 
Example 15
Source Project: iceberg   Source File: ParquetIO.java   License: Apache License 2.0
private ParquetOutputStreamAdapter(org.apache.iceberg.io.PositionOutputStream delegate) {
  super(delegate);
  this.delegate = delegate;
}
 
Example 16
Source Project: iceberg   Source File: ParquetIO.java   License: Apache License 2.0
@Override
public PositionOutputStream create(long ignored) throws IOException {
  return stream(file.create());
}
 
Example 17
Source Project: iceberg   Source File: ParquetIO.java   License: Apache License 2.0
@Override
public PositionOutputStream createOrOverwrite(long ignored) throws IOException {
  return stream(file.createOrOverwrite());
}
 
Example 18
Source Project: iceberg   Source File: ParquetIO.java   License: Apache License 2.0
private ParquetOutputStreamAdapter(com.netflix.iceberg.io.PositionOutputStream delegate) {
  super(delegate);
  this.delegate = delegate;
}
 
Example 19
Source Project: dremio-oss   Source File: OutputFile.java   License: Apache License 2.0
@Override
public PositionOutputStream create(long blockSizeHint) throws IOException {
  return new PositionOutputStreamWrapper(fs.create(path));
}
 
Example 20
Source Project: dremio-oss   Source File: OutputFile.java   License: Apache License 2.0
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException {
  return new PositionOutputStreamWrapper(fs.create(path, true));
}
 
Example 21
Source Project: beam   Source File: ParquetIO.java   License: Apache License 2.0
@Override
public PositionOutputStream create(long blockSizeHint) {
  return new BeamOutputStream(outputStream);
}
 
Example 22
Source Project: beam   Source File: ParquetIO.java   License: Apache License 2.0
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) {
  return new BeamOutputStream(outputStream);
}
 
Example 23
Source Project: parquet-mr   Source File: ParquetFileWriter.java   License: Apache License 2.0
@Override
public void alignForRowGroup(PositionOutputStream out) {
}
 
Example 24
Source Project: parquet-mr   Source File: ParquetFileWriter.java   License: Apache License 2.0
@Override
public long nextRowGroupSize(PositionOutputStream out) {
  return rowGroupSize;
}
 
Example 25
Source Project: parquet-mr   Source File: HadoopOutputFile.java   License: Apache License 2.0
@Override
public PositionOutputStream create(long blockSizeHint) throws IOException {
  return HadoopStreams.wrap(fs.create(path, false /* do not overwrite */,
      DFS_BUFFER_SIZE_DEFAULT, fs.getDefaultReplication(path),
      Math.max(fs.getDefaultBlockSize(path), blockSizeHint)));
}
 
Example 26
Source Project: parquet-mr   Source File: HadoopOutputFile.java   License: Apache License 2.0
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException {
  return HadoopStreams.wrap(fs.create(path, true /* overwrite if exists */,
      DFS_BUFFER_SIZE_DEFAULT, fs.getDefaultReplication(path),
      Math.max(fs.getDefaultBlockSize(path), blockSizeHint)));
}