org.apache.parquet.io.PositionOutputStream Java Examples
The following examples show how to use
org.apache.parquet.io.PositionOutputStream.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Writes the bloom filters of all blocks to {@code out} and records, on each column chunk
 * that has one, the stream position at which its bloom filter header starts.
 *
 * <p>{@code bloomFilters} is read in parallel with {@code blocks}: entry {@code i} maps the
 * dot-string path of a column to the bloom filter of that column in block {@code i}. Blocks
 * whose map is empty and columns without an entry are skipped.
 *
 * @param bloomFilters per-block maps from column dot-string path to bloom filter
 * @param blocks metadata of the blocks whose column chunks receive the offsets
 * @param out stream the headers and filters are written to
 * @throws IOException if querying the position or writing fails
 */
private static void serializeBloomFilters(
    List<Map<String, BloomFilter>> bloomFilters,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());

  final int blockCount = blocks.size();
  for (int blockIdx = 0; blockIdx < blockCount; blockIdx++) {
    List<ColumnChunkMetaData> chunks = blocks.get(blockIdx).getColumns();
    Map<String, BloomFilter> filtersByPath = bloomFilters.get(blockIdx);

    if (!filtersByPath.isEmpty()) {
      for (ColumnChunkMetaData chunk : chunks) {
        BloomFilter filter = filtersByPath.get(chunk.getPath().toDotString());
        if (filter != null) {
          // Remember where this filter starts before anything is written for it.
          long startPos = out.getPos();
          chunk.setBloomFilterOffset(startPos);

          Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(filter), out);
          filter.writeTo(out);
        }
      }
    }
  }
}
Example #2
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Writes every non-null offset index to {@code out} and stores, on the matching column chunk,
 * an {@link IndexReference} holding the start position and the number of bytes written.
 *
 * <p>{@code offsetIndexes.get(b).get(c)} is taken as the offset index of column {@code c} of
 * block {@code b}; {@code null} entries are skipped.
 *
 * @param offsetIndexes per-block, per-column offset indexes
 * @param blocks metadata of the blocks whose column chunks receive the references
 * @param out stream the indexes are written to
 * @throws IOException if querying the position or writing fails
 */
private static void serializeOffsetIndexes(
    List<List<OffsetIndex>> offsetIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: offset indexes", out.getPos());

  final int blockCount = blocks.size();
  for (int blockIdx = 0; blockIdx < blockCount; blockIdx++) {
    List<ColumnChunkMetaData> chunks = blocks.get(blockIdx).getColumns();
    List<OffsetIndex> indexesOfBlock = offsetIndexes.get(blockIdx);

    final int chunkCount = chunks.size();
    for (int chunkIdx = 0; chunkIdx < chunkCount; chunkIdx++) {
      OffsetIndex index = indexesOfBlock.get(chunkIdx);
      if (index != null) {
        ColumnChunkMetaData chunk = chunks.get(chunkIdx);
        long startPos = out.getPos();
        Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(index), out);
        // Length is derived from how far the stream position advanced during the write.
        int writtenLength = (int) (out.getPos() - startPos);
        chunk.setOffsetIndexReference(new IndexReference(startPos, writtenLength));
      }
    }
  }
}
Example #3
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Converts and writes the column index of every column chunk to {@code out}, storing on the
 * chunk an {@link IndexReference} with the start position and the number of bytes written.
 *
 * <p>A column is skipped when the conversion of its entry yields {@code null}.
 *
 * @param columnIndexes per-block, per-column column indexes
 * @param blocks metadata of the blocks whose column chunks receive the references
 * @param out stream the indexes are written to
 * @throws IOException if querying the position or writing fails
 */
private static void serializeColumnIndexes(
    List<List<ColumnIndex>> columnIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: column indexes", out.getPos());

  final int blockCount = blocks.size();
  for (int blockIdx = 0; blockIdx < blockCount; blockIdx++) {
    List<ColumnChunkMetaData> chunks = blocks.get(blockIdx).getColumns();
    List<ColumnIndex> indexesOfBlock = columnIndexes.get(blockIdx);

    final int chunkCount = chunks.size();
    for (int chunkIdx = 0; chunkIdx < chunkCount; chunkIdx++) {
      ColumnChunkMetaData chunk = chunks.get(chunkIdx);
      org.apache.parquet.format.ColumnIndex converted =
          ParquetMetadataConverter.toParquetColumnIndex(
              chunk.getPrimitiveType(), indexesOfBlock.get(chunkIdx));
      if (converted != null) {
        long startPos = out.getPos();
        Util.writeColumnIndex(converted, out);
        // Length is derived from how far the stream position advanced during the write.
        int writtenLength = (int) (out.getPos() - startPos);
        chunk.setColumnIndexReference(new IndexReference(startPos, writtenLength));
      }
    }
  }
}
Example #4
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Copy from a FS input stream to an output stream. Thread-safe
 *
 * <p>The copy goes through the buffer returned by {@code COPY_BUFFER.get()}
 * (NOTE(review): presumably a per-thread buffer, which is what the thread-safety claim
 * relies on - confirm against the declaration of {@code COPY_BUFFER}).
 *
 * @param from a {@link SeekableInputStream}
 * @param to any {@link PositionOutputStream}
 * @param start where in the from stream to start copying
 * @param length the number of bytes to copy
 * @throws IOException if there is an error while reading or writing
 * @throws IllegalArgumentException if the input ends before {@code length} bytes were read
 */
private static void copy(SeekableInputStream from, PositionOutputStream to,
                         long start, long length) throws IOException {
  LOG.debug("Copying {} bytes at {} to {}", length, start, to.getPos());
  from.seek(start);
  long bytesCopied = 0;
  byte[] buffer = COPY_BUFFER.get();
  while (bytesCopied < length) {
    long bytesLeft = length - bytesCopied;
    // Never request more than the buffer holds or than is still outstanding.
    int bytesRead = from.read(buffer, 0, (int) Math.min(buffer.length, bytesLeft));
    if (bytesRead < 0) {
      // Parenthesized so the absolute position is summed numerically; without the
      // parentheses the two longs are appended to the message as separate strings.
      throw new IllegalArgumentException(
          "Unexpected end of input file at " + (start + bytesCopied));
    }
    to.write(buffer, 0, bytesRead);
    bytesCopied += bytesRead;
  }
}
Example #5
Source File: StreamOutputFile.java From Flink-CEPplus with Apache License 2.0 | 5 votes |
/**
 * Hands out an adapter around {@code stream}, but only on the first call.
 *
 * <p>The {@code used} flag is flipped atomically from {@code false} to {@code true}; any call
 * that finds it already set fails.
 *
 * @param blockSizeHint not used by this implementation
 * @return a new {@code PositionOutputStreamAdapter} wrapping {@code stream}
 * @throws IllegalStateException if a stream was already created
 */
@Override
public PositionOutputStream create(long blockSizeHint) {
  final boolean firstUse = used.compareAndSet(false, true);
  if (!firstUse) {
    throw new IllegalStateException("A stream against this file was already created.");
  }
  return new PositionOutputStreamAdapter(stream);
}
Example #6
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Writes the footer to {@code out}: the converted file metadata, then the number of bytes the
 * metadata occupied as a little-endian int, then {@code MAGIC}.
 *
 * @param footer the metadata to convert and write
 * @param out stream to write to
 * @throws IOException if querying the position or writing fails
 */
private static void serializeFooter(ParquetMetadata footer, PositionOutputStream out)
    throws IOException {
  final long footerStart = out.getPos();

  final org.apache.parquet.format.FileMetaData converted =
      new ParquetMetadataConverter().toParquetMetadata(CURRENT_VERSION, footer);
  writeFileMetaData(converted, out);

  LOG.debug("{}: footer length = {}" , out.getPos(), (out.getPos() - footerStart));

  // The length is measured as the distance the stream position moved while writing the metadata.
  final int footerLength = (int) (out.getPos() - footerStart);
  BytesUtils.writeIntLittleEndian(out, footerLength);
  out.write(MAGIC);
}
Example #7
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Creates {@code outputPath} on {@code fs} and writes {@code MAGIC} followed by the serialized
 * footer into it.
 *
 * <p>The stream is managed by try-with-resources so it is closed even when writing the magic
 * bytes or the footer throws.
 *
 * @param outputPath path of the file to create
 * @param metadataFooter footer to serialize into the file
 * @param fs file system on which the file is created
 * @throws IOException if creating, writing or closing the file fails
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
private static void writeMetadataFile(Path outputPath, ParquetMetadata metadataFooter, FileSystem fs)
    throws IOException {
  try (PositionOutputStream metadata = HadoopStreams.wrap(fs.create(outputPath))) {
    metadata.write(MAGIC);
    serializeFooter(metadataFooter, metadata);
  }
}
Example #8
Source File: StreamOutputFile.java From flink with Apache License 2.0 | 5 votes |
/**
 * Returns an adapter around {@code stream} the first time it is called and rejects every
 * later call.
 *
 * @param blockSizeHint not used by this implementation
 * @return a new {@code PositionOutputStreamAdapter} wrapping {@code stream}
 * @throws IllegalStateException if the {@code used} flag was already set
 */
@Override
public PositionOutputStream create(long blockSizeHint) {
  // Atomically claim the single allowed use; a failed swap means someone got here first.
  if (!used.compareAndSet(false, true)) {
    throw new IllegalStateException("A stream against this file was already created.");
  }
  return new PositionOutputStreamAdapter(stream);
}
Example #9
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Pads {@code out} up to the next multiple of {@code dfsBlockSize} when
 * {@code isPaddingNeeded} accepts the number of bytes remaining until that boundary.
 *
 * <p>Padding is written in chunks taken from the {@code zeros} array.
 *
 * @param out stream to pad
 * @throws IOException if querying the position or writing fails
 */
@Override
public void alignForRowGroup(PositionOutputStream out) throws IOException {
  long padding = dfsBlockSize - (out.getPos() % dfsBlockSize);
  if (!isPaddingNeeded(padding)) {
    return;
  }

  LOG.debug("Adding {} bytes of padding (row group size={}B, block size={}B)", padding, rowGroupSize, dfsBlockSize);
  while (padding > 0) {
    // The last chunk may be shorter than the zeros array.
    int chunk = (int) Math.min((long) zeros.length, padding);
    out.write(zeros, 0, chunk);
    padding -= zeros.length;
  }
}
Example #10
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Computes the size to use for the next row group.
 *
 * <p>Returns {@code rowGroupSize} when {@code maxPaddingSize} is not positive or when
 * {@code isPaddingNeeded} accepts the bytes remaining until the next multiple of
 * {@code dfsBlockSize}; otherwise returns the smaller of that remainder and
 * {@code rowGroupSize}.
 *
 * @param out stream whose current position is inspected
 * @return the size for the next row group
 * @throws IOException if querying the position fails
 */
@Override
public long nextRowGroupSize(PositionOutputStream out) throws IOException {
  if (maxPaddingSize <= 0) {
    return rowGroupSize;
  }

  final long leftInBlock = dfsBlockSize - (out.getPos() % dfsBlockSize);
  return isPaddingNeeded(leftInBlock) ? rowGroupSize : Math.min(leftInBlock, rowGroupSize);
}
Example #11
Source File: DataLoad.java From arvo2parquet with MIT License | 5 votes |
/**
 * Reads the footer of the Parquet file at {@code parquetFilePath} and serializes it into a
 * file named {@code ParquetFileWriter.PARQUET_METADATA_FILE + "_dup.parquet"}, deleting any
 * existing file of that name first.
 *
 * @param parquetFilePath the Parquet file whose footer is extracted
 * @throws IOException if reading, deleting or writing fails
 */
private static void extractMetaDataFooter(final Path parquetFilePath) throws IOException {
  try (final ParquetFileReader reader = ParquetFileReader.open(nioPathToInputFile(parquetFilePath))) {
    final ParquetMetadata sourceFooter = reader.getFooter();

    final String duplicateName = ParquetFileWriter.PARQUET_METADATA_FILE + "_dup.parquet";
    final Path duplicatePath = Paths.get(duplicateName);
    Files.deleteIfExists(duplicatePath);

    try (final PositionOutputStream sinkStream = nioPathToOutputFile(duplicatePath).createOrOverwrite(0)) {
      serializeFooter(sourceFooter, sinkStream);
    }
  }
}
Example #12
Source File: DataLoad.java From arvo2parquet with MIT License | 5 votes |
private static void writeToParquet(@Nonnull final Schema schema, @Nonnull final Path fileToWrite, @Nonnull final GenericDataRecordSink sink) throws IOException { try (final ParquetWriter<GenericData.Record> writer = createParquetWriterInstance(schema, fileToWrite)) { //noinspection StatementWithEmptyBody do ; while(sink.accept(writer::write)); writer.close(); final Path metaDataOutPath = Paths.get(ParquetFileWriter.PARQUET_METADATA_FILE); Files.deleteIfExists(metaDataOutPath); try (final PositionOutputStream out = nioPathToOutputFile(metaDataOutPath).createOrOverwrite(0)) { serializeFooter(writer.getFooter(), out); } } }
Example #13
Source File: StreamOutputFile.java From flink with Apache License 2.0 | 5 votes |
/**
 * Creates the one and only stream for this file.
 *
 * <p>Success depends on atomically switching {@code used} from {@code false} to
 * {@code true}; when that switch fails the call is rejected.
 *
 * @param blockSizeHint not used by this implementation
 * @return a new {@code PositionOutputStreamAdapter} wrapping {@code stream}
 * @throws IllegalStateException if a stream was already created
 */
@Override
public PositionOutputStream create(long blockSizeHint) {
  final boolean claimed = used.compareAndSet(false, true);
  if (claimed) {
    return new PositionOutputStreamAdapter(stream);
  }
  throw new IllegalStateException("A stream against this file was already created.");
}
Example #14
Source File: HadoopOutputFile.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Creates the file at {@code path}, overwriting it if it exists, and wraps the resulting
 * stream.
 *
 * <p>The file is created with {@code DFS_BUFFER_SIZE_DEFAULT} as buffer size, the file
 * system's default replication for {@code path}, and a block size that is the larger of the
 * file system's default block size and {@code blockSizeHint}.
 *
 * @param blockSizeHint lower bound for the block size passed to the file system
 * @return the wrapped output stream
 * @throws IOException if the file cannot be created
 */
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException {
  final boolean overwrite = true; // overwrite if exists
  final short replication = fs.getDefaultReplication(path);
  final long blockSize = Math.max(fs.getDefaultBlockSize(path), blockSizeHint);
  return HadoopStreams.wrap(
      fs.create(path, overwrite, DFS_BUFFER_SIZE_DEFAULT, replication, blockSize));
}
Example #15
Source File: HadoopOutputFile.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Creates the file at {@code path} without overwriting an existing file, and wraps the
 * resulting stream.
 *
 * <p>The file is created with {@code DFS_BUFFER_SIZE_DEFAULT} as buffer size, the file
 * system's default replication for {@code path}, and a block size that is the larger of the
 * file system's default block size and {@code blockSizeHint}.
 *
 * @param blockSizeHint lower bound for the block size passed to the file system
 * @return the wrapped output stream
 * @throws IOException if the file cannot be created
 */
@Override
public PositionOutputStream create(long blockSizeHint) throws IOException {
  final boolean overwrite = false; // do not overwrite
  final short replication = fs.getDefaultReplication(path);
  final long blockSize = Math.max(fs.getDefaultBlockSize(path), blockSizeHint);
  return HadoopStreams.wrap(
      fs.create(path, overwrite, DFS_BUFFER_SIZE_DEFAULT, replication, blockSize));
}
Example #16
Source File: TestColumnChunkPageWriteStore.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Accessor for the {@code out} field.
 *
 * @return the current value of {@code out}
 */
PositionOutputStream out() {
  return this.out;
}
Example #17
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Always answers with the configured {@code rowGroupSize}; the stream is not consulted.
 *
 * @param out not used by this implementation
 * @return {@code rowGroupSize}
 */
@Override
public long nextRowGroupSize(PositionOutputStream out) {
  return this.rowGroupSize;
}
Example #18
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Does nothing: this implementation writes no padding to {@code out}.
 *
 * @param out the output stream; not used
 */
@Override public void alignForRowGroup(PositionOutputStream out) { }
Example #19
Source File: TestColumnChunkPageWriteStore.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Delegates to {@code file.create} and keeps the resulting stream in {@code out} before
 * returning it.
 *
 * @param blockSizeHint passed through to {@code file.create}
 * @return the stream created by the delegate
 * @throws IOException if the delegate fails
 */
@Override
public PositionOutputStream create(long blockSizeHint) throws IOException {
  this.out = file.create(blockSizeHint);
  return this.out;
}
Example #20
Source File: TestColumnChunkPageWriteStore.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Delegates to {@code file.createOrOverwrite} and keeps the resulting stream in {@code out}
 * before returning it.
 *
 * @param blockSizeHint passed through to {@code file.createOrOverwrite}
 * @return the stream created by the delegate
 * @throws IOException if the delegate fails
 */
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException {
  this.out = file.createOrOverwrite(blockSizeHint);
  return this.out;
}
Example #21
Source File: TestDataPageV1Checksums.java From parquet-mr with Apache License 2.0 | 4 votes |
/** * Test whether corruption in the page content is detected by checksum verification */ @Test public void testCorruptedPage() throws IOException { Configuration conf = new Configuration(); conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true); Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED); InputFile inputFile = HadoopInputFile.fromPath(path, conf); try (SeekableInputStream inputStream = inputFile.newStream()) { int fileLen = (int) inputFile.getLength(); byte[] fileBytes = new byte[fileLen]; inputStream.readFully(fileBytes); inputStream.close(); // There are 4 pages in total (2 per column), we corrupt the first page of the first column // and the second page of the second column. We do this by altering a byte roughly in the // middle of each page to be corrupted fileBytes[fileLen / 8]++; fileBytes[fileLen / 8 + ((fileLen / 4) * 3)]++; OutputFile outputFile = HadoopOutputFile.fromPath(path, conf); try (PositionOutputStream outputStream = outputFile.createOrOverwrite(1024 * 1024)) { outputStream.write(fileBytes); outputStream.close(); // First we disable checksum verification, the corruption will go undetected as it is in the // data section of the page conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false); try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) { PageReadStore pageReadStore = reader.readNextRowGroup(); DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore); assertFalse("Data in page was not corrupted", Arrays.equals(colAPage1.getBytes().toByteArray(), colAPage1Bytes)); readNextPage(colADesc, pageReadStore); readNextPage(colBDesc, pageReadStore); DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore); assertFalse("Data in page was not corrupted", Arrays.equals(colBPage2.getBytes().toByteArray(), colBPage2Bytes)); } // Now we enable checksum verification, the corruption should be detected 
conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true); try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) { // We expect an exception on the first encountered corrupt page (in readAllPages) assertVerificationFailed(reader); } } } }
Example #22
Source File: NifiParquetOutputFile.java From nifi with Apache License 2.0 | 4 votes |
/**
 * Wraps {@code outputStream} in a new {@code NifiOutputStream}.
 *
 * @param blockSizeHint not used by this implementation
 * @return the wrapping stream
 */
@Override
public PositionOutputStream create(long blockSizeHint) {
  final PositionOutputStream wrapped = new NifiOutputStream(outputStream);
  return wrapped;
}
Example #23
Source File: NifiParquetOutputFile.java From nifi with Apache License 2.0 | 4 votes |
/**
 * Wraps {@code outputStream} in a new {@code NifiOutputStream}.
 *
 * @param blockSizeHint not used by this implementation
 * @return the wrapping stream
 */
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) {
  final PositionOutputStream wrapped = new NifiOutputStream(outputStream);
  return wrapped;
}
Example #24
Source File: StreamOutputFile.java From flink with Apache License 2.0 | 4 votes |
/**
 * Forwards to {@code create(long)} with the same hint; this implementation makes no
 * distinction between creating and overwriting.
 *
 * @param blockSizeHint passed through to {@code create}
 * @return whatever {@code create} returns
 */
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) {
  return this.create(blockSizeHint);
}
Example #25
Source File: ParquetIO.java From beam with Apache License 2.0 | 4 votes |
/**
 * Wraps {@code outputStream} in a new {@code BeamOutputStream}.
 *
 * @param blockSizeHint not used by this implementation
 * @return the wrapping stream
 */
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) {
  final PositionOutputStream wrapped = new BeamOutputStream(outputStream);
  return wrapped;
}
Example #26
Source File: ParquetIO.java From beam with Apache License 2.0 | 4 votes |
/**
 * Wraps {@code outputStream} in a new {@code BeamOutputStream}.
 *
 * @param blockSizeHint not used by this implementation
 * @return the wrapping stream
 */
@Override
public PositionOutputStream create(long blockSizeHint) {
  final PositionOutputStream wrapped = new BeamOutputStream(outputStream);
  return wrapped;
}
Example #27
Source File: OutputFile.java From dremio-oss with Apache License 2.0 | 4 votes |
/**
 * Creates the file at {@code path} via {@code fs.create(path, true)} and wraps the result in
 * a {@code PositionOutputStreamWrapper}.
 *
 * @param blockSizeHint not used by this implementation
 * @return the wrapping stream
 * @throws IOException if the file cannot be created
 */
@Override
public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException {
  final boolean overwrite = true;
  return new PositionOutputStreamWrapper(fs.create(path, overwrite));
}
Example #28
Source File: OutputFile.java From dremio-oss with Apache License 2.0 | 4 votes |
/**
 * Creates the file at {@code path} via {@code fs.create(path)} and wraps the result in a
 * {@code PositionOutputStreamWrapper}.
 *
 * @param blockSizeHint not used by this implementation
 * @return the wrapping stream
 * @throws IOException if the file cannot be created
 */
@Override
public PositionOutputStream create(long blockSizeHint) throws IOException {
  final PositionOutputStream wrapped = new PositionOutputStreamWrapper(fs.create(path));
  return wrapped;
}
Example #29
Source File: ParquetIO.java From iceberg with Apache License 2.0 | 4 votes |
/**
 * Adapts the result of {@code file.createOrOverwrite()} through {@code stream(...)}.
 *
 * @param ignored not used by this implementation
 * @return the adapted stream
 * @throws IOException if creating the underlying stream fails
 */
@Override
public PositionOutputStream createOrOverwrite(long ignored) throws IOException {
  final PositionOutputStream adapted = stream(file.createOrOverwrite());
  return adapted;
}
Example #30
Source File: ParquetIO.java From iceberg with Apache License 2.0 | 4 votes |
/**
 * Adapts the result of {@code file.create()} through {@code stream(...)}.
 *
 * @param ignored not used by this implementation
 * @return the adapted stream
 * @throws IOException if creating the underlying stream fails
 */
@Override
public PositionOutputStream create(long ignored) throws IOException {
  final PositionOutputStream adapted = stream(file.create());
  return adapted;
}