org.apache.parquet.io.SeekableInputStream Java Examples
The following examples show how to use
org.apache.parquet.io.SeekableInputStream.
Each example is drawn from an open-source project; the source file, project, and license are noted above each snippet.
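Before the examples, here is a minimal, self-contained sketch of the core SeekableInputStream calls (getPos, seek, readFully). The file path and class name are hypothetical; HadoopInputFile.fromPath is parquet-mr's usual entry point for obtaining such a stream.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.SeekableInputStream;

public class SeekableInputStreamDemo {
  public static void main(String[] args) throws Exception {
    // Hypothetical path; substitute any existing Parquet file.
    InputFile file = HadoopInputFile.fromPath(
        new Path("/tmp/example.parquet"), new Configuration());
    try (SeekableInputStream in = file.newStream()) {
      byte[] magic = new byte[4];
      in.readFully(magic);                         // first 4 bytes: the "PAR1" header
      System.out.println("pos = " + in.getPos()); // absolute position, now 4
      in.seek(file.getLength() - 4);               // jump to the trailing magic
      in.readFully(magic);                         // last 4 bytes: "PAR1" again
    }
  }
}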
Example #1
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0

private DictionaryPage readCompressedDictionary(
    PageHeader pageHeader, SeekableInputStream fin) throws IOException {
  DictionaryPageHeader dictHeader = pageHeader.getDictionary_page_header();
  int uncompressedPageSize = pageHeader.getUncompressed_page_size();
  int compressedPageSize = pageHeader.getCompressed_page_size();
  byte[] dictPageBytes = new byte[compressedPageSize];
  fin.readFully(dictPageBytes);
  BytesInput bin = BytesInput.from(dictPageBytes);
  return new DictionaryPage(
      bin, uncompressedPageSize, dictHeader.getNum_values(),
      converter.getEncoding(dictHeader.getEncoding()));
}
Example #2
Source File: DirectBufInputStream.java From Bats with Apache License 2.0

public synchronized int read(DrillBuf buf, int off, int len) throws IOException {
  buf.clear();
  ByteBuffer directBuffer = buf.nioBuffer(0, len);
  int lengthLeftToRead = len;
  SeekableInputStream seekableInputStream = HadoopStreams.wrap(getInputStream());
  while (lengthLeftToRead > 0) {
    if (logger.isTraceEnabled()) {
      logger.trace("PERF: Disk read start. {}, StartOffset: {}, TotalByteSize: {}",
          this.streamId, this.startOffset, this.totalByteSize);
    }
    Stopwatch timer = Stopwatch.createStarted();
    int bytesRead = seekableInputStream.read(directBuffer);
    if (bytesRead < 0) {
      return bytesRead;
    }
    lengthLeftToRead -= bytesRead;
    if (logger.isTraceEnabled()) {
      logger.trace(
          "PERF: Disk read complete. {}, StartOffset: {}, TotalByteSize: {}, BytesRead: {}, Time: {} ms",
          this.streamId, this.startOffset, this.totalByteSize, bytesRead,
          ((double) timer.elapsed(TimeUnit.MICROSECONDS)) / 1000);
    }
  }
  buf.writerIndex(len);
  return len;
}
Example #3
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0

/**
 * Copy from a FS input stream to an output stream. Thread-safe.
 *
 * @param from a {@link SeekableInputStream}
 * @param to any {@link PositionOutputStream}
 * @param start where in the from stream to start copying
 * @param length the number of bytes to copy
 * @throws IOException if there is an error while reading or writing
 */
private static void copy(SeekableInputStream from, PositionOutputStream to,
                         long start, long length) throws IOException {
  LOG.debug("Copying {} bytes at {} to {}", length, start, to.getPos());
  from.seek(start);
  long bytesCopied = 0;
  byte[] buffer = COPY_BUFFER.get();
  while (bytesCopied < length) {
    long bytesLeft = length - bytesCopied;
    int bytesRead = from.read(buffer, 0,
        (buffer.length < bytesLeft ? buffer.length : (int) bytesLeft));
    if (bytesRead < 0) {
      throw new IllegalArgumentException(
          "Unexpected end of input file at " + (start + bytesCopied));
    }
    to.write(buffer, 0, bytesRead);
    bytesCopied += bytesRead;
  }
}
Example #4
Source File: PageReader.java From dremio-oss with Apache License 2.0

private void loadDictionaryIfExists(final ColumnReader<?> parentStatus,
    final ColumnChunkMetaData columnChunkMetaData, final SeekableInputStream f) throws IOException {
  Stopwatch timer = Stopwatch.createUnstarted();
  if (columnChunkMetaData.getDictionaryPageOffset() > 0) {
    f.seek(columnChunkMetaData.getDictionaryPageOffset());
    long start = f.getPos();
    timer.start();
    final PageHeader pageHeader = Util.readPageHeader(f);
    long timeToRead = timer.elapsed(TimeUnit.MICROSECONDS);
    long pageHeaderBytes = f.getPos() - start;
    this.updateStats(pageHeader, "Page Header", start, timeToRead, pageHeaderBytes, pageHeaderBytes);
    assert pageHeader.type == PageType.DICTIONARY_PAGE;
    assert isDictionaryEncoded(columnChunkMetaData.getEncodings()) :
        format("Missing dictionary encoding for dictionary page %s, in column chunk %s",
            pageHeader, columnChunkMetaData);
    readDictionaryPage(pageHeader, parentStatus);
  }
}
Example #5
Source File: PageReader.java From dremio-oss with Apache License 2.0

private static SeekableInputStream openFile(FileSystem fs, Path path) throws ExecutionSetupException {
  try {
    return Streams.wrap(fs.open(path));
  } catch (IOException e) {
    throw new ExecutionSetupException(
        "Error opening or reading metadata for parquet file at location: " + path.getName(), e);
  }
}
Example #6
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0

private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options,
    SeekableInputStream f, ParquetMetadataConverter converter) throws IOException {
  long fileLen = file.getLength();
  LOG.debug("File length {}", fileLen);
  int FOOTER_LENGTH_SIZE = 4;
  if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC
    throw new RuntimeException(file.toString() + " is not a Parquet file (too small length: " + fileLen + ")");
  }
  long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC.length;
  LOG.debug("reading footer index at {}", footerLengthIndex);
  f.seek(footerLengthIndex);
  int footerLength = readIntLittleEndian(f);
  byte[] magic = new byte[MAGIC.length];
  f.readFully(magic);
  if (!Arrays.equals(MAGIC, magic)) {
    throw new RuntimeException(file.toString() + " is not a Parquet file. expected magic number at tail "
        + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic));
  }
  long footerIndex = footerLengthIndex - footerLength;
  LOG.debug("read footer length: {}, footer index: {}", footerLength, footerIndex);
  if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) {
    throw new RuntimeException("corrupted file: the footer index is not within the file: " + footerIndex);
  }
  f.seek(footerIndex);
  // Read all the footer bytes in one time to avoid multiple read operations,
  // since it can be pretty time consuming for a single read operation in HDFS.
  ByteBuffer footerBytesBuffer = ByteBuffer.allocate(footerLength);
  f.readFully(footerBytesBuffer);
  LOG.debug("Finished to read all footer bytes.");
  footerBytesBuffer.flip();
  InputStream footerBytesStream = ByteBufferInputStream.wrap(footerBytesBuffer);
  return converter.readParquetMetadata(footerBytesStream, options.getMetadataFilter());
}
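For context: a Parquet file ends with the serialized footer metadata, followed by a 4-byte little-endian footer length, followed by the 4-byte magic "PAR1" (the file also begins with the same magic). The code above works backwards from the end of the file, reading the length word first to locate the start of the footer.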
Example #7
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0

/**
 * Reads the metadata block in the footer of the file using the provided input stream.
 *
 * @param file a {@link InputFile} to read
 * @param filter the filter to apply to row groups
 * @return the metadata blocks in the footer
 * @throws IOException if an error occurs while reading the file
 * @deprecated will be removed in 2.0.0;
 *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
 */
@Deprecated
public static final ParquetMetadata readFooter(InputFile file, MetadataFilter filter) throws IOException {
  ParquetReadOptions options;
  if (file instanceof HadoopInputFile) {
    options = HadoopReadOptions.builder(((HadoopInputFile) file).getConfiguration())
        .withMetadataFilter(filter).build();
  } else {
    options = ParquetReadOptions.builder().withMetadataFilter(filter).build();
  }
  try (SeekableInputStream in = file.newStream()) {
    return readFooter(file, options, in);
  }
}
Example #8
Source File: ParquetIO.java From beam with Apache License 2.0

@Override
public SeekableInputStream newStream() {
  return new DelegatingSeekableInputStream(Channels.newInputStream(seekableByteChannel)) {
    @Override
    public long getPos() throws IOException {
      return seekableByteChannel.position();
    }

    @Override
    public void seek(long newPos) throws IOException {
      seekableByteChannel.position(newPos);
    }
  };
}
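The delegating pattern above generalizes to any java.nio.channels.SeekableByteChannel. As a hedged illustration (the helper class, method name, and file path are assumptions, not Beam code), a local FileChannel can be wrapped the same way using parquet-mr's DelegatingSeekableInputStream:

import java.io.IOException;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import org.apache.parquet.io.DelegatingSeekableInputStream;
import org.apache.parquet.io.SeekableInputStream;

public class LocalFileStreams {
  public static SeekableInputStream open(String pathString) throws IOException {
    // Wrap a local file's FileChannel the same way the Beam example
    // above wraps its SeekableByteChannel.
    final FileChannel channel = FileChannel.open(Paths.get(pathString), StandardOpenOption.READ);
    return new DelegatingSeekableInputStream(Channels.newInputStream(channel)) {
      @Override
      public long getPos() throws IOException {
        return channel.position();
      }

      @Override
      public void seek(long newPos) throws IOException {
        channel.position(newPos);
      }
    };
  }
}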
Example #9
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0

void add(ChunkDescriptor descriptor, List<ByteBuffer> buffers, SeekableInputStream f) {
  ChunkData data = map.get(descriptor);
  if (data == null) {
    data = new ChunkData();
    map.put(descriptor, data);
  }
  data.buffers.addAll(buffers);
  lastDescriptor = descriptor;
  this.f = f;
}
Example #10
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0

/**
 * @param f file to read the chunks from
 * @param builder used to build chunk list to read the pages for the different columns
 * @throws IOException if there is an error while reading from the stream
 */
public void readAll(SeekableInputStream f, ChunkListBuilder builder) throws IOException {
  List<Chunk> result = new ArrayList<Chunk>(chunks.size());
  f.seek(offset);

  int fullAllocations = length / options.getMaxAllocationSize();
  int lastAllocationSize = length % options.getMaxAllocationSize();
  int numAllocations = fullAllocations + (lastAllocationSize > 0 ? 1 : 0);
  List<ByteBuffer> buffers = new ArrayList<>(numAllocations);

  for (int i = 0; i < fullAllocations; i += 1) {
    buffers.add(options.getAllocator().allocate(options.getMaxAllocationSize()));
  }
  if (lastAllocationSize > 0) {
    buffers.add(options.getAllocator().allocate(lastAllocationSize));
  }

  for (ByteBuffer buffer : buffers) {
    f.readFully(buffer);
    buffer.flip();
  }

  // report in a counter the data we just scanned
  BenchmarkCounter.incrementBytesRead(length);
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffers);
  for (int i = 0; i < chunks.size(); i++) {
    ChunkDescriptor descriptor = chunks.get(i);
    builder.add(descriptor, stream.sliceBuffers(descriptor.size), f);
  }
}
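To make the allocation split concrete: if length is 10 MiB and getMaxAllocationSize() returns 8 MiB, then fullAllocations is 1 and lastAllocationSize is 2 MiB, so two buffers are allocated (one full-sized, one trailing). Capping each buffer at the max allocation size avoids a single oversized allocation for large column chunks.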
Example #11
Source File: PageReader.java From dremio-oss with Apache License 2.0

PageReader(ColumnReader<?> parentStatus, SeekableInputStream inputStream, Path path,
           ColumnChunkMetaData columnChunkMetaData) throws ExecutionSetupException {
  this.parentColumnReader = parentStatus;
  allocatedDictionaryBuffers = new ArrayList<>();
  codecFactory = parentColumnReader.parentReader.getCodecFactory();
  this.stats = parentColumnReader.parentReader.parquetReaderStats;
  long start = columnChunkMetaData.getFirstDataPageOffset();
  this.inputStream = inputStream;
  try {
    this.dataReader = new ColumnDataReader(inputStream, start, columnChunkMetaData.getTotalSize());
    loadDictionaryIfExists(parentStatus, columnChunkMetaData, inputStream);
  } catch (IOException e) {
    throw new ExecutionSetupException(
        "Error opening or reading metadata for parquet file at location: " + path.getName(), e);
  }
}
Example #12
Source File: DeprecatedSingleStreamPageReader.java From dremio-oss with Apache License 2.0

DeprecatedSingleStreamPageReader(ColumnReader<?> parentStatus, SeekableInputStream inputStream,
    Path path, ColumnChunkMetaData columnChunkMetaData) throws ExecutionSetupException {
  super(parentStatus, inputStream, path, columnChunkMetaData);
  try {
    lastPosition = inputStream.getPos();
  } catch (IOException e) {
    throw new ExecutionSetupException(
        "Error in getting current position for parquet file at location: " + path, e);
  }
  this.inputStream = inputStream;
}
Example #13
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0

public void appendRowGroups(SeekableInputStream file, List<BlockMetaData> rowGroups,
                            boolean dropColumns) throws IOException {
  for (BlockMetaData block : rowGroups) {
    appendRowGroup(file, block, dropColumns);
  }
}
Example #14
Source File: HadoopStreams.java From parquet-mr with Apache License 2.0

@SuppressWarnings("unchecked")
private static Class<SeekableInputStream> getH2SeekableClass() {
  try {
    return (Class<SeekableInputStream>) Class.forName(
        "org.apache.parquet.hadoop.util.H2SeekableInputStream");
  } catch (ClassNotFoundException | NoClassDefFoundError e) {
    return null;
  }
}
Example #15
Source File: HadoopStreams.java From parquet-mr with Apache License 2.0

private static Constructor<SeekableInputStream> getH2SeekableConstructor() {
  Class<SeekableInputStream> h2SeekableClass = getH2SeekableClass();
  if (h2SeekableClass != null) {
    try {
      return h2SeekableClass.getConstructor(FSDataInputStream.class);
    } catch (NoSuchMethodException e) {
      return null;
    }
  }
  return null;
}
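For context, a sketch of how a constructor obtained this way might be invoked. This simplifies what HadoopStreams.wrap actually does in parquet-mr (the real method also checks whether the underlying stream supports ByteBufferReadable), so treat it as an assumption-laden outline rather than the library's exact code:

// Sketch: prefer the reflectively loaded H2 (ByteBuffer-based) wrapper,
// falling back to the byte-array-based H1SeekableInputStream when the
// H2 class is unavailable or construction fails.
private static SeekableInputStream wrap(FSDataInputStream stream) {
  Constructor<SeekableInputStream> ctor = getH2SeekableConstructor(); // Example #15 above
  if (ctor != null) {
    try {
      return ctor.newInstance(stream);
    } catch (ReflectiveOperationException e) {
      // fall through to the H1 wrapper
    }
  }
  return new H1SeekableInputStream(stream);
}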
Example #16
Source File: TestBulkInputStream.java From dremio-oss with Apache License 2.0

private void testSeekableStream(SeekableInputStream inputStream) throws IOException {
  int streamPos = 0;
  assertEquals(streamPos, inputStream.getPos());

  // Read some bytes from the start
  final byte[] buf = new byte[1000];
  inputStream.readFully(buf, 0, 88);
  compareData(buf, 0, streamPos, 88);
  streamPos += 88;
  assertEquals(streamPos, inputStream.getPos());

  final byte[] shortBuf = new byte[17];
  inputStream.readFully(shortBuf);
  compareData(shortBuf, 0, streamPos, 17);
  streamPos += 17;
  assertEquals(streamPos, inputStream.getPos());

  // test ByteBuffer interfaces
  final ByteBuffer shortByteBuf = ByteBuffer.allocate(25);
  inputStream.read(shortByteBuf);
  compareData(shortByteBuf.array(), 0, streamPos, 25);
  streamPos += 25;
  assertEquals(streamPos, inputStream.getPos());

  final ByteBuffer shortByteBuf2 = ByteBuffer.allocateDirect(71);
  inputStream.read(shortByteBuf2);
  final ByteBuf compareBuf = Unpooled.directBuffer(100);
  shortByteBuf2.flip();
  compareBuf.writeBytes(shortByteBuf2);
  compareData(compareBuf, streamPos, 71);
  streamPos += 71;
  assertEquals(streamPos, inputStream.getPos());

  final ByteBuffer shortByteBuf3 = ByteBuffer.allocate(66);
  inputStream.readFully(shortByteBuf3);
  compareData(shortByteBuf3.array(), 0, streamPos, 66);
  streamPos += 66;
  assertEquals(streamPos, inputStream.getPos());

  // Test plain old read interface
  buf[0] = (byte) inputStream.read();
  buf[1] = (byte) inputStream.read();
  buf[2] = (byte) inputStream.read();
  compareData(buf, 0, streamPos, 3);
  streamPos += 3;
  assertEquals(streamPos, inputStream.getPos());

  // Skip some, then read
  streamPos += 50; // skip 50 bytes
  inputStream.seek(streamPos);
  inputStream.readFully(buf, 0, 37);
  compareData(buf, 0, streamPos, 37);
  streamPos += 37;
  assertEquals(streamPos, inputStream.getPos());

  // skip to near the end, then read
  streamPos = TEST_DATA_SIZE - 100;
  inputStream.seek(streamPos);
  inputStream.readFully(buf, 0, 100);
  compareData(buf, 0, streamPos, 100);
  streamPos += 100;
  assertEquals(streamPos, inputStream.getPos());
}
Example #17
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0

/**
 * @param descriptor the descriptor of the chunk
 * @param f the file stream positioned at the end of this chunk
 */
private WorkaroundChunk(ChunkDescriptor descriptor, List<ByteBuffer> buffers,
                        SeekableInputStream f, OffsetIndex offsetIndex) {
  super(descriptor, buffers, offsetIndex);
  this.f = f;
}
Example #18
Source File: HadoopInputFile.java From parquet-mr with Apache License 2.0

@Override
public SeekableInputStream newStream() throws IOException {
  return HadoopStreams.wrap(fs.open(stat.getPath()));
}
Example #19
Source File: TestDataPageV1Checksums.java From parquet-mr with Apache License 2.0

/**
 * Test whether corruption in the page content is detected by checksum verification
 */
@Test
public void testCorruptedPage() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);
  InputFile inputFile = HadoopInputFile.fromPath(path, conf);
  try (SeekableInputStream inputStream = inputFile.newStream()) {
    int fileLen = (int) inputFile.getLength();
    byte[] fileBytes = new byte[fileLen];
    inputStream.readFully(fileBytes);
    inputStream.close();

    // There are 4 pages in total (2 per column), we corrupt the first page of the first column
    // and the second page of the second column. We do this by altering a byte roughly in the
    // middle of each page to be corrupted
    fileBytes[fileLen / 8]++;
    fileBytes[fileLen / 8 + ((fileLen / 4) * 3)]++;

    OutputFile outputFile = HadoopOutputFile.fromPath(path, conf);
    try (PositionOutputStream outputStream = outputFile.createOrOverwrite(1024 * 1024)) {
      outputStream.write(fileBytes);
      outputStream.close();

      // First we disable checksum verification, the corruption will go undetected as it is in the
      // data section of the page
      conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
      try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
        PageReadStore pageReadStore = reader.readNextRowGroup();

        DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
        assertFalse("Data in page was not corrupted",
            Arrays.equals(colAPage1.getBytes().toByteArray(), colAPage1Bytes));
        readNextPage(colADesc, pageReadStore);
        readNextPage(colBDesc, pageReadStore);
        DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
        assertFalse("Data in page was not corrupted",
            Arrays.equals(colBPage2.getBytes().toByteArray(), colBPage2Bytes));
      }

      // Now we enable checksum verification, the corruption should be detected
      conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
      try (ParquetFileReader reader = getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
        // We expect an exception on the first encountered corrupt page (in readAllPages)
        assertVerificationFailed(reader);
      }
    }
  }
}
Example #20
Source File: NifiParquetInputFile.java From nifi with Apache License 2.0

@Override
public SeekableInputStream newStream() throws IOException {
  return new NifiSeekableInputStream(input);
}
Example #21
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0

private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options,
    SeekableInputStream f) throws IOException {
  ParquetMetadataConverter converter = new ParquetMetadataConverter(options);
  return readFooter(file, options, f, converter);
}
Example #22
Source File: ColumnDataReader.java From dremio-oss with Apache License 2.0

public SeekableInputStream getInputStream() {
  return input;
}
Example #23
Source File: ColumnDataReader.java From dremio-oss with Apache License 2.0

public ColumnDataReader(SeekableInputStream input, long start, long length) throws IOException {
  this.input = input;
  this.input.seek(start);
  this.endPosition = start + length;
}
Example #24
Source File: DeprecatedParquetVectorizedReader.java From dremio-oss with Apache License 2.0

SeekableInputStream getSingleStream() {
  return singleInputStream;
}
Example #25
Source File: BulkInputStream.java From dremio-oss with Apache License 2.0

public SeekableBulkInputStream(SeekableInputStream is) {
  super();
  this.is = is;
}
Example #26
Source File: BulkInputStream.java From dremio-oss with Apache License 2.0

public static BulkInputStream wrap(SeekableInputStream is) {
  return new SeekableBulkInputStream(is);
}
Example #27
Source File: VectorAccessibleSerializable.java From dremio-oss with Apache License 2.0

public static void readFromStream(SeekableInputStream input, final ArrowBuf outputBuffer,
    final int bytesToRead) throws IOException {
  final ByteBuffer directBuffer = outputBuffer.nioBuffer(0, bytesToRead);
  input.readFully(directBuffer);
  outputBuffer.writerIndex(bytesToRead);
}
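Note the design choice here: readFully(ByteBuffer) fills all of the buffer's remaining bytes or throws an EOFException if the stream ends first, so the writer index can be advanced by bytesToRead unconditionally afterwards, with no partial-read bookkeeping.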
Example #28
Source File: ParquetIO.java From iceberg with Apache License 2.0

@Override
public SeekableInputStream newStream() throws IOException {
  return stream(file.newStream());
}
Example #29
Source File: ParquetIO.java From iceberg with Apache License 2.0

private ParquetInputStreamAdapter(com.netflix.iceberg.io.SeekableInputStream delegate) {
  super(delegate);
  this.delegate = delegate;
}