org.apache.parquet.hadoop.util.HadoopStreams Java Examples

The following examples show how to use org.apache.parquet.hadoop.util.HadoopStreams. Each example is extracted from an open source project; the source file and its license are noted above the code.
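
Before the examples, a quick orientation: HadoopStreams.wrap adapts Hadoop's FSDataInputStream and FSDataOutputStream to Parquet's SeekableInputStream and PositionOutputStream abstractions, which is the pattern every example below relies on. The following minimal sketch shows the input side; the file path and the 4-byte read are illustrative assumptions, not taken from any of the examples.

import java.io.IOException;
import java.nio.ByteBuffer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.util.HadoopStreams;
import org.apache.parquet.io.SeekableInputStream;

public class HadoopStreamsSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/example.parquet"); // illustrative path (assumption)
    FileSystem fs = path.getFileSystem(conf);

    // Wrap the Hadoop stream; closing the wrapper closes the underlying stream.
    try (SeekableInputStream in = HadoopStreams.wrap(fs.open(path))) {
      in.seek(0);                                 // position at the start of the file
      ByteBuffer magic = ByteBuffer.allocate(4);
      in.readFully(magic);                        // read the 4-byte "PAR1" magic
    }
  }
}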
Example #1
Source File: DirectBufInputStream.java    From Bats with Apache License 2.0
public synchronized int read(DrillBuf buf, int off, int len) throws IOException {
  buf.clear();
  ByteBuffer directBuffer = buf.nioBuffer(0, len);
  int lengthLeftToRead = len;
  SeekableInputStream seekableInputStream = HadoopStreams.wrap(getInputStream());
  while (lengthLeftToRead > 0) {
    if (logger.isTraceEnabled()) {
      logger.trace("PERF: Disk read start. {}, StartOffset: {}, TotalByteSize: {}", this.streamId, this.startOffset, this.totalByteSize);
    }
    Stopwatch timer = Stopwatch.createStarted();
    int bytesRead = seekableInputStream.read(directBuffer);
    if (bytesRead < 0) {
      return bytesRead;
    }
    lengthLeftToRead -= bytesRead;
    if (logger.isTraceEnabled()) {
      logger.trace(
          "PERF: Disk read complete. {}, StartOffset: {}, TotalByteSize: {}, BytesRead: {}, Time: {} ms",
          this.streamId, this.startOffset, this.totalByteSize, bytesRead,
          ((double) timer.elapsed(TimeUnit.MICROSECONDS)) / 1000);
    }
  }
  buf.writerIndex(len);
  return len;
}
 
Example #2
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * FOR TESTING ONLY. This supports testing block padding behavior on the local FS.
 *
 * @param configuration Hadoop configuration
 * @param schema the schema of the data
 * @param file the file to write to
 * @param rowAndBlockSize the row group size
 * @param maxPaddingSize the maximum padding
 * @throws IOException if the file cannot be created
 */
ParquetFileWriter(Configuration configuration, MessageType schema,
                  Path file, long rowAndBlockSize, int maxPaddingSize)
    throws IOException {
  FileSystem fs = file.getFileSystem(configuration);
  this.schema = schema;
  this.alignment = PaddingAlignment.get(
      rowAndBlockSize, rowAndBlockSize, maxPaddingSize);
  this.out = HadoopStreams.wrap(
      fs.create(file, true, 8192, fs.getDefaultReplication(file), rowAndBlockSize));
  this.encodingStatsBuilder = new EncodingStats.Builder();
  // no truncation is needed for testing
  this.columnIndexTruncateLength = Integer.MAX_VALUE;
  this.pageWriteChecksumEnabled = ParquetOutputFormat.getPageWriteChecksumEnabled(configuration);
  this.crc = pageWriteChecksumEnabled ? new CRC32() : null;
  this.metadataConverter = new ParquetMetadataConverter(ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH);
}
 
Example #3
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
private static void writeMetadataFile(Path outputPath, ParquetMetadata metadataFooter, FileSystem fs)
    throws IOException {
  PositionOutputStream metadata = HadoopStreams.wrap(fs.create(outputPath));
  metadata.write(MAGIC);
  serializeFooter(metadataFooter, metadata);
  metadata.close();
}
 
Example #4
Source File: BufferedDirectBufInputStream.java    From Bats with Apache License 2.0
/**
 * Reads one more block from the underlying stream.
 * Assumes we have reached the end of the buffered data and that this method
 * is being called from a synchronized block.
 *
 * @return the number of bytes read, or -1 on EOF
 */
private int getNextBlock() throws IOException {
  Preconditions.checkState(this.curPosInBuffer >= this.count,
      "Internal error: Buffered stream has not been consumed and trying to read more from underlying stream");
  checkInputStreamState();
  DrillBuf buffer = getBuf();
  buffer.clear();
  this.count = this.curPosInBuffer = 0;

  if (logger.isTraceEnabled()) {
    logger.trace(
        "PERF: Disk read start. {}, StartOffset: {}, TotalByteSize: {}, BufferSize: {}, Count: {}, "
            + "CurPosInStream: {}, CurPosInBuffer: {}",
        this.streamId, this.startOffset, this.totalByteSize, this.bufSize, this.count,
        this.curPosInStream, this.curPosInBuffer);
  }
  Stopwatch timer = Stopwatch.createStarted();
  int bytesToRead = 0;
  // We *cannot* rely on totalByteSize being correct, because Parquet file
  // metadata is sometimes wrong, so we may have to read beyond the totalByteSize
  // parameter. However, to avoid reading too much data, we shrink the buffer
  // down to 64 KiB.
  if (enforceTotalByteSize) {
    bytesToRead = (buffer.capacity() >= (totalByteSize + startOffset - curPosInStream)) ?
        (int) (totalByteSize + startOffset - curPosInStream) :
        buffer.capacity();
  } else {
    if (buffer.capacity() >= (totalByteSize + startOffset - curPosInStream)) {
      if (buffer.capacity() > SMALL_BUFFER_SIZE) {
        buffer = this.reallocBuffer(SMALL_BUFFER_SIZE);
      }
    }
    bytesToRead = buffer.capacity();
  }

  ByteBuffer directBuffer = buffer.nioBuffer(curPosInBuffer, bytesToRead);
  // The DFS can return *more* bytes than requested if the capacity of the buffer is
  // greater, i.e. 'n' can exceed the number of bytes requested, which violates the
  // API contract; we still have to deal with it. So we make sure the size of the
  // buffer is exactly the same as the number of bytes requested.
  int bytesRead = -1;
  int nBytes = 0;
  if (bytesToRead > 0) {
    try {
      nBytes = HadoopStreams.wrap(getInputStream()).read(directBuffer);
    } catch (Exception e) {
      logger.error("Error reading from stream {}. Error was : {}", this.streamId, e.getMessage());
      throw new IOException((e));
    }
    if (nBytes > 0) {
      buffer.writerIndex(nBytes);
      this.count = nBytes + this.curPosInBuffer;
      this.curPosInStream = getInputStream().getPos();
      bytesRead = nBytes;
      if (logger.isTraceEnabled()) {
        logger.trace(
            "PERF: Disk read complete. {}, StartOffset: {}, TotalByteSize: {}, BufferSize: {}, BytesRead: {}, Count: {}, "
                + "CurPosInStream: {}, CurPosInBuffer: {}, Time: {} ms", this.streamId, this.startOffset,
            this.totalByteSize, this.bufSize, bytesRead, this.count, this.curPosInStream, this.curPosInBuffer,
            ((double) timer.elapsed(TimeUnit.MICROSECONDS)) / 1000);
      }
    }
  }
  return this.count - this.curPosInBuffer;
}
 
Example #5
Source File: ColumnDataReader.java    From Bats with Apache License 2.0
public void loadPage(DrillBuf target, int pageLength) throws IOException {
  target.clear();
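  // Wrap the Hadoop stream and read the page bytes directly into the DrillBuf's
  // backing ByteBuffer (the number of bytes actually read is not checked here).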
  HadoopStreams.wrap(input).read(target.nioBuffer(0, pageLength));
  target.writerIndex(pageLength);
}
 
Example #6
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * @param file a file stream to read from
 * @param rowGroups row groups to copy
 * @param dropColumns whether to drop columns from the file that are not in this file's schema
 * @throws IOException if there is an error while reading or writing
 * @deprecated will be removed in 2.0.0;
 *             use {@link #appendRowGroups(SeekableInputStream,List,boolean)} instead
 */
@Deprecated
public void appendRowGroups(FSDataInputStream file,
                            List<BlockMetaData> rowGroups,
                            boolean dropColumns) throws IOException {
  appendRowGroups(HadoopStreams.wrap(file), rowGroups, dropColumns);
}
 
Example #7
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * @param from a file stream to read from
 * @param rowGroup row group to copy
 * @param dropColumns whether to drop columns from the file that are not in this file's schema
 * @throws IOException if there is an error while reading or writing
 * @deprecated will be removed in 2.0.0;
 *             use {@link #appendRowGroup(SeekableInputStream,BlockMetaData,boolean)} instead
 */
@Deprecated
public void appendRowGroup(FSDataInputStream from, BlockMetaData rowGroup,
                           boolean dropColumns) throws IOException {
  appendRowGroup(HadoopStreams.wrap(from), rowGroup, dropColumns);
}