org.apache.parquet.HadoopReadOptions Java Examples

The following examples show how to use org.apache.parquet.HadoopReadOptions. Each example lists the source file and project it was taken from.
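Before diving into the examples, here is a minimal sketch of the pattern most of them share: build a ParquetReadOptions from a Hadoop Configuration with HadoopReadOptions.builder(conf), then pass it together with a HadoopInputFile to ParquetFileReader.open. The file path below is a hypothetical placeholder.

// Minimal usage sketch; /tmp/example.parquet is an assumed placeholder path.
Configuration conf = new Configuration();
Path file = new Path("/tmp/example.parquet");
ParquetReadOptions options = HadoopReadOptions.builder(conf)
    .withMetadataFilter(ParquetMetadataConverter.NO_FILTER)  // read the full footer
    .build();
try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(file, conf), options)) {
  MessageType schema = reader.getFooter().getFileMetaData().getSchema();
  System.out.println(schema);
}
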
Example #1
Source File: TransCompressionCommand.java    From parquet-mr with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(input != null && output != null,
    "Both input and output parquet file paths are required.");

  Preconditions.checkArgument(codec != null,
    "The codec cannot be null");

  Path inPath = new Path(input);
  Path outPath = new Path(output);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(getConf(), inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(getConf(), schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, getConf()), HadoopReadOptions.builder(getConf()).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
  return 0;
}
 
Example #2
Source File: TransCompressionCommand.java    From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  List<String> args = options.getArgList();
  Path inPath = new Path(args.get(0));
  Path outPath = new Path(args.get(1));
  CompressionCodecName codecName = CompressionCodecName.valueOf(args.get(2));

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
 
Example #3
Source File: CompressionConveterTest.java    From parquet-mr with Apache License 2.0
private void convertCompression(Configuration conf, String inputFile, String outputFile, String codec) throws IOException {
  Path inPath = new Path(inputFile);
  Path outPath = new Path(outputFile);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
 
Example #4
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param footer a {@link ParquetMetadata} footer already read from the file
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, ParquetMetadata footer) throws IOException {
  this.converter = new ParquetMetadataConverter(conf);
  this.file = HadoopInputFile.fromPath(file, conf);
  this.f = this.file.newStream();
  this.options = HadoopReadOptions.builder(conf).build();
  this.footer = footer;
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
 
Example #5
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * @param configuration the Hadoop conf
 * @param fileMetaData fileMetaData for parquet file
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(
    Configuration configuration, FileMetaData fileMetaData,
    Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.converter = new ParquetMetadataConverter(configuration);
  this.file = HadoopInputFile.fromPath(filePath, configuration);
  this.fileMetaData = fileMetaData;
  this.f = file.newStream();
  this.options = HadoopReadOptions.builder(configuration).build();
  this.blocks = filterRowGroups(blocks);
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
 
Example #6
Source File: ParquetReader.java    From parquet-mr with Apache License 2.0
@Deprecated
protected Builder(Path path) {
  this.readSupport = null;
  this.file = null;
  this.path = Objects.requireNonNull(path, "path cannot be null");
  this.conf = new Configuration();
  this.optionsBuilder = HadoopReadOptions.builder(conf);
}
 
Example #7
Source File: ParquetReader.java    From parquet-mr with Apache License 2.0
protected Builder(InputFile file) {
  this.readSupport = null;
  this.file = Objects.requireNonNull(file, "file cannot be null");
  this.path = null;
  if (file instanceof HadoopInputFile) {
    this.conf = ((HadoopInputFile) file).getConfiguration();
  } else {
    this.conf = new Configuration();
  }
  optionsBuilder = HadoopReadOptions.builder(conf);
}
 
Example #8
Source File: ParquetReader.java    From parquet-mr with Apache License 2.0
public Builder<T> withConf(Configuration conf) {
  this.conf = Objects.requireNonNull(conf, "conf cannot be null");

  // previous versions didn't use the builder, so may set filter before conf. this maintains
  // compatibility for filter. other options are reset by a new conf.
  this.optionsBuilder = HadoopReadOptions.builder(conf);
  if (filter != null) {
    optionsBuilder.withRecordFilter(filter);
  }

  return this;
}
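The withConf example above preserves a previously set record filter while resetting the remaining options from the new Configuration. A hedged usage sketch of that builder flow (the GroupReadSupport, column name, and file path are illustrative assumptions, not taken from the snippet):

// Usage sketch: a filter set before withConf(conf) is carried over; other options come from conf.
Configuration conf = new Configuration();
FilterPredicate onlyIdOne = FilterApi.eq(FilterApi.intColumn("id"), 1);  // hypothetical column
try (ParquetReader<Group> reader = ParquetReader
    .builder(new GroupReadSupport(), new Path("/tmp/example.parquet"))   // assumed placeholder path
    .withFilter(FilterCompat.get(onlyIdOne))
    .withConf(conf)
    .build()) {
  Group record;
  while ((record = reader.read()) != null) {
    System.out.println(record);
  }
}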
 
Example #9
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * Reads the meta data block in the footer of the file using provided input stream
 * @param file a {@link InputFile} to read
 * @param filter the filter to apply to row groups
 * @return the metadata blocks in the footer
 * @throws IOException if an error occurs while reading the file
 * @deprecated will be removed in 2.0.0;
 *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
 */
@Deprecated
public static final ParquetMetadata readFooter(InputFile file, MetadataFilter filter) throws IOException {
  ParquetReadOptions options;
  if (file instanceof HadoopInputFile) {
    options = HadoopReadOptions.builder(((HadoopInputFile) file).getConfiguration())
        .withMetadataFilter(filter).build();
  } else {
    options = ParquetReadOptions.builder().withMetadataFilter(filter).build();
  }

  try (SeekableInputStream in = file.newStream()) {
    return readFooter(file, options, in);
  }
}
 
Example #10
Source File: ParquetReader.java    From parquet-mr with Apache License 2.0
@Deprecated
private Builder(ReadSupport<T> readSupport, Path path) {
  this.readSupport = Objects.requireNonNull(readSupport, "readSupport cannot be null");
  this.file = null;
  this.path = Objects.requireNonNull(path, "path cannot be null");
  this.conf = new Configuration();
  this.optionsBuilder = HadoopReadOptions.builder(conf);
}
 
Example #11
Source File: ParquetReader.java    From parquet-mr with Apache License 2.0
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      FilterCompat.Filter filter) throws IOException {
  this(Collections.singletonList((InputFile) HadoopInputFile.fromPath(file, conf)),
      HadoopReadOptions.builder(conf)
          .withRecordFilter(Objects.requireNonNull(filter, "filter cannot be null"))
          .build(),
      readSupport);
}
 
Example #12
Source File: ParquetRecordReader.java    From parquet-mr with Apache License 2.0
private void initializeInternalReader(ParquetInputSplit split, Configuration configuration) throws IOException {
  Path path = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();

  // if task.side.metadata is set, rowGroupOffsets is null
  ParquetReadOptions.Builder optionsBuilder = HadoopReadOptions.builder(configuration);
  if (rowGroupOffsets != null) {
    optionsBuilder.withOffsets(rowGroupOffsets);
  } else {
    optionsBuilder.withRange(split.getStart(), split.getEnd());
  }

  // open a reader with the metadata filter
  ParquetFileReader reader = ParquetFileReader.open(
      HadoopInputFile.fromPath(path, configuration), optionsBuilder.build());

  if (rowGroupOffsets != null) {
    // verify a row group was found for each offset
    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    if (blocks.size() != rowGroupOffsets.length) {
      throw new IllegalStateException(
          "All of the offsets in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + blocks);
    }
  }

  if (!reader.getRowGroups().isEmpty()) {
    checkDeltaByteArrayProblem(
        reader.getFooter().getFileMetaData(), configuration,
        reader.getRowGroups().get(0));
  }

  internalReader.initialize(reader, configuration);
}
 
Example #13
Source File: InternalParquetRecordReader.java    From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
 
Example #14
Source File: ParquetFileAccessor.java    From pxf with Apache License 2.0
/**
 * Reads the original schema from the parquet file.
 *
 * @param parquetFile the path to the parquet file
 * @param fileSplit   the file split we are accessing
 * @return the original schema from the parquet file
 * @throws IOException when there's an IOException while reading the schema
 */
private MessageType getSchema(Path parquetFile, FileSplit fileSplit) throws IOException {

    final long then = System.nanoTime();
    ParquetMetadataConverter.MetadataFilter filter = ParquetMetadataConverter.range(
            fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength());
    ParquetReadOptions parquetReadOptions = HadoopReadOptions
            .builder(configuration)
            .withMetadataFilter(filter)
            .build();
    HadoopInputFile inputFile = HadoopInputFile.fromPath(parquetFile, configuration);
    try (ParquetFileReader parquetFileReader =
                 ParquetFileReader.open(inputFile, parquetReadOptions)) {
        FileMetaData metadata = parquetFileReader.getFileMetaData();
        if (LOG.isDebugEnabled()) {
            LOG.debug("{}-{}: Reading file {} with {} records in {} RowGroups",
                    context.getTransactionId(), context.getSegmentId(),
                    parquetFile.getName(), parquetFileReader.getRecordCount(),
                    parquetFileReader.getRowGroups().size());
        }
        final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - then);
        LOG.debug("{}-{}: Read schema in {} ms", context.getTransactionId(),
                context.getSegmentId(), millis);
        return metadata.getSchema();
    } catch (Exception e) {
        throw new IOException(e);
    }
}
 
Example #15
Source File: CompressionConveterTest.java    From parquet-mr with Apache License 2.0
private void validColumnIndex(String inputFile, String outFile) throws Exception {
  ParquetMetadata inMetaData = ParquetFileReader.readFooter(conf, new Path(inputFile), NO_FILTER);
  ParquetMetadata outMetaData = ParquetFileReader.readFooter(conf, new Path(outFile), NO_FILTER);
  Assert.assertEquals(inMetaData.getBlocks().size(), outMetaData.getBlocks().size());
  try (TransParquetFileReader inReader = new TransParquetFileReader(HadoopInputFile.fromPath(new Path(inputFile), conf), HadoopReadOptions.builder(conf).build());
       TransParquetFileReader outReader = new TransParquetFileReader(HadoopInputFile.fromPath(new Path(outFile), conf), HadoopReadOptions.builder(conf).build())) {
    for (int i = 0; i < inMetaData.getBlocks().size(); i++) {
      BlockMetaData inBlockMetaData = inMetaData.getBlocks().get(i);
      BlockMetaData outBlockMetaData = outMetaData.getBlocks().get(i);
      Assert.assertEquals(inBlockMetaData.getColumns().size(), outBlockMetaData.getColumns().size());
      for (int j = 0; j < inBlockMetaData.getColumns().size(); j++) {
        ColumnChunkMetaData inChunk = inBlockMetaData.getColumns().get(j);
        ColumnIndex inColumnIndex = inReader.readColumnIndex(inChunk);
        OffsetIndex inOffsetIndex = inReader.readOffsetIndex(inChunk);
        ColumnChunkMetaData outChunk = outBlockMetaData.getColumns().get(j);
        ColumnIndex outColumnIndex = outReader.readColumnIndex(outChunk);
        OffsetIndex outOffsetIndex = outReader.readOffsetIndex(outChunk);
        if (inColumnIndex != null) {
          Assert.assertEquals(inColumnIndex.getBoundaryOrder(), outColumnIndex.getBoundaryOrder());
          Assert.assertEquals(inColumnIndex.getMaxValues(), outColumnIndex.getMaxValues());
          Assert.assertEquals(inColumnIndex.getMinValues(), outColumnIndex.getMinValues());
          Assert.assertEquals(inColumnIndex.getNullCounts(), outColumnIndex.getNullCounts());
        }
        if (inOffsetIndex != null) {
          List<Long> inOffsets = getOffsets(inReader, inChunk);
          List<Long> outOffsets = getOffsets(outReader, outChunk);
          Assert.assertEquals(inOffsets.size(), outOffsets.size());
          Assert.assertEquals(inOffsets.size(), inOffsetIndex.getPageCount());
          Assert.assertEquals(inOffsetIndex.getPageCount(), outOffsetIndex.getPageCount());
          for (int k = 0; k < inOffsetIndex.getPageCount(); k++) {
            Assert.assertEquals(inOffsetIndex.getFirstRowIndex(k), outOffsetIndex.getFirstRowIndex(k));
            Assert.assertEquals(inOffsetIndex.getLastRowIndex(k, inChunk.getValueCount()),
              outOffsetIndex.getLastRowIndex(k, outChunk.getValueCount()));
            Assert.assertEquals(inOffsetIndex.getOffset(k), (long)inOffsets.get(k));
            Assert.assertEquals(outOffsetIndex.getOffset(k), (long)outOffsets.get(k));
          }
        }
      }
    }
  }
}
 
Example #16
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * @param conf a configuration
 * @param file a file path to open
 * @return a parquet file reader
 * @throws IOException if there is an error while opening the file
 * @deprecated will be removed in 2.0.0; use {@link #open(InputFile)}
 */
@Deprecated
public static ParquetFileReader open(Configuration conf, Path file) throws IOException {
  return new ParquetFileReader(HadoopInputFile.fromPath(file, conf),
      HadoopReadOptions.builder(conf).build());
}
 
Example #17
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * @param conf a configuration
 * @param file a file path to open
 * @param filter a metadata filter
 * @return a parquet file reader
 * @throws IOException if there is an error while opening the file
 * @deprecated will be removed in 2.0.0; use {@link #open(InputFile,ParquetReadOptions)}
 */
@Deprecated
public static ParquetFileReader open(Configuration conf, Path file, MetadataFilter filter) throws IOException {
  return open(HadoopInputFile.fromPath(file, conf),
      HadoopReadOptions.builder(conf).withMetadataFilter(filter).build());
}
 
Example #18
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param filter a {@link MetadataFilter} for selecting row groups
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, MetadataFilter filter) throws IOException {
  this(HadoopInputFile.fromPath(file, conf),
      HadoopReadOptions.builder(conf).withMetadataFilter(filter).build());
}