org.apache.parquet.HadoopReadOptions Java Examples
The following examples show how to use org.apache.parquet.HadoopReadOptions.
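Before the examples themselves, here is a minimal sketch of the pattern they all share: build a ParquetReadOptions from a Hadoop Configuration with HadoopReadOptions.builder(conf), then pass it to ParquetFileReader.open together with a HadoopInputFile. The class name HadoopReadOptionsSketch and the path /tmp/data.parquet are placeholders for illustration only and do not come from any of the projects below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.HadoopReadOptions;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;

public class HadoopReadOptionsSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Placeholder path used only for this sketch
    Path file = new Path("/tmp/data.parquet");

    // Build read options from the Hadoop configuration
    ParquetReadOptions options = HadoopReadOptions.builder(conf).build();

    // Open the file with those options and read the schema from the footer
    try (ParquetFileReader reader =
             ParquetFileReader.open(HadoopInputFile.fromPath(file, conf), options)) {
      MessageType schema = reader.getFooter().getFileMetaData().getSchema();
      System.out.println(schema);
    }
  }
}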
Example #1
Source File: TransCompressionCommand.java From parquet-mr with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(input != null && output != null,
      "Both input and output parquet file paths are required.");
  Preconditions.checkArgument(codec != null, "The codec cannot be null");

  Path inPath = new Path(input);
  Path outPath = new Path(output);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(getConf(), inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(getConf(), schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(
      HadoopInputFile.fromPath(inPath, getConf()), HadoopReadOptions.builder(getConf()).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
  return 0;
}
Example #2
Source File: TransCompressionCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  List<String> args = options.getArgList();
  Path inPath = new Path(args.get(0));
  Path outPath = new Path(args.get(1));
  CompressionCodecName codecName = CompressionCodecName.valueOf(args.get(2));

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(
      HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
Example #3
Source File: CompressionConveterTest.java From parquet-mr with Apache License 2.0
private void convertCompression(Configuration conf, String inputFile, String outputFile, String codec) throws IOException {
  Path inPath = new Path(inputFile);
  Path outPath = new Path(outputFile);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(
      HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
Example #4
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
/**
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param footer a {@link ParquetMetadata} footer already read from the file
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, ParquetMetadata footer) throws IOException {
  this.converter = new ParquetMetadataConverter(conf);
  this.file = HadoopInputFile.fromPath(file, conf);
  this.f = this.file.newStream();
  this.options = HadoopReadOptions.builder(conf).build();
  this.footer = footer;
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
Example #5
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
/**
 * @param configuration the Hadoop conf
 * @param fileMetaData fileMetaData for parquet file
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(
    Configuration configuration, FileMetaData fileMetaData,
    Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.converter = new ParquetMetadataConverter(configuration);
  this.file = HadoopInputFile.fromPath(filePath, configuration);
  this.fileMetaData = fileMetaData;
  this.f = file.newStream();
  this.options = HadoopReadOptions.builder(configuration).build();
  this.blocks = filterRowGroups(blocks);
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
Example #6
Source File: ParquetReader.java From parquet-mr with Apache License 2.0
@Deprecated
protected Builder(Path path) {
  this.readSupport = null;
  this.file = null;
  this.path = Objects.requireNonNull(path, "path cannot be null");
  this.conf = new Configuration();
  this.optionsBuilder = HadoopReadOptions.builder(conf);
}
Example #7
Source File: ParquetReader.java From parquet-mr with Apache License 2.0
protected Builder(InputFile file) {
  this.readSupport = null;
  this.file = Objects.requireNonNull(file, "file cannot be null");
  this.path = null;
  if (file instanceof HadoopInputFile) {
    this.conf = ((HadoopInputFile) file).getConfiguration();
  } else {
    this.conf = new Configuration();
  }
  optionsBuilder = HadoopReadOptions.builder(conf);
}
Example #8
Source File: ParquetReader.java From parquet-mr with Apache License 2.0
public Builder<T> withConf(Configuration conf) {
  this.conf = Objects.requireNonNull(conf, "conf cannot be null");
  // previous versions didn't use the builder, so may set filter before conf. this maintains
  // compatibility for filter. other options are reset by a new conf.
  this.optionsBuilder = HadoopReadOptions.builder(conf);
  if (filter != null) {
    optionsBuilder.withRecordFilter(filter);
  }
  return this;
}
Example #9
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
/**
 * Reads the meta data block in the footer of the file using provided input stream
 * @param file a {@link InputFile} to read
 * @param filter the filter to apply to row groups
 * @return the metadata blocks in the footer
 * @throws IOException if an error occurs while reading the file
 * @deprecated will be removed in 2.0.0;
 *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
 */
@Deprecated
public static final ParquetMetadata readFooter(InputFile file, MetadataFilter filter) throws IOException {
  ParquetReadOptions options;
  if (file instanceof HadoopInputFile) {
    options = HadoopReadOptions.builder(((HadoopInputFile) file).getConfiguration())
        .withMetadataFilter(filter).build();
  } else {
    options = ParquetReadOptions.builder().withMetadataFilter(filter).build();
  }
  try (SeekableInputStream in = file.newStream()) {
    return readFooter(file, options, in);
  }
}
Example #10
Source File: ParquetReader.java From parquet-mr with Apache License 2.0
@Deprecated
private Builder(ReadSupport<T> readSupport, Path path) {
  this.readSupport = Objects.requireNonNull(readSupport, "readSupport cannot be null");
  this.file = null;
  this.path = Objects.requireNonNull(path, "path cannot be null");
  this.conf = new Configuration();
  this.optionsBuilder = HadoopReadOptions.builder(conf);
}
Example #11
Source File: ParquetReader.java From parquet-mr with Apache License 2.0
private ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, FilterCompat.Filter filter) throws IOException {
  this(Collections.singletonList((InputFile) HadoopInputFile.fromPath(file, conf)),
      HadoopReadOptions.builder(conf)
          .withRecordFilter(Objects.requireNonNull(filter, "filter cannot be null"))
          .build(),
      readSupport);
}
Example #12
Source File: ParquetRecordReader.java From parquet-mr with Apache License 2.0
private void initializeInternalReader(ParquetInputSplit split, Configuration configuration) throws IOException {
  Path path = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();

  // if task.side.metadata is set, rowGroupOffsets is null
  ParquetReadOptions.Builder optionsBuilder = HadoopReadOptions.builder(configuration);
  if (rowGroupOffsets != null) {
    optionsBuilder.withOffsets(rowGroupOffsets);
  } else {
    optionsBuilder.withRange(split.getStart(), split.getEnd());
  }

  // open a reader with the metadata filter
  ParquetFileReader reader = ParquetFileReader.open(
      HadoopInputFile.fromPath(path, configuration), optionsBuilder.build());

  if (rowGroupOffsets != null) {
    // verify a row group was found for each offset
    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    if (blocks.size() != rowGroupOffsets.length) {
      throw new IllegalStateException(
          "All of the offsets in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + blocks);
    }
  }

  if (!reader.getRowGroups().isEmpty()) {
    checkDeltaByteArrayProblem(
        reader.getFooter().getFileMetaData(), configuration,
        reader.getRowGroups().get(0));
  }

  internalReader.initialize(reader, configuration);
}
Example #13
Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
Example #14
Source File: ParquetFileAccessor.java From pxf with Apache License 2.0
/**
 * Reads the original schema from the parquet file.
 *
 * @param parquetFile the path to the parquet file
 * @param fileSplit the file split we are accessing
 * @return the original schema from the parquet file
 * @throws IOException when there's an IOException while reading the schema
 */
private MessageType getSchema(Path parquetFile, FileSplit fileSplit) throws IOException {

  final long then = System.nanoTime();
  ParquetMetadataConverter.MetadataFilter filter = ParquetMetadataConverter.range(
      fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength());
  ParquetReadOptions parquetReadOptions = HadoopReadOptions
      .builder(configuration)
      .withMetadataFilter(filter)
      .build();
  HadoopInputFile inputFile = HadoopInputFile.fromPath(parquetFile, configuration);
  try (ParquetFileReader parquetFileReader =
           ParquetFileReader.open(inputFile, parquetReadOptions)) {
    FileMetaData metadata = parquetFileReader.getFileMetaData();
    if (LOG.isDebugEnabled()) {
      LOG.debug("{}-{}: Reading file {} with {} records in {} RowGroups",
          context.getTransactionId(), context.getSegmentId(), parquetFile.getName(),
          parquetFileReader.getRecordCount(), parquetFileReader.getRowGroups().size());
    }

    final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - then);
    LOG.debug("{}-{}: Read schema in {} ms", context.getTransactionId(),
        context.getSegmentId(), millis);
    return metadata.getSchema();
  } catch (Exception e) {
    throw new IOException(e);
  }
}
Example #15
Source File: CompressionConveterTest.java From parquet-mr with Apache License 2.0
private void validColumnIndex(String inputFile, String outFile) throws Exception {
  ParquetMetadata inMetaData = ParquetFileReader.readFooter(conf, new Path(inputFile), NO_FILTER);
  ParquetMetadata outMetaData = ParquetFileReader.readFooter(conf, new Path(outFile), NO_FILTER);
  Assert.assertEquals(inMetaData.getBlocks().size(), outMetaData.getBlocks().size());

  try (TransParquetFileReader inReader = new TransParquetFileReader(
           HadoopInputFile.fromPath(new Path(inputFile), conf), HadoopReadOptions.builder(conf).build());
       TransParquetFileReader outReader = new TransParquetFileReader(
           HadoopInputFile.fromPath(new Path(outFile), conf), HadoopReadOptions.builder(conf).build())) {
    for (int i = 0; i < inMetaData.getBlocks().size(); i++) {
      BlockMetaData inBlockMetaData = inMetaData.getBlocks().get(i);
      BlockMetaData outBlockMetaData = outMetaData.getBlocks().get(i);
      Assert.assertEquals(inBlockMetaData.getColumns().size(), outBlockMetaData.getColumns().size());

      for (int j = 0; j < inBlockMetaData.getColumns().size(); j++) {
        ColumnChunkMetaData inChunk = inBlockMetaData.getColumns().get(j);
        ColumnIndex inColumnIndex = inReader.readColumnIndex(inChunk);
        OffsetIndex inOffsetIndex = inReader.readOffsetIndex(inChunk);
        ColumnChunkMetaData outChunk = outBlockMetaData.getColumns().get(j);
        ColumnIndex outColumnIndex = outReader.readColumnIndex(outChunk);
        OffsetIndex outOffsetIndex = outReader.readOffsetIndex(outChunk);

        if (inColumnIndex != null) {
          Assert.assertEquals(inColumnIndex.getBoundaryOrder(), outColumnIndex.getBoundaryOrder());
          Assert.assertEquals(inColumnIndex.getMaxValues(), outColumnIndex.getMaxValues());
          Assert.assertEquals(inColumnIndex.getMinValues(), outColumnIndex.getMinValues());
          Assert.assertEquals(inColumnIndex.getNullCounts(), outColumnIndex.getNullCounts());
        }
        if (inOffsetIndex != null) {
          List<Long> inOffsets = getOffsets(inReader, inChunk);
          List<Long> outOffsets = getOffsets(outReader, outChunk);
          Assert.assertEquals(inOffsets.size(), outOffsets.size());
          Assert.assertEquals(inOffsets.size(), inOffsetIndex.getPageCount());
          Assert.assertEquals(inOffsetIndex.getPageCount(), outOffsetIndex.getPageCount());
          for (int k = 0; k < inOffsetIndex.getPageCount(); k++) {
            Assert.assertEquals(inOffsetIndex.getFirstRowIndex(k), outOffsetIndex.getFirstRowIndex(k));
            Assert.assertEquals(inOffsetIndex.getLastRowIndex(k, inChunk.getValueCount()),
                outOffsetIndex.getLastRowIndex(k, outChunk.getValueCount()));
            Assert.assertEquals(inOffsetIndex.getOffset(k), (long) inOffsets.get(k));
            Assert.assertEquals(outOffsetIndex.getOffset(k), (long) outOffsets.get(k));
          }
        }
      }
    }
  }
}
Example #16
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
/**
 * @param conf a configuration
 * @param file a file path to open
 * @return a parquet file reader
 * @throws IOException if there is an error while opening the file
 * @deprecated will be removed in 2.0.0; use {@link #open(InputFile)}
 */
@Deprecated
public static ParquetFileReader open(Configuration conf, Path file) throws IOException {
  return new ParquetFileReader(HadoopInputFile.fromPath(file, conf),
      HadoopReadOptions.builder(conf).build());
}
Example #17
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
/**
 * @param conf a configuration
 * @param file a file path to open
 * @param filter a metadata filter
 * @return a parquet file reader
 * @throws IOException if there is an error while opening the file
 * @deprecated will be removed in 2.0.0; use {@link #open(InputFile,ParquetReadOptions)}
 */
@Deprecated
public static ParquetFileReader open(Configuration conf, Path file, MetadataFilter filter) throws IOException {
  return open(HadoopInputFile.fromPath(file, conf),
      HadoopReadOptions.builder(conf).withMetadataFilter(filter).build());
}
Example #18
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
/**
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param filter a {@link MetadataFilter} for selecting row groups
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, MetadataFilter filter) throws IOException {
  this(HadoopInputFile.fromPath(file, conf),
      HadoopReadOptions.builder(conf).withMetadataFilter(filter).build());
}