org.apache.parquet.HadoopReadOptions Java Examples
The following examples show how to use org.apache.parquet.HadoopReadOptions.
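Before the examples themselves, here is a minimal sketch of the pattern they all share: build a ParquetReadOptions from a Hadoop Configuration with HadoopReadOptions.builder(conf), then pass it to ParquetFileReader.open together with a HadoopInputFile. The class name HadoopReadOptionsSketch and the path /tmp/data.parquet are placeholders for illustration only and do not come from any of the projects below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.HadoopReadOptions;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;

public class HadoopReadOptionsSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Placeholder path used only for this sketch
    Path file = new Path("/tmp/data.parquet");

    // Build read options from the Hadoop configuration
    ParquetReadOptions options = HadoopReadOptions.builder(conf).build();

    // Open the file with those options and read the schema from the footer
    try (ParquetFileReader reader =
             ParquetFileReader.open(HadoopInputFile.fromPath(file, conf), options)) {
      MessageType schema = reader.getFooter().getFileMetaData().getSchema();
      System.out.println(schema);
    }
  }
}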
Example #1
Source File: TransCompressionCommand.java From parquet-mr with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(input != null && output != null,
      "Both input and output parquet file paths are required.");
  Preconditions.checkArgument(codec != null, "The codec cannot be null");

  Path inPath = new Path(input);
  Path outPath = new Path(output);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(getConf(), inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(getConf(), schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(
      HadoopInputFile.fromPath(inPath, getConf()), HadoopReadOptions.builder(getConf()).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
  return 0;
}
Example #2
Source File: TransCompressionCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  List<String> args = options.getArgList();
  Path inPath = new Path(args.get(0));
  Path outPath = new Path(args.get(1));
  CompressionCodecName codecName = CompressionCodecName.valueOf(args.get(2));

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(
      HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
Example #3
Source File: CompressionConveterTest.java From parquet-mr with Apache License 2.0
private void convertCompression(Configuration conf, String inputFile, String outputFile, String codec) throws IOException {
  Path inPath = new Path(inputFile);
  Path outPath = new Path(outputFile);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(
      HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
Example #4
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
/**
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param footer a {@link ParquetMetadata} footer already read from the file
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, ParquetMetadata footer) throws IOException {
  this.converter = new ParquetMetadataConverter(conf);
  this.file = HadoopInputFile.fromPath(file, conf);
  this.f = this.file.newStream();
  this.options = HadoopReadOptions.builder(conf).build();
  this.footer = footer;
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
Example #5
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
/**
 * @param configuration the Hadoop conf
 * @param fileMetaData fileMetaData for parquet file
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(
    Configuration configuration, FileMetaData fileMetaData,
    Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.converter = new ParquetMetadataConverter(configuration);
  this.file = HadoopInputFile.fromPath(filePath, configuration);
  this.fileMetaData = fileMetaData;
  this.f = file.newStream();
  this.options = HadoopReadOptions.builder(configuration).build();
  this.blocks = filterRowGroups(blocks);
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
Example #6
Source File: ParquetReader.java From parquet-mr with Apache License 2.0
@Deprecated
protected Builder(Path path) {
  this.readSupport = null;
  this.file = null;
  this.path = Objects.requireNonNull(path, "path cannot be null");
  this.conf = new Configuration();
  this.optionsBuilder = HadoopReadOptions.builder(conf);
}
Example #7
Source File: ParquetReader.java From parquet-mr with Apache License 2.0
protected Builder(InputFile file) {
  this.readSupport = null;
  this.file = Objects.requireNonNull(file, "file cannot be null");
  this.path = null;
  if (file instanceof HadoopInputFile) {
    this.conf = ((HadoopInputFile) file).getConfiguration();
  } else {
    this.conf = new Configuration();
  }
  optionsBuilder = HadoopReadOptions.builder(conf);
}
Example #8
Source File: ParquetReader.java From parquet-mr with Apache License 2.0
public Builder<T> withConf(Configuration conf) {
  this.conf = Objects.requireNonNull(conf, "conf cannot be null");
  // previous versions didn't use the builder, so may set filter before conf. this maintains
  // compatibility for filter. other options are reset by a new conf.
  this.optionsBuilder = HadoopReadOptions.builder(conf);
  if (filter != null) {
    optionsBuilder.withRecordFilter(filter);
  }
  return this;
}
Example #9
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
/**
 * Reads the meta data block in the footer of the file using provided input stream
 * @param file a {@link InputFile} to read
 * @param filter the filter to apply to row groups
 * @return the metadata blocks in the footer
 * @throws IOException if an error occurs while reading the file
 * @deprecated will be removed in 2.0.0;
 *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
 */
@Deprecated
public static final ParquetMetadata readFooter(InputFile file, MetadataFilter filter) throws IOException {
  ParquetReadOptions options;
  if (file instanceof HadoopInputFile) {
    options = HadoopReadOptions.builder(((HadoopInputFile) file).getConfiguration())
        .withMetadataFilter(filter).build();
  } else {
    options = ParquetReadOptions.builder().withMetadataFilter(filter).build();
  }
  try (SeekableInputStream in = file.newStream()) {
    return readFooter(file, options, in);
  }
}
Example #10
Source File: ParquetReader.java From parquet-mr with Apache License 2.0
@Deprecated
private Builder(ReadSupport<T> readSupport, Path path) {
  this.readSupport = Objects.requireNonNull(readSupport, "readSupport cannot be null");
  this.file = null;
  this.path = Objects.requireNonNull(path, "path cannot be null");
  this.conf = new Configuration();
  this.optionsBuilder = HadoopReadOptions.builder(conf);
}
Example #11
Source File: ParquetReader.java From parquet-mr with Apache License 2.0
private ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, FilterCompat.Filter filter) throws IOException {
  this(Collections.singletonList((InputFile) HadoopInputFile.fromPath(file, conf)),
      HadoopReadOptions.builder(conf)
          .withRecordFilter(Objects.requireNonNull(filter, "filter cannot be null"))
          .build(),
      readSupport);
}
Example #12
Source File: ParquetRecordReader.java From parquet-mr with Apache License 2.0
private void initializeInternalReader(ParquetInputSplit split, Configuration configuration) throws IOException {
  Path path = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();

  // if task.side.metadata is set, rowGroupOffsets is null
  ParquetReadOptions.Builder optionsBuilder = HadoopReadOptions.builder(configuration);
  if (rowGroupOffsets != null) {
    optionsBuilder.withOffsets(rowGroupOffsets);
  } else {
    optionsBuilder.withRange(split.getStart(), split.getEnd());
  }

  // open a reader with the metadata filter
  ParquetFileReader reader = ParquetFileReader.open(
      HadoopInputFile.fromPath(path, configuration), optionsBuilder.build());

  if (rowGroupOffsets != null) {
    // verify a row group was found for each offset
    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    if (blocks.size() != rowGroupOffsets.length) {
      throw new IllegalStateException(
          "All of the offsets in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + blocks);
    }
  }

  if (!reader.getRowGroups().isEmpty()) {
    checkDeltaByteArrayProblem(
        reader.getFooter().getFileMetaData(), configuration,
        reader.getRowGroups().get(0));
  }

  internalReader.initialize(reader, configuration);
}
Example #13
Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
Example #14
Source File: ParquetFileAccessor.java From pxf with Apache License 2.0
/**
 * Reads the original schema from the parquet file.
 *
 * @param parquetFile the path to the parquet file
 * @param fileSplit the file split we are accessing
 * @return the original schema from the parquet file
 * @throws IOException when there's an IOException while reading the schema
 */
private MessageType getSchema(Path parquetFile, FileSplit fileSplit) throws IOException {

  final long then = System.nanoTime();
  ParquetMetadataConverter.MetadataFilter filter = ParquetMetadataConverter.range(
      fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength());
  ParquetReadOptions parquetReadOptions = HadoopReadOptions
      .builder(configuration)
      .withMetadataFilter(filter)
      .build();
  HadoopInputFile inputFile = HadoopInputFile.fromPath(parquetFile, configuration);
  try (ParquetFileReader parquetFileReader =
           ParquetFileReader.open(inputFile, parquetReadOptions)) {
    FileMetaData metadata = parquetFileReader.getFileMetaData();
    if (LOG.isDebugEnabled()) {
      LOG.debug("{}-{}: Reading file {} with {} records in {} RowGroups",
          context.getTransactionId(), context.getSegmentId(), parquetFile.getName(),
          parquetFileReader.getRecordCount(), parquetFileReader.getRowGroups().size());
    }

    final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - then);
    LOG.debug("{}-{}: Read schema in {} ms", context.getTransactionId(),
        context.getSegmentId(), millis);
    return metadata.getSchema();
  } catch (Exception e) {
    throw new IOException(e);
  }
}
Example #15
Source File: CompressionConveterTest.java From parquet-mr with Apache License 2.0
private void validColumnIndex(String inputFile, String outFile) throws Exception {
  ParquetMetadata inMetaData = ParquetFileReader.readFooter(conf, new Path(inputFile), NO_FILTER);
  ParquetMetadata outMetaData = ParquetFileReader.readFooter(conf, new Path(outFile), NO_FILTER);
  Assert.assertEquals(inMetaData.getBlocks().size(), outMetaData.getBlocks().size());

  try (TransParquetFileReader inReader = new TransParquetFileReader(
           HadoopInputFile.fromPath(new Path(inputFile), conf), HadoopReadOptions.builder(conf).build());
       TransParquetFileReader outReader = new TransParquetFileReader(
           HadoopInputFile.fromPath(new Path(outFile), conf), HadoopReadOptions.builder(conf).build())) {
    for (int i = 0; i < inMetaData.getBlocks().size(); i++) {
      BlockMetaData inBlockMetaData = inMetaData.getBlocks().get(i);
      BlockMetaData outBlockMetaData = outMetaData.getBlocks().get(i);
      Assert.assertEquals(inBlockMetaData.getColumns().size(), outBlockMetaData.getColumns().size());

      for (int j = 0; j < inBlockMetaData.getColumns().size(); j++) {
        ColumnChunkMetaData inChunk = inBlockMetaData.getColumns().get(j);
        ColumnIndex inColumnIndex = inReader.readColumnIndex(inChunk);
        OffsetIndex inOffsetIndex = inReader.readOffsetIndex(inChunk);
        ColumnChunkMetaData outChunk = outBlockMetaData.getColumns().get(j);
        ColumnIndex outColumnIndex = outReader.readColumnIndex(outChunk);
        OffsetIndex outOffsetIndex = outReader.readOffsetIndex(outChunk);

        if (inColumnIndex != null) {
          Assert.assertEquals(inColumnIndex.getBoundaryOrder(), outColumnIndex.getBoundaryOrder());
          Assert.assertEquals(inColumnIndex.getMaxValues(), outColumnIndex.getMaxValues());
          Assert.assertEquals(inColumnIndex.getMinValues(), outColumnIndex.getMinValues());
          Assert.assertEquals(inColumnIndex.getNullCounts(), outColumnIndex.getNullCounts());
        }
        if (inOffsetIndex != null) {
          List<Long> inOffsets = getOffsets(inReader, inChunk);
          List<Long> outOffsets = getOffsets(outReader, outChunk);
          Assert.assertEquals(inOffsets.size(), outOffsets.size());
          Assert.assertEquals(inOffsets.size(), inOffsetIndex.getPageCount());
          Assert.assertEquals(inOffsetIndex.getPageCount(), outOffsetIndex.getPageCount());
          for (int k = 0; k < inOffsetIndex.getPageCount(); k++) {
            Assert.assertEquals(inOffsetIndex.getFirstRowIndex(k), outOffsetIndex.getFirstRowIndex(k));
            Assert.assertEquals(inOffsetIndex.getLastRowIndex(k, inChunk.getValueCount()),
                outOffsetIndex.getLastRowIndex(k, outChunk.getValueCount()));
            Assert.assertEquals(inOffsetIndex.getOffset(k), (long) inOffsets.get(k));
            Assert.assertEquals(outOffsetIndex.getOffset(k), (long) outOffsets.get(k));
          }
        }
      }
    }
  }
}
Example #16
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
/**
 * @param conf a configuration
 * @param file a file path to open
 * @return a parquet file reader
 * @throws IOException if there is an error while opening the file
 * @deprecated will be removed in 2.0.0; use {@link #open(InputFile)}
 */
@Deprecated
public static ParquetFileReader open(Configuration conf, Path file) throws IOException {
  return new ParquetFileReader(HadoopInputFile.fromPath(file, conf),
      HadoopReadOptions.builder(conf).build());
}
Example #17
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
/**
 * @param conf a configuration
 * @param file a file path to open
 * @param filter a metadata filter
 * @return a parquet file reader
 * @throws IOException if there is an error while opening the file
 * @deprecated will be removed in 2.0.0; use {@link #open(InputFile,ParquetReadOptions)}
 */
@Deprecated
public static ParquetFileReader open(Configuration conf, Path file, MetadataFilter filter) throws IOException {
  return open(HadoopInputFile.fromPath(file, conf),
      HadoopReadOptions.builder(conf).withMetadataFilter(filter).build());
}
Example #18
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
/**
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param filter a {@link MetadataFilter} for selecting row groups
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, MetadataFilter filter) throws IOException {
  this(HadoopInputFile.fromPath(file, conf),
      HadoopReadOptions.builder(conf).withMetadataFilter(filter).build());
}