org.apache.parquet.hadoop.Footer Java Examples
The following examples show how to use
org.apache.parquet.hadoop.Footer.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ParquetReader.java From tajo with Apache License 2.0 | 6 votes |
/**
 * Builds a reader for the Parquet data located at {@code file}.
 *
 * <p>The entries under {@code file} that pass {@code HiddenFileFilter.INSTANCE} are listed,
 * their footers are loaded, and the row counts of all row groups are added to
 * {@code totalRowCount}.
 *
 * @param conf        Hadoop configuration used to resolve the file system and read footers
 * @param file        path whose entries are listed and read
 * @param readSupport read support stored for later use by this reader
 * @param filter      record filter; rejected by {@code checkNotNull} when {@code null}
 * @throws IOException if listing the path or reading the footers fails
 */
private ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  // Enumerate the entries under the path that the filter accepts.
  FileStatus[] listed = file.getFileSystem(conf).listStatus(file, HiddenFileFilter.INSTANCE);
  List<FileStatus> inputs = Arrays.asList(listed);

  List<Footer> allFooters =
      ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, inputs, false);
  this.footersIterator = allFooters.iterator();

  // Accumulate the row count of every row group across all footers.
  for (Footer current : allFooters) {
    List<BlockMetaData> rowGroups = current.getParquetMetadata().getBlocks();
    for (BlockMetaData rowGroup : rowGroups) {
      totalRowCount += rowGroup.getRowCount();
    }
  }
}
Example #2
Source File: ParquetReader.java From tajo with Apache License 2.0 | 6 votes |
/**
 * Closes the current underlying reader, if any, and opens one for the next footer.
 *
 * <p>When {@code footersIterator} has no more footers, {@code reader} is left {@code null}.
 *
 * @throws IOException if closing the previous reader or initializing the new one fails
 */
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }

  if (!footersIterator.hasNext()) {
    return;
  }

  Footer nextFooter = footersIterator.next();
  List<BlockMetaData> allBlocks = nextFooter.getParquetMetadata().getBlocks();
  MessageType schema = nextFooter.getParquetMetadata().getFileMetaData().getSchema();

  // Keep only the row groups that the filter does not rule out.
  List<BlockMetaData> selectedBlocks = RowGroupFilter.filterRowGroups(filter, allBlocks, schema);

  reader = new InternalParquetRecordReader<T>(readSupport, filter);
  reader.initialize(
      nextFooter.getParquetMetadata().getFileMetaData(),
      nextFooter.getFile(),
      selectedBlocks,
      conf);
}
Example #3
Source File: ShowMetaCommand.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Prints the metadata of the footers read for the path given as the first argument.
 *
 * <p>For each footer, the file it belongs to is printed followed by the details produced by
 * {@code MetadataUtils.showDetails}. The {@code 'o'} option is forwarded to that call as the
 * "show original types" flag.
 *
 * @param options parsed command line; the first positional argument is the input path
 * @throws Exception if the file status or its footers cannot be read
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String input = options.getArgs()[0];
  boolean showOriginalTypes = options.hasOption('o');

  Configuration conf = new Configuration();
  Path inputPath = new Path(input);
  FileStatus status = inputPath.getFileSystem(conf).getFileStatus(inputPath);
  List<Footer> footers = ParquetFileReader.readFooters(conf, status, false);

  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
      .withAutoColumn()
      .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
      .withColumnPadding(1)
      .build();

  for (Footer footer : footers) {
    out.format("file: %s%n", footer.getFile());
    MetadataUtils.showDetails(out, footer.getParquetMetadata(), showOriginalTypes);
    out.flushColumns();
  }
}
Example #4
Source File: DeprecatedParquetInputFormat.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Computes the input splits for the job.
 *
 * <p>When {@code isTaskSideMetaData(job)} is true the call is delegated to the superclass.
 * Otherwise the footers are read and handed to {@code realInputFormat}, and each resulting
 * {@code ParquetInputSplit} is wrapped in a {@code ParquetInputSplitWrapper}.
 *
 * @param job       job configuration
 * @param numSplits split-count hint, only used by the superclass path
 * @return the wrapped splits, or {@code null} when {@code realInputFormat} returns {@code null}
 * @throws IOException if the footers or splits cannot be computed
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  if (isTaskSideMetaData(job)) {
    return super.getSplits(job, numSplits);
  }

  List<Footer> jobFooters = getFooters(job);
  List<ParquetInputSplit> parquetSplits = realInputFormat.getSplits(job, jobFooters);
  if (parquetSplits == null) {
    return null;
  }

  InputSplit[] wrapped = new InputSplit[parquetSplits.size()];
  int next = 0;
  for (ParquetInputSplit parquetSplit : parquetSplits) {
    wrapped[next] = new ParquetInputSplitWrapper(parquetSplit);
    next++;
  }
  return wrapped;
}
Example #5
Source File: PentahoApacheInputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
/**
 * Reads the schema of the given Parquet file and converts it to input fields.
 *
 * <p>The work runs inside {@code inClassloader}. The schema is taken from the first footer
 * returned for the file; when no footer is returned, an empty list is the result.
 *
 * @param file location of the Parquet file
 * @return the fields built from the file schema, or an empty list when there are no footers
 * @throws Exception if the file cannot be accessed or its footers cannot be read
 */
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration conf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, conf );

    Path scrubbedPath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileStatus status = FileSystem.get( scrubbedPath.toUri(), conf ).getFileStatus( scrubbedPath );

    List<Footer> footers = ParquetFileReader.readFooters( conf, status, true );
    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    }

    // Only the first footer is consulted for the schema.
    ParquetMetadata firstMetadata = footers.get( 0 ).getParquetMetadata();
    MessageType fileSchema = firstMetadata.getFileMetaData().getSchema();
    return ParquetConverter.buildInputFields( fileSchema );
  } );
}
Example #6
Source File: FooterGatherer.java From Bats with Apache License 2.0 | 5 votes |
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException { final List<TimedCallable<Footer>> readers = new ArrayList<>(); final List<Footer> foundFooters = new ArrayList<>(); for (FileStatus status : statuses) { if (status.isDirectory()){ // first we check for summary file. FileSystem fs = status.getPath().getFileSystem(conf); final Path summaryPath = new Path(status.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE); if (fs.exists(summaryPath)){ FileStatus summaryStatus = fs.getFileStatus(summaryPath); foundFooters.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus)); continue; } // else we handle as normal file. for (FileStatus inStatus : DrillFileSystemUtil.listFiles(fs, status.getPath(), false)){ readers.add(new FooterReader(conf, inStatus)); } } else { readers.add(new FooterReader(conf, status)); } } if(!readers.isEmpty()){ foundFooters.addAll(TimedCallable.run("Fetch Parquet Footers", logger, readers, parallelism)); } return foundFooters; }
Example #7
Source File: FooterGatherer.java From Bats with Apache License 2.0 | 5 votes |
/**
 * An updated footer reader that tries to read the entire footer without knowing the length.
 * This should reduce the amount of seek/read roundtrips in most workloads.
 *
 * <p>The last {@code min(fileLength, DEFAULT_READ_SIZE)} bytes of the file are read in one call.
 * The footer size is then decoded from that buffer, and a second read is issued only when the
 * footer does not fit in what was already read.
 *
 * @param config Hadoop configuration used to obtain the file system for the file
 * @param status status of the file to read; supplies its path and length
 * @return a {@code Footer} pairing the file path with the parsed metadata
 * @throws IOException if the file cannot be opened or read
 */
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
  final FileSystem fs = status.getPath().getFileSystem(config);
  try(FSDataInputStream file = fs.open(status.getPath())) {
    final long fileLength = status.getLen();
    Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());

    // Read the tail of the file: at most DEFAULT_READ_SIZE bytes, or the whole file if smaller.
    int len = (int) Math.min( fileLength, (long) DEFAULT_READ_SIZE);
    byte[] footerBytes = new byte[len];
    readFully(file, fileLength - len, footerBytes, 0, len);

    // The magic bytes are checked at the very end of the buffer.
    checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);

    // The footer size is a little-endian int located FOOTER_METADATA_SIZE bytes before the end.
    // NOTE(review): FOOTER_METADATA_SIZE presumably covers the length field plus the magic bytes — confirm.
    final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);

    if(size > footerBytes.length - FOOTER_METADATA_SIZE){
      // if the footer is larger than our initial read, we need to read the rest.
      byte[] origFooterBytes = footerBytes;
      // Number of footer bytes already held at the start of the first buffer.
      int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;

      footerBytes = new byte[size];

      // Fetch only the missing leading part of the footer into the front of the new array...
      readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
      // ...then append the part that the first read already provided.
      System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
    }else{
      // The whole footer is inside the first buffer; slice out exactly `size` bytes.
      int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
      footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
    }

    final ByteArrayInputStream from = new ByteArrayInputStream(footerBytes);
    ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(from, NO_FILTER);
    Footer footer = new Footer(status.getPath(), metadata);
    return footer;
  }
}
Example #8
Source File: ParquetRecordReaderTest.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Asserts that the given footers match the expected test fixture.
 *
 * <p>Exactly three footers are expected. Each must refer to an existing file whose name starts
 * with {@code "part"}, contain two blocks, map key {@code "foo"} to {@code "bar"}, and map
 * its own file name to itself in the key/value metadata.
 *
 * @param metadata footers to validate
 */
private void validateFooters(final List<Footer> metadata) {
  logger.debug(metadata.toString());
  assertEquals(3, metadata.size());

  for (Footer current : metadata) {
    final File localFile = new File(current.getFile().toUri());
    assertTrue(localFile.getName(), localFile.getName().startsWith("part"));
    assertTrue(localFile.getPath(), localFile.exists());

    final ParquetMetadata footerMetadata = current.getParquetMetadata();
    assertEquals(2, footerMetadata.getBlocks().size());

    final Map<String, String> extras = footerMetadata.getFileMetaData().getKeyValueMetaData();
    assertEquals("bar", extras.get("foo"));
    // The fixture stores each file's own name under a key equal to that name.
    assertEquals(current.getFile().getName(), extras.get(current.getFile().getName()));
  }
}
Example #9
Source File: RowCountCommand.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Prints the total number of rows in the Parquet files matching the input glob.
 *
 * <p>Row counts are summed over every block of every footer of every matched file. With the
 * {@code 'd'} option, a per-file row count line is printed as well.
 *
 * @param options parsed command line; the first positional argument is the input path/glob
 * @throws Exception if the files cannot be listed or their footers cannot be read
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String input = options.getArgs()[0];

  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);

  long totalRows = 0;
  for (FileStatus status : inputFileStatuses) {
    long rowsInFile = 0;
    for (Footer footer : ParquetFileReader.readFooters(conf, status, false)) {
      for (BlockMetaData rowGroup : footer.getParquetMetadata().getBlocks()) {
        rowsInFile += rowGroup.getRowCount();
      }
    }
    totalRows += rowsInFile;

    if (options.hasOption('d')) {
      out.format("%s row count: %d\n", status.getPath().getName(), rowsInFile);
    }
  }

  out.format("Total RowCount: %d", totalRows);
  out.println();
}
Example #10
Source File: SizeCommand.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Prints the total size of the Parquet files matching the input glob.
 *
 * <p>Sizes are summed over every block of every footer of every matched file. Options:
 * {@code 'u'} sums {@code getTotalByteSize()} instead of {@code getCompressedSize()},
 * {@code 'd'} also prints one line per file, and {@code 'p'} formats sizes with
 * {@code getPrettySize} instead of printing raw byte counts.
 *
 * @param options parsed command line; the first positional argument is the input path/glob
 * @throws Exception if the files cannot be listed or their footers cannot be read
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String input = options.getArgs()[0];

  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);

  final boolean uncompressed = options.hasOption('u');

  long totalSize = 0;
  for (FileStatus status : inputFileStatuses) {
    long sizeOfFile = 0;
    for (Footer footer : ParquetFileReader.readFooters(conf, status, false)) {
      for (BlockMetaData rowGroup : footer.getParquetMetadata().getBlocks()) {
        long blockSize;
        if (uncompressed) {
          blockSize = rowGroup.getTotalByteSize();
        } else {
          blockSize = rowGroup.getCompressedSize();
        }
        sizeOfFile += blockSize;
      }
    }
    totalSize += sizeOfFile;

    if (!options.hasOption('d')) {
      continue;
    }
    String fileName = status.getPath().getName();
    if (options.hasOption('p')) {
      out.format("%s: %s\n", fileName, getPrettySize(sizeOfFile));
    } else {
      out.format("%s: %d bytes\n", fileName, sizeOfFile);
    }
  }

  if (options.hasOption('p')) {
    out.format("Total Size: %s", getPrettySize(totalSize));
  } else {
    out.format("Total Size: %d bytes", totalSize);
  }
  out.println();
}
Example #11
Source File: FooterGatherer.java From Bats with Apache License 2.0 | 4 votes |
/**
 * Reads the footer for this task's file by delegating to {@code readFooter}.
 *
 * @return the footer read using this task's {@code conf} and {@code status}
 * @throws Exception if reading the footer fails
 */
@Override
protected Footer runInner() throws Exception {
  final Footer result = readFooter(conf, status);
  return result;
}
Example #12
Source File: ParquetTupleScheme.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Reads the footers for the path of the given tap.
 *
 * <p>Works on a copy of the flow's configuration: the tap's path is registered as an input
 * path on that copy, and a fresh {@code DeprecatedParquetInputFormat} reads the footers.
 *
 * @param flowProcess flow process supplying the configuration copy
 * @param hfs         tap whose path is added as the input path
 * @return the footers returned by the input format
 * @throws IOException if the footers cannot be read
 */
private List<Footer> getFooters(FlowProcess<JobConf> flowProcess, Hfs hfs) throws IOException {
  DeprecatedParquetInputFormat inputFormat = new DeprecatedParquetInputFormat();
  JobConf confCopy = flowProcess.getConfigCopy();
  inputFormat.addInputPath(confCopy, hfs.getPath());

  List<Footer> footers = inputFormat.getFooters(confCopy);
  return footers;
}
Example #13
Source File: DeprecatedParquetInputFormat.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Reads the footers of the job's input files.
 *
 * <p>The statuses come from the superclass {@code listStatus} and are passed as a list to
 * {@code realInputFormat}.
 *
 * @param job job configuration
 * @return the footers returned by {@code realInputFormat}
 * @throws IOException if listing the inputs or reading the footers fails
 */
public List<Footer> getFooters(JobConf job) throws IOException {
  List<Footer> footers = realInputFormat.getFooters(job, asList(super.listStatus(job)));
  return footers;
}
Example #14
Source File: ParquetTupleScheme.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Reads the footers for the path of the given tap.
 *
 * <p>Works on a copy of the flow's configuration: the tap's path is registered as an input
 * path on that copy, and a fresh {@code DeprecatedParquetInputFormat} reads the footers.
 *
 * @param flowProcess flow process supplying the configuration copy
 * @param hfs         tap whose path is added as the input path
 * @return the footers returned by the input format
 * @throws IOException if the footers cannot be read
 */
private List<Footer> getFooters(FlowProcess<? extends JobConf> flowProcess, Hfs hfs) throws IOException {
  DeprecatedParquetInputFormat inputFormat = new DeprecatedParquetInputFormat();
  JobConf confCopy = flowProcess.getConfigCopy();
  inputFormat.addInputPath(confCopy, hfs.getPath());

  List<Footer> footers = inputFormat.getFooters(confCopy);
  return footers;
}
Example #15
Source File: ParquetAsTextInputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 4 votes |
/**
 * Reads the footers of the job's input files.
 *
 * <p>The statuses come from the superclass {@code listStatus} and are passed as a list to
 * {@code realInputFormat}.
 *
 * @param job job configuration
 * @return the footers returned by {@code realInputFormat}
 * @throws IOException if listing the inputs or reading the footers fails
 */
public List<Footer> getFooters(JobConf job) throws IOException {
  List<Footer> footers = realInputFormat.getFooters(job, Arrays.asList(super.listStatus(job)));
  return footers;
}