/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.hadoop;

import static org.apache.parquet.bytes.BytesUtils.readIntLittleEndian;
import static org.apache.parquet.filter2.compat.RowGroupFilter.FilterLevel.BLOOMFILTER;
import static org.apache.parquet.filter2.compat.RowGroupFilter.FilterLevel.DICTIONARY;
import static org.apache.parquet.filter2.compat.RowGroupFilter.FilterLevel.STATISTICS;
import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER;
import static org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS;
import static org.apache.parquet.hadoop.ColumnIndexFilterUtils.calculateOffsetRanges;
import static org.apache.parquet.hadoop.ColumnIndexFilterUtils.filterOffsetIndex;
import static org.apache.parquet.hadoop.ParquetFileWriter.MAGIC;
import static org.apache.parquet.hadoop.ParquetFileWriter.PARQUET_COMMON_METADATA_FILE;
import static org.apache.parquet.hadoop.ParquetFileWriter.PARQUET_METADATA_FILE;

import java.io.Closeable;
import java.io.InputStream;
import java.io.IOException;
import java.io.SequenceInputStream;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.CRC32;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.HadoopReadOptions;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Encoding;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.DataPageV1;
import org.apache.parquet.column.page.DataPageV2;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.DictionaryPageReadStore;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.compression.CompressionCodecFactory.BytesInputDecompressor;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.compat.RowGroupFilter;
import org.apache.parquet.format.BloomFilterHeader;
import org.apache.parquet.format.DataPageHeader;
import org.apache.parquet.format.DataPageHeaderV2;
import org.apache.parquet.format.DictionaryPageHeader;
import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.Util;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.format.converter.ParquetMetadataConverter.MetadataFilter;
import org.apache.parquet.hadoop.ColumnChunkPageReadStore.ColumnChunkPageReader;
import org.apache.parquet.hadoop.ColumnIndexFilterUtils.OffsetRange;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.metadata.ColumnPath;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.hadoop.util.HiddenFileFilter;
import org.apache.parquet.hadoop.util.counters.BenchmarkCounter;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexFilter;
import org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore;
import org.apache.parquet.internal.filter2.columnindex.RowRanges;
import org.apache.parquet.internal.hadoop.metadata.IndexReference;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.ParquetDecodingException;
import org.apache.parquet.io.SeekableInputStream;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.yetus.audience.InterfaceAudience.Private;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Internal implementation of the Parquet file reader as a block container
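 * <p>
 * A minimal usage sketch (the {@code inputFile} below is assumed to come from, e.g.,
 * {@link org.apache.parquet.hadoop.util.HadoopInputFile}):
 * <pre>{@code
 * ParquetReadOptions options = ParquetReadOptions.builder().build();
 * try (ParquetFileReader reader = ParquetFileReader.open(inputFile, options)) {
 *   PageReadStore rowGroup;
 *   while ((rowGroup = reader.readNextRowGroup()) != null) {
 *     // hand the PageReadStore to a record-assembly layer (e.g. ColumnIOFactory/RecordReader)
 *   }
 * }
 * }</pre>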
 */
public class ParquetFileReader implements Closeable {

  private static final Logger LOG = LoggerFactory.getLogger(ParquetFileReader.class);

  public static String PARQUET_READ_PARALLELISM = "parquet.metadata.read.parallelism";

  private final ParquetMetadataConverter converter;

  private final CRC32 crc;

  /**
   * For the files provided, checks if there is a summary file.
   * If a summary file is found, it is used; otherwise the individual file footers are read.
   * @param configuration the Hadoop conf used to connect to the file system
   * @param partFiles the part files to read
   * @return the footers for those files, using the summary file if possible
   * @throws IOException if there is an exception while reading footers
   * @deprecated metadata files are not recommended and will be removed in 2.0.0
   */
  @Deprecated
  public static List<Footer> readAllFootersInParallelUsingSummaryFiles(Configuration configuration, List<FileStatus> partFiles) throws IOException {
    return readAllFootersInParallelUsingSummaryFiles(configuration, partFiles, false);
  }

  private static MetadataFilter filter(boolean skipRowGroups) {
    return skipRowGroups ? SKIP_ROW_GROUPS : NO_FILTER;
  }

  /**
   * For the files provided, checks if there is a summary file.
   * If a summary file is found, it is used; otherwise the individual file footers are read.
   * @param configuration the Hadoop conf used to connect to the file system
   * @param partFiles the part files to read
   * @param skipRowGroups whether to skip the row group info in the footers
   * @return the footers for those files, using the summary file if possible
   * @throws IOException if there is an exception while reading footers
   * @deprecated metadata files are not recommended and will be removed in 2.0.0
   */
  @Deprecated
  public static List<Footer> readAllFootersInParallelUsingSummaryFiles(
      final Configuration configuration,
      final Collection<FileStatus> partFiles,
      final boolean skipRowGroups) throws IOException {

    // figure out list of all parents to part files
    Set<Path> parents = new HashSet<Path>();
    for (FileStatus part : partFiles) {
      parents.add(part.getPath().getParent());
    }

    // read corresponding summary files if they exist
    List<Callable<Map<Path, Footer>>> summaries = new ArrayList<Callable<Map<Path, Footer>>>();
    for (final Path path : parents) {
      summaries.add(() -> {
        ParquetMetadata mergedMetadata = readSummaryMetadata(configuration, path, skipRowGroups);
        if (mergedMetadata != null) {
          final List<Footer> footers;
          if (skipRowGroups) {
            footers = new ArrayList<Footer>();
            for (FileStatus f : partFiles) {
              footers.add(new Footer(f.getPath(), mergedMetadata));
            }
          } else {
            footers = footersFromSummaryFile(path, mergedMetadata);
          }
          Map<Path, Footer> map = new HashMap<Path, Footer>();
          for (Footer footer : footers) {
            // the folder may have been moved
            footer = new Footer(new Path(path, footer.getFile().getName()), footer.getParquetMetadata());
            map.put(footer.getFile(), footer);
          }
          return map;
        } else {
          return Collections.emptyMap();
        }
      });
    }

    Map<Path, Footer> cache = new HashMap<Path, Footer>();
    try {
      List<Map<Path, Footer>> footersFromSummaries = runAllInParallel(configuration.getInt(PARQUET_READ_PARALLELISM, 5), summaries);
      for (Map<Path, Footer> footers : footersFromSummaries) {
        cache.putAll(footers);
      }
    } catch (ExecutionException e) {
      throw new IOException("Error reading summaries", e);
    }

    // keep only footers for files actually requested and read file footer if not found in summaries
    List<Footer> result = new ArrayList<Footer>(partFiles.size());
    List<FileStatus> toRead = new ArrayList<FileStatus>();
    for (FileStatus part : partFiles) {
      Footer f = cache.get(part.getPath());
      if (f != null) {
        result.add(f);
      } else {
        toRead.add(part);
      }
    }

    if (toRead.size() > 0) {
      // read the footers of the files that did not have a summary file
      LOG.info("reading another {} footers", toRead.size());
      result.addAll(readAllFootersInParallel(configuration, toRead, skipRowGroups));
    }

    return result;
  }

  private static <T> List<T> runAllInParallel(int parallelism, List<Callable<T>> toRun) throws ExecutionException {
    LOG.info("Initiating action with parallelism: {}", parallelism);
    ExecutorService threadPool = Executors.newFixedThreadPool(parallelism);
    try {
      List<Future<T>> futures = new ArrayList<Future<T>>();
      for (Callable<T> callable : toRun) {
        futures.add(threadPool.submit(callable));
      }
      List<T> result = new ArrayList<T>(toRun.size());
      for (Future<T> future : futures) {
        try {
          result.add(future.get());
        } catch (InterruptedException e) {
          throw new RuntimeException("The thread was interrupted", e);
        }
      }
      return result;
    } finally {
      threadPool.shutdownNow();
    }
  }

  /**
   * @param configuration the conf to access the File System
   * @param partFiles the files to read
   * @return the footers
   * @throws IOException if an exception was raised while reading footers
   * @deprecated metadata files are not recommended and will be removed in 2.0.0
   */
  @Deprecated
  public static List<Footer> readAllFootersInParallel(final Configuration configuration, List<FileStatus> partFiles) throws IOException {
    return readAllFootersInParallel(configuration, partFiles, false);
  }

  /**
   * Reads all the footers of the files provided, without using summary files.
   * @param configuration the conf to access the file system
   * @param partFiles the files to read
   * @param skipRowGroups whether to skip the row group info
   * @return the footers
   * @return the footers
   * @throws IOException if there is an exception while reading footers
   * @deprecated will be removed in 2.0.0;
   *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
   */
  @Deprecated
  public static List<Footer> readAllFootersInParallel(final Configuration configuration, List<FileStatus> partFiles, final boolean skipRowGroups) throws IOException {
    List<Callable<Footer>> footers = new ArrayList<Callable<Footer>>();
    for (final FileStatus currentFile : partFiles) {
      footers.add(() -> {
        try {
          return new Footer(currentFile.getPath(), readFooter(configuration, currentFile, filter(skipRowGroups)));
        } catch (IOException e) {
          throw new IOException("Could not read footer for file " + currentFile, e);
        }
      });
    }
    try {
      return runAllInParallel(configuration.getInt(PARQUET_READ_PARALLELISM, 5), footers);
    } catch (ExecutionException e) {
      throw new IOException("Could not read footer: " + e.getMessage(), e.getCause());
    }
  }

  /**
   * Reads the footers of all the files under the given path (recursively),
   * without using summary files.
   *
   * @param configuration a configuration
   * @param fileStatus a file status to recursively list
   * @param skipRowGroups whether to skip reading row group metadata
   * @return a list of footers
   * @throws IOException if an exception is thrown while reading the footers
   * @deprecated will be removed in 2.0.0;
   *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
   */
  @Deprecated
  public static List<Footer> readAllFootersInParallel(Configuration configuration, FileStatus fileStatus, boolean skipRowGroups) throws IOException {
    List<FileStatus> statuses = listFiles(configuration, fileStatus);
    return readAllFootersInParallel(configuration, statuses, skipRowGroups);
  }

  /**
   * Reads the footers of all the files under the given path (recursively),
   * without using summary files.
   * Row group info is not skipped.
   * @param configuration the configuration to access the file system
   * @param fileStatus the root dir
   * @return all the footers
   * @throws IOException if an exception is thrown while reading the footers
   * @deprecated will be removed in 2.0.0;
   *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
   */
  @Deprecated
  public static List<Footer> readAllFootersInParallel(Configuration configuration, FileStatus fileStatus) throws IOException {
    return readAllFootersInParallel(configuration, fileStatus, false);
  }

  /**
   * @param configuration a configuration
   * @param path a file path
   * @return a list of footers
   * @throws IOException if an exception is thrown while reading the footers
   * @deprecated will be removed in 2.0.0;
   *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
   */
  @Deprecated
  public static List<Footer> readFooters(Configuration configuration, Path path) throws IOException {
    return readFooters(configuration, status(configuration, path));
  }

  private static FileStatus status(Configuration configuration, Path path) throws IOException {
    return path.getFileSystem(configuration).getFileStatus(path);
  }

  /**
   * This variant always reads the row group info.
   * @param configuration a configuration
   * @param pathStatus a file status to read footers from
   * @return a list of footers
   * @throws IOException if an exception is thrown while reading the footers
   * @deprecated will be removed in 2.0.0;
   *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
   */
  @Deprecated
  public static List<Footer> readFooters(Configuration configuration, FileStatus pathStatus) throws IOException {
    return readFooters(configuration, pathStatus, false);
  }

  /**
   * Reads the footers of all the files under the given path (recursively),
   * using summary files if possible.
   * @param configuration the configuration to access the file system
   * @param pathStatus the root dir
   * @param skipRowGroups whether to skip reading row group metadata
   * @return all the footers
   * @throws IOException if an exception is thrown while reading the footers
   * @deprecated will be removed in 2.0.0;
   *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
   */
  @Deprecated
  public static List<Footer> readFooters(Configuration configuration, FileStatus pathStatus, boolean skipRowGroups) throws IOException {
    List<FileStatus> files = listFiles(configuration, pathStatus);
    return readAllFootersInParallelUsingSummaryFiles(configuration, files, skipRowGroups);
  }

  private static List<FileStatus> listFiles(Configuration conf, FileStatus fileStatus) throws IOException {
    if (fileStatus.isDirectory()) {
      FileSystem fs = fileStatus.getPath().getFileSystem(conf);
      FileStatus[] list = fs.listStatus(fileStatus.getPath(), HiddenFileFilter.INSTANCE);
      List<FileStatus> result = new ArrayList<FileStatus>();
      for (FileStatus sub : list) {
        result.addAll(listFiles(conf, sub));
      }
      return result;
    } else {
      return Arrays.asList(fileStatus);
    }
  }

  /**
   * Specifically reads a given summary file
   * @param configuration a configuration
   * @param summaryStatus file status for a summary file
   * @return the metadata translated for each file
   * @throws IOException if an exception is thrown while reading the summary file
   * @deprecated metadata files are not recommended and will be removed in 2.0.0
   */
  @Deprecated
  public static List<Footer> readSummaryFile(Configuration configuration, FileStatus summaryStatus) throws IOException {
    final Path parent = summaryStatus.getPath().getParent();
    ParquetMetadata mergedFooters = readFooter(configuration, summaryStatus, filter(false));
    return footersFromSummaryFile(parent, mergedFooters);
  }

  static ParquetMetadata readSummaryMetadata(Configuration configuration, Path basePath, boolean skipRowGroups) throws IOException {
    Path metadataFile = new Path(basePath, PARQUET_METADATA_FILE);
    Path commonMetaDataFile = new Path(basePath, PARQUET_COMMON_METADATA_FILE);
    FileSystem fileSystem = basePath.getFileSystem(configuration);
    if (skipRowGroups && fileSystem.exists(commonMetaDataFile)) {
      // reading the summary file that does not contain the row groups
      LOG.info("reading summary file: {}", commonMetaDataFile);
      return readFooter(configuration, commonMetaDataFile, filter(skipRowGroups));
    } else if (fileSystem.exists(metadataFile)) {
      LOG.info("reading summary file: {}", metadataFile);
      return readFooter(configuration, metadataFile, filter(skipRowGroups));
    } else {
      return null;
    }
  }

  static List<Footer> footersFromSummaryFile(final Path parent, ParquetMetadata mergedFooters) {
    Map<Path, ParquetMetadata> footers = new HashMap<Path, ParquetMetadata>();
    List<BlockMetaData> blocks = mergedFooters.getBlocks();
    for (BlockMetaData block : blocks) {
      String path = block.getPath();
      Path fullPath = new Path(parent, path);
      ParquetMetadata current = footers.get(fullPath);
      if (current == null) {
        current = new ParquetMetadata(mergedFooters.getFileMetaData(), new ArrayList<BlockMetaData>());
        footers.put(fullPath, current);
      }
      current.getBlocks().add(block);
    }
    List<Footer> result = new ArrayList<Footer>();
    for (Entry<Path, ParquetMetadata> entry : footers.entrySet()) {
      result.add(new Footer(entry.getKey(), entry.getValue()));
    }
    return result;
  }

  /**
   * Reads the metadata block in the footer of the file.
   * @param configuration a configuration
   * @param file the Parquet file
   * @return the metadata read from the footer
   * @throws IOException if an error occurs while reading the file
   * @deprecated will be removed in 2.0.0;
   *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
   */
  @Deprecated
  public static final ParquetMetadata readFooter(Configuration configuration, Path file) throws IOException {
    return readFooter(configuration, file, NO_FILTER);
  }

  /**
   * Reads the metadata in the footer of the file,
   * skipping row groups (or not) based on the provided filter.
   * @param configuration a configuration
   * @param file the Parquet file
   * @param filter the filter to apply to row groups
   * @return the metadata with row groups filtered
   * @throws IOException if an error occurs while reading the file
   * @deprecated will be removed in 2.0.0;
   *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
   */
  @Deprecated
  public static ParquetMetadata readFooter(Configuration configuration, Path file, MetadataFilter filter) throws IOException {
    return readFooter(HadoopInputFile.fromPath(file, configuration), filter);
  }

  /**
   * @param configuration a configuration
   * @param file the Parquet File
   * @return the metadata with row groups.
   * @throws IOException  if an error occurs while reading the file
   * @deprecated will be removed in 2.0.0;
   *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
   */
  @Deprecated
  public static final ParquetMetadata readFooter(Configuration configuration, FileStatus file) throws IOException {
    return readFooter(configuration, file, NO_FILTER);
  }

  /**
   * Reads the metadata block in the footer of the file.
   * @param configuration a configuration
   * @param file the Parquet file
   * @param filter the filter to apply to row groups
   * @return the metadata read from the footer
   * @throws IOException if an error occurs while reading the file
   * @deprecated will be removed in 2.0.0;
   *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
   */
  @Deprecated
  public static final ParquetMetadata readFooter(Configuration configuration, FileStatus file, MetadataFilter filter) throws IOException {
    return readFooter(HadoopInputFile.fromStatus(file, configuration), filter);
  }

  /**
   * Reads the metadata block in the footer of the given input file.
   * @param file an {@link InputFile} to read
   * @param filter the filter to apply to row groups
   * @return the metadata blocks in the footer
   * @throws IOException if an error occurs while reading the file
   * @deprecated will be removed in 2.0.0;
   *             use {@link ParquetFileReader#open(InputFile, ParquetReadOptions)}
   */
  @Deprecated
  public static final ParquetMetadata readFooter(InputFile file, MetadataFilter filter) throws IOException {
    ParquetReadOptions options;
    if (file instanceof HadoopInputFile) {
      options = HadoopReadOptions.builder(((HadoopInputFile) file).getConfiguration())
          .withMetadataFilter(filter).build();
    } else {
      options = ParquetReadOptions.builder().withMetadataFilter(filter).build();
    }

    try (SeekableInputStream in = file.newStream()) {
      return readFooter(file, options, in);
    }
  }

  private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options, SeekableInputStream f) throws IOException {
    ParquetMetadataConverter converter = new ParquetMetadataConverter(options);
    return readFooter(file, options, f, converter);
  }

  private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options, SeekableInputStream f, ParquetMetadataConverter converter) throws IOException {
    long fileLen = file.getLength();
    LOG.debug("File length {}", fileLen);
    int FOOTER_LENGTH_SIZE = 4;
    if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC
      throw new RuntimeException(file.toString() + " is not a Parquet file (too small length: " + fileLen + ")");
    }
    long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC.length;
    LOG.debug("reading footer index at {}", footerLengthIndex);

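    // the file ends with a 4-byte little-endian footer length followed by the 4-byte magic number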
    f.seek(footerLengthIndex);
    int footerLength = readIntLittleEndian(f);
    byte[] magic = new byte[MAGIC.length];
    f.readFully(magic);
    if (!Arrays.equals(MAGIC, magic)) {
      throw new RuntimeException(file.toString() + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic));
    }
    long footerIndex = footerLengthIndex - footerLength;
    LOG.debug("read footer length: {}, footer index: {}", footerLength, footerIndex);
    if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) {
      throw new RuntimeException("corrupted file: the footer index is not within the file: " + footerIndex);
    }
    f.seek(footerIndex);
    // Read all the footer bytes at once to avoid multiple read operations,
    // since a single read operation can be quite time consuming in HDFS.
    ByteBuffer footerBytesBuffer = ByteBuffer.allocate(footerLength);
    f.readFully(footerBytesBuffer);
    LOG.debug("Finished reading all footer bytes.");
    footerBytesBuffer.flip();
    InputStream footerBytesStream = ByteBufferInputStream.wrap(footerBytesBuffer);
    return converter.readParquetMetadata(footerBytesStream, options.getMetadataFilter());
  }

  /**
   * @param conf a configuration
   * @param file a file path to open
   * @return a parquet file reader
   * @throws IOException if there is an error while opening the file
   * @deprecated will be removed in 2.0.0; use {@link #open(InputFile)}
   */
  @Deprecated
  public static ParquetFileReader open(Configuration conf, Path file) throws IOException {
    return new ParquetFileReader(HadoopInputFile.fromPath(file, conf),
        HadoopReadOptions.builder(conf).build());
  }

  /**
   * @param conf a configuration
   * @param file a file path to open
   * @param filter a metadata filter
   * @return a parquet file reader
   * @throws IOException if there is an error while opening the file
   * @deprecated will be removed in 2.0.0; use {@link #open(InputFile,ParquetReadOptions)}
   */
  @Deprecated
  public static ParquetFileReader open(Configuration conf, Path file, MetadataFilter filter) throws IOException {
    return open(HadoopInputFile.fromPath(file, conf),
        HadoopReadOptions.builder(conf).withMetadataFilter(filter).build());
  }

  /**
   * @param conf a configuration
   * @param file a file path to open
   * @param footer a footer for the file if already loaded
   * @return a parquet file reader
   * @throws IOException if there is an error while opening the file
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public static ParquetFileReader open(Configuration conf, Path file, ParquetMetadata footer) throws IOException {
    return new ParquetFileReader(conf, file, footer);
  }

  /**
   * Open a {@link InputFile file}.
   *
   * @param file an input file
   * @return an open ParquetFileReader
   * @throws IOException if there is an error while opening the file
   */
  public static ParquetFileReader open(InputFile file) throws IOException {
    return new ParquetFileReader(file, ParquetReadOptions.builder().build());
  }

  /**
   * Open a {@link InputFile file} with {@link ParquetReadOptions options}.
   *
   * @param file an input file
   * @param options parquet read options
   * @return an open ParquetFileReader
   * @throws IOException if there is an error while opening the file
   */
  public static ParquetFileReader open(InputFile file, ParquetReadOptions options) throws IOException {
    return new ParquetFileReader(file, options);
  }

  protected final SeekableInputStream f;
  private final InputFile file;
  private final ParquetReadOptions options;
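  // the columns requested for reading, keyed by their path; updated by setRequestedSchema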
  private final Map<ColumnPath, ColumnDescriptor> paths = new HashMap<>();
  private final FileMetaData fileMetaData; // may be null
  private final List<BlockMetaData> blocks;
  private final List<ColumnIndexStore> blockIndexStores;
  private final List<RowRanges> blockRowRanges;

  // not final. in some cases, this may be lazily loaded for backward-compat.
  private ParquetMetadata footer;

  private int currentBlock = 0;
  private ColumnChunkPageReadStore currentRowGroup = null;
  private DictionaryPageReader nextDictionaryReader = null;

  /**
   * @param configuration the Hadoop conf
   * @param filePath Path for the parquet file
   * @param blocks the blocks to read
   * @param columns the columns to read (their path)
   * @throws IOException if the file can not be opened
   * @deprecated will be removed in 2.0.0.
   */
  @Deprecated
  public ParquetFileReader(Configuration configuration, Path filePath, List<BlockMetaData> blocks,
                           List<ColumnDescriptor> columns) throws IOException {
    this(configuration, null, filePath, blocks, columns);
  }

  /**
   * @param configuration the Hadoop conf
   * @param fileMetaData fileMetaData for parquet file
   * @param filePath Path for the parquet file
   * @param blocks the blocks to read
   * @param columns the columns to read (their path)
   * @throws IOException if the file can not be opened
   * @deprecated will be removed in 2.0.0.
   */
  @Deprecated
  public ParquetFileReader(
      Configuration configuration, FileMetaData fileMetaData,
      Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
    this.converter = new ParquetMetadataConverter(configuration);
    this.file = HadoopInputFile.fromPath(filePath, configuration);
    this.fileMetaData = fileMetaData;
    this.f = file.newStream();
    this.options = HadoopReadOptions.builder(configuration).build();
    this.blocks = filterRowGroups(blocks);
    this.blockIndexStores = listWithNulls(this.blocks.size());
    this.blockRowRanges = listWithNulls(this.blocks.size());
    for (ColumnDescriptor col : columns) {
      paths.put(ColumnPath.get(col.getPath()), col);
    }
    this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
  }

  /**
   * @param conf the Hadoop Configuration
   * @param file Path to a parquet file
   * @param filter a {@link MetadataFilter} for selecting row groups
   * @throws IOException if the file can not be opened
   * @deprecated will be removed in 2.0.0.
   */
  @Deprecated
  public ParquetFileReader(Configuration conf, Path file, MetadataFilter filter) throws IOException {
    this(HadoopInputFile.fromPath(file, conf),
        HadoopReadOptions.builder(conf).withMetadataFilter(filter).build());
  }

  /**
   * @param conf the Hadoop Configuration
   * @param file Path to a parquet file
   * @param footer a {@link ParquetMetadata} footer already read from the file
   * @throws IOException if the file can not be opened
   * @deprecated will be removed in 2.0.0.
   */
  @Deprecated
  public ParquetFileReader(Configuration conf, Path file, ParquetMetadata footer) throws IOException {
    this.converter = new ParquetMetadataConverter(conf);
    this.file = HadoopInputFile.fromPath(file, conf);
    this.f = this.file.newStream();
    this.options = HadoopReadOptions.builder(conf).build();
    this.footer = footer;
    this.fileMetaData = footer.getFileMetaData();
    this.blocks = filterRowGroups(footer.getBlocks());
    this.blockIndexStores = listWithNulls(this.blocks.size());
    this.blockRowRanges = listWithNulls(this.blocks.size());
    for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
      paths.put(ColumnPath.get(col.getPath()), col);
    }
    this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
  }

  public ParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException {
    this.converter = new ParquetMetadataConverter(options);
    this.file = file;
    this.f = file.newStream();
    this.options = options;
    try {
      this.footer = readFooter(file, options, f, converter);
    } catch (Exception e) {
      // If reading the footer throws an exception in the constructor, the newly opened stream
      // must be closed here; otherwise there is no way to close it from outside.
      f.close();
      throw e;
    }
    this.fileMetaData = footer.getFileMetaData();
    this.blocks = filterRowGroups(footer.getBlocks());
    this.blockIndexStores = listWithNulls(this.blocks.size());
    this.blockRowRanges = listWithNulls(this.blocks.size());
    for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
      paths.put(ColumnPath.get(col.getPath()), col);
    }
    this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
  }

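  // creates a mutable list of the given size pre-filled with nulls, so entries can be lazily set by index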
  private static <T> List<T> listWithNulls(int size) {
    return Stream.generate(() -> (T) null).limit(size).collect(Collectors.toCollection(ArrayList<T>::new));
  }

  public ParquetMetadata getFooter() {
    if (footer == null) {
      try {
        // don't read the row groups because this.blocks is always set
        this.footer = readFooter(file, options, f, converter);
      } catch (IOException e) {
        throw new ParquetDecodingException("Unable to read file footer", e);
      }
    }
    return footer;
  }

  public FileMetaData getFileMetaData() {
    if (fileMetaData != null) {
      return fileMetaData;
    }
    return getFooter().getFileMetaData();
  }

  public long getRecordCount() {
    long total = 0;
    for (BlockMetaData block : blocks) {
      total += block.getRowCount();
    }
    return total;
  }

  public long getFilteredRecordCount() {
    if (!options.useColumnIndexFilter()) {
      return getRecordCount();
    }
    long total = 0;
    for (int i = 0, n = blocks.size(); i < n; ++i) {
      total += getRowRanges(i).rowCount();
    }
    return total;
  }

  /**
   * @return the path for this file
   * @deprecated will be removed in 2.0.0; use {@link #getFile()} instead
   */
  @Deprecated
  public Path getPath() {
    return new Path(file.toString());
  }

  public String getFile() {
    return file.toString();
  }

  private List<BlockMetaData> filterRowGroups(List<BlockMetaData> blocks) throws IOException {
    // set up data filters based on configured levels
    List<RowGroupFilter.FilterLevel> levels = new ArrayList<>();

    if (options.useStatsFilter()) {
      levels.add(STATISTICS);
    }

    if (options.useDictionaryFilter()) {
      levels.add(DICTIONARY);
    }

    if (options.useBloomFilter()) {
      levels.add(BLOOMFILTER);
    }

    FilterCompat.Filter recordFilter = options.getRecordFilter();
    if (recordFilter != null) {
      return RowGroupFilter.filterRowGroups(levels, recordFilter, blocks, this);
    }

    return blocks;
  }

  public List<BlockMetaData> getRowGroups() {
    return blocks;
  }

  public void setRequestedSchema(MessageType projection) {
    paths.clear();
    for (ColumnDescriptor col : projection.getColumns()) {
      paths.put(ColumnPath.get(col.getPath()), col);
    }
  }

  public void appendTo(ParquetFileWriter writer) throws IOException {
    writer.appendRowGroups(f, blocks, true);
  }

  /**
   * Reads all the columns requested from the row group at the current file position.
   * @throws IOException if an error occurs while reading
   * @return the PageReadStore which can provide PageReaders for each column.
   */
  public PageReadStore readNextRowGroup() throws IOException {
    if (currentBlock == blocks.size()) {
      return null;
    }
    BlockMetaData block = blocks.get(currentBlock);
    if (block.getRowCount() == 0) {
      throw new RuntimeException("Illegal row group of 0 rows");
    }
    this.currentRowGroup = new ColumnChunkPageReadStore(block.getRowCount());
    // prepare the list of consecutive parts to read them in one scan
    List<ConsecutivePartList> allParts = new ArrayList<ConsecutivePartList>();
    ConsecutivePartList currentParts = null;
    for (ColumnChunkMetaData mc : block.getColumns()) {
      ColumnPath pathKey = mc.getPath();
      BenchmarkCounter.incrementTotalBytes(mc.getTotalSize());
      ColumnDescriptor columnDescriptor = paths.get(pathKey);
      if (columnDescriptor != null) {
        long startingPos = mc.getStartingPos();
        // first part or not consecutive => new list
        if (currentParts == null || currentParts.endPos() != startingPos) {
          currentParts = new ConsecutivePartList(startingPos);
          allParts.add(currentParts);
        }
        currentParts.addChunk(new ChunkDescriptor(columnDescriptor, mc, startingPos, (int)mc.getTotalSize()));
      }
    }
    // actually read all the chunks
    ChunkListBuilder builder = new ChunkListBuilder();
    for (ConsecutivePartList consecutiveChunks : allParts) {
      consecutiveChunks.readAll(f, builder);
    }
    for (Chunk chunk : builder.build()) {
      currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
    }

    // avoid re-reading bytes if the dictionary reader is used after this call
    if (nextDictionaryReader != null) {
      nextDictionaryReader.setRowGroup(currentRowGroup);
    }

    advanceToNextBlock();

    return currentRowGroup;
  }

  /**
   * Reads all the columns requested from the row group at the current file position. It may skip specific pages based
   * on the column indexes according to the actual filter. As the rows are not aligned among the pages of the different
   * columns, row synchronization might be required. See the documentation of the class SynchronizingColumnReader for
   * details.
   *
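   * <p>
   * For example (a sketch, assuming column-index filtering is enabled in the read options):
   * <pre>{@code
   * PageReadStore pages;
   * while ((pages = reader.readNextFilteredRowGroup()) != null) {
   *   // the returned store may expose only a subset of the rows of the row group
   * }
   * }</pre>
   *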
   * @return the PageReadStore which can provide PageReaders for each column
   * @throws IOException
   *           if any I/O error occurs while reading
   */
  public PageReadStore readNextFilteredRowGroup() throws IOException {
    if (currentBlock == blocks.size()) {
      return null;
    }
    if (!options.useColumnIndexFilter()) {
      return readNextRowGroup();
    }
    BlockMetaData block = blocks.get(currentBlock);
    if (block.getRowCount() == 0) {
      throw new RuntimeException("Illegal row group of 0 rows");
    }
    ColumnIndexStore ciStore = getColumnIndexStore(currentBlock);
    RowRanges rowRanges = getRowRanges(currentBlock);
    long rowCount = rowRanges.rowCount();
    if (rowCount == 0) {
      // There are no matching rows -> skipping this row-group
      advanceToNextBlock();
      return readNextFilteredRowGroup();
    }
    if (rowCount == block.getRowCount()) {
      // All rows are matching -> fall back to the non-filtering path
      return readNextRowGroup();
    }

    this.currentRowGroup = new ColumnChunkPageReadStore(rowRanges);
    // prepare the list of consecutive parts to read them in one scan
    ChunkListBuilder builder = new ChunkListBuilder();
    List<ConsecutivePartList> allParts = new ArrayList<ConsecutivePartList>();
    ConsecutivePartList currentParts = null;
    for (ColumnChunkMetaData mc : block.getColumns()) {
      ColumnPath pathKey = mc.getPath();
      ColumnDescriptor columnDescriptor = paths.get(pathKey);
      if (columnDescriptor != null) {
        OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath());

        OffsetIndex filteredOffsetIndex = filterOffsetIndex(offsetIndex, rowRanges,
            block.getRowCount());
        for (OffsetRange range : calculateOffsetRanges(filteredOffsetIndex, mc, offsetIndex.getOffset(0))) {
          BenchmarkCounter.incrementTotalBytes(range.getLength());
          long startingPos = range.getOffset();
          // first part or not consecutive => new list
          if (currentParts == null || currentParts.endPos() != startingPos) {
            currentParts = new ConsecutivePartList(startingPos);
            allParts.add(currentParts);
          }
          ChunkDescriptor chunkDescriptor = new ChunkDescriptor(columnDescriptor, mc, startingPos,
              (int) range.getLength());
          currentParts.addChunk(chunkDescriptor);
          builder.setOffsetIndex(chunkDescriptor, filteredOffsetIndex);
        }
      }
    }
    // actually read all the chunks
    for (ConsecutivePartList consecutiveChunks : allParts) {
      consecutiveChunks.readAll(f, builder);
    }
    for (Chunk chunk : builder.build()) {
      currentRowGroup.addColumn(chunk.descriptor.col, chunk.readAllPages());
    }

    // avoid re-reading bytes if the dictionary reader is used after this call
    if (nextDictionaryReader != null) {
      nextDictionaryReader.setRowGroup(currentRowGroup);
    }

    advanceToNextBlock();

    return currentRowGroup;
  }

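  // lazily creates and caches the ColumnIndexStore for the given row group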
  private ColumnIndexStore getColumnIndexStore(int blockIndex) {
    ColumnIndexStore ciStore = blockIndexStores.get(blockIndex);
    if (ciStore == null) {
      ciStore = ColumnIndexStoreImpl.create(this, blocks.get(blockIndex), paths.keySet());
      blockIndexStores.set(blockIndex, ciStore);
    }
    return ciStore;
  }

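  // lazily calculates and caches the row ranges matching the record filter for the given row group, using its column indexes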
  private RowRanges getRowRanges(int blockIndex) {
    RowRanges rowRanges = blockRowRanges.get(blockIndex);
    if (rowRanges == null) {
      rowRanges = ColumnIndexFilter.calculateRowRanges(options.getRecordFilter(), getColumnIndexStore(blockIndex),
          paths.keySet(), blocks.get(blockIndex).getRowCount());
      blockRowRanges.set(blockIndex, rowRanges);
    }
    return rowRanges;
  }

  public boolean skipNextRowGroup() {
    return advanceToNextBlock();
  }

  private boolean advanceToNextBlock() {
    if (currentBlock == blocks.size()) {
      return false;
    }

    // advance to the next block and reset the dictionary reader so it is lazily created for the new block
    ++currentBlock;
    this.nextDictionaryReader = null;

    return true;
  }

  /**
   * Returns a {@link DictionaryPageReadStore} for the row group that would be
   * returned by calling {@link #readNextRowGroup()} or skipped by calling
   * {@link #skipNextRowGroup()}.
   *
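   * <p>
   * For example (a sketch of dictionary-based row group skipping):
   * <pre>{@code
   * DictionaryPageReadStore dictionaries = reader.getNextDictionaryReader();
   * // inspect the dictionaries, then either read or skip the row group
   * reader.skipNextRowGroup();
   * }</pre>
   *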
   * @return a DictionaryPageReadStore for the next row group
   */
  public DictionaryPageReadStore getNextDictionaryReader() {
    if (nextDictionaryReader == null && currentBlock < blocks.size()) {
      this.nextDictionaryReader = getDictionaryReader(blocks.get(currentBlock));
    }
    return nextDictionaryReader;
  }

  public DictionaryPageReader getDictionaryReader(BlockMetaData block) {
    return new DictionaryPageReader(this, block);
  }

  /**
   * Reads and decompresses a dictionary page for the given column chunk.
   *
   * Returns null if the given column chunk has no dictionary page.
   *
   * @param meta a column's ColumnChunkMetaData to read the dictionary from
   * @return an uncompressed DictionaryPage or null
   * @throws IOException if there is an error while reading the dictionary
   */
  DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
    if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY) &&
        !meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) {
      return null;
    }

    // TODO: this should use getDictionaryPageOffset() but it isn't reliable.
    if (f.getPos() != meta.getStartingPos()) {
      f.seek(meta.getStartingPos());
    }

    PageHeader pageHeader = Util.readPageHeader(f);
    if (!pageHeader.isSetDictionary_page_header()) {
      return null; // TODO: should this complain?
    }

    DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f);
    BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec());

    return new DictionaryPage(
        decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()),
        compressedPage.getDictionarySize(),
        compressedPage.getEncoding());
  }

  private DictionaryPage readCompressedDictionary(
      PageHeader pageHeader, SeekableInputStream fin) throws IOException {
    DictionaryPageHeader dictHeader = pageHeader.getDictionary_page_header();

    int uncompressedPageSize = pageHeader.getUncompressed_page_size();
    int compressedPageSize = pageHeader.getCompressed_page_size();

    byte [] dictPageBytes = new byte[compressedPageSize];
    fin.readFully(dictPageBytes);

    BytesInput bin = BytesInput.from(dictPageBytes);

    return new DictionaryPage(
        bin, uncompressedPageSize, dictHeader.getNum_values(),
        converter.getEncoding(dictHeader.getEncoding()));
  }

  public BloomFilterReader getBloomFilterDataReader(BlockMetaData block) {
    return new BloomFilterReader(this, block);
  }

  /**
   * Reads Bloom filter data for the given column chunk.
   *
   * @param meta a column's ColumnChunkMetaData to read the Bloom filter from
   * @return a BloomFilter object, or {@code null} if the Bloom filter could not be read
   * @throws IOException if there is an error while reading the Bloom filter
   */
  public BloomFilter readBloomFilter(ColumnChunkMetaData meta) throws IOException {
    long bloomFilterOffset = meta.getBloomFilterOffset();
    f.seek(bloomFilterOffset);
    BloomFilterHeader bloomFilterHeader;

    // Read Bloom filter data header.
    try {
      bloomFilterHeader = Util.readBloomFilterHeader(f);
    } catch (IOException e) {
      LOG.warn("could not read Bloom filter header", e);
      return null;
    }

    int numBytes = bloomFilterHeader.getNumBytes();
    if (numBytes <= 0 || numBytes > BlockSplitBloomFilter.UPPER_BOUND_BYTES) {
      LOG.warn("the read Bloom filter size is invalid: {}", bloomFilterHeader.getNumBytes());
      return null;
    }

    if (!bloomFilterHeader.getHash().isSetXXHASH() || !bloomFilterHeader.getAlgorithm().isSetBLOCK()
      || !bloomFilterHeader.getCompression().isSetUNCOMPRESSED()) {
      LOG.warn("the read Bloom filter is not supported yet: algorithm = {}, hash = {}, compression = {}",
        bloomFilterHeader.getAlgorithm(), bloomFilterHeader.getHash(), bloomFilterHeader.getCompression());
      return null;
    }

    byte[] bitset = new byte[numBytes];
    f.readFully(bitset);
    return new BlockSplitBloomFilter(bitset);
  }

  /**
   * @param column
   *          the column chunk which the column index is to be returned for
   * @return the column index for the specified column chunk or {@code null} if there is no index
   * @throws IOException
   *           if any I/O error occurs during reading the file
   */
  @Private
  public ColumnIndex readColumnIndex(ColumnChunkMetaData column) throws IOException {
    IndexReference ref = column.getColumnIndexReference();
    if (ref == null) {
      return null;
    }
    f.seek(ref.getOffset());
    return ParquetMetadataConverter.fromParquetColumnIndex(column.getPrimitiveType(), Util.readColumnIndex(f));
  }

  /**
   * @param column
   *          the column chunk which the offset index is to be returned for
   * @return the offset index for the specified column chunk or {@code null} if there is no index
   * @throws IOException
   *           if any I/O error occurs during reading the file
   */
  @Private
  public OffsetIndex readOffsetIndex(ColumnChunkMetaData column) throws IOException {
    IndexReference ref = column.getOffsetIndexReference();
    if (ref == null) {
      return null;
    }
    f.seek(ref.getOffset());
    return ParquetMetadataConverter.fromParquetOffsetIndex(Util.readOffsetIndex(f));
  }

  @Override
  public void close() throws IOException {
    try {
      if (f != null) {
        f.close();
      }
    } finally {
      options.getCodecFactory().release();
    }
  }

  /*
   * Builder to concatenate the buffers of the discontinuous parts for the same column. These parts are generated as a
   * result of the column-index based filtering when some pages are skipped during reading.
   */
  private class ChunkListBuilder {
    private class ChunkData {
      final List<ByteBuffer> buffers = new ArrayList<>();
      OffsetIndex offsetIndex;
    }

    private final Map<ChunkDescriptor, ChunkData> map = new HashMap<>();
    private ChunkDescriptor lastDescriptor;
    private SeekableInputStream f;

    void add(ChunkDescriptor descriptor, List<ByteBuffer> buffers, SeekableInputStream f) {
      ChunkData data = map.get(descriptor);
      if (data == null) {
        data = new ChunkData();
        map.put(descriptor, data);
      }
      data.buffers.addAll(buffers);

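      // remember the last descriptor added; build() may need to apply the workaround path for it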
      lastDescriptor = descriptor;
      this.f = f;
    }

    void setOffsetIndex(ChunkDescriptor descriptor, OffsetIndex offsetIndex) {
      ChunkData data = map.get(descriptor);
      if (data == null) {
        data = new ChunkData();
        map.put(descriptor, data);
      }
      data.offsetIndex = offsetIndex;
    }

    List<Chunk> build() {
      List<Chunk> chunks = new ArrayList<>();
      for (Entry<ChunkDescriptor, ChunkData> entry : map.entrySet()) {
        ChunkDescriptor descriptor = entry.getKey();
        ChunkData data = entry.getValue();
        if (descriptor.equals(lastDescriptor)) {
          // because of a bug, the last chunk might be larger than descriptor.size
          chunks.add(new WorkaroundChunk(lastDescriptor, data.buffers, f, data.offsetIndex));
        } else {
          chunks.add(new Chunk(descriptor, data.buffers, data.offsetIndex));
        }
      }
      return chunks;
    }
  }

  /**
   * The data for a column chunk
   */
  private class Chunk {

    protected final ChunkDescriptor descriptor;
    protected final ByteBufferInputStream stream;
    final OffsetIndex offsetIndex;

    /**
     * @param descriptor descriptor for the chunk
     * @param buffers ByteBuffers that contain the chunk
     * @param offsetIndex the offset index for this column; might be null
     */
    public Chunk(ChunkDescriptor descriptor, List<ByteBuffer> buffers, OffsetIndex offsetIndex) {
      this.descriptor = descriptor;
      this.stream = ByteBufferInputStream.wrap(buffers);
      this.offsetIndex = offsetIndex;
    }

    protected PageHeader readPageHeader() throws IOException {
      return Util.readPageHeader(stream);
    }

    /**
     * Calculates the CRC32 checksum of the input bytes and throws a decoding exception if it does not match the
     * provided reference CRC.
     */
    private void verifyCrc(int referenceCrc, byte[] bytes, String exceptionMsg) {
      crc.reset();
      crc.update(bytes);
      if (crc.getValue() != ((long) referenceCrc & 0xffffffffL)) {
        throw new ParquetDecodingException(exceptionMsg);
      }
    }

    /**
     * Reads all of the pages in this column chunk.
     * @return a ColumnChunkPageReader providing the pages of this chunk
     * @throws IOException if there is an error while reading the pages
     */
    public ColumnChunkPageReader readAllPages() throws IOException {
      List<DataPage> pagesInChunk = new ArrayList<DataPage>();
      DictionaryPage dictionaryPage = null;
      PrimitiveType type = getFileMetaData().getSchema()
          .getType(descriptor.col.getPath()).asPrimitiveType();
      long valuesCountReadSoFar = 0;
      int dataPageCountReadSoFar = 0;
      while (hasMorePages(valuesCountReadSoFar, dataPageCountReadSoFar)) {
        PageHeader pageHeader = readPageHeader();
        int uncompressedPageSize = pageHeader.getUncompressed_page_size();
        int compressedPageSize = pageHeader.getCompressed_page_size();
        final BytesInput pageBytes;
        switch (pageHeader.type) {
          case DICTIONARY_PAGE:
            // there is only one dictionary page per column chunk
            if (dictionaryPage != null) {
              throw new ParquetDecodingException("more than one dictionary page in column " + descriptor.col);
            }
            pageBytes = this.readAsBytesInput(compressedPageSize);
            if (options.usePageChecksumVerification() && pageHeader.isSetCrc()) {
              verifyCrc(pageHeader.getCrc(), pageBytes.toByteArray(),
                "could not verify dictionary page integrity, CRC checksum verification failed");
            }
            DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header();
            dictionaryPage =
                new DictionaryPage(
                    pageBytes,
                    uncompressedPageSize,
                    dicHeader.getNum_values(),
                    converter.getEncoding(dicHeader.getEncoding())
                    );
            // Copy crc to new page, used for testing
            if (pageHeader.isSetCrc()) {
              dictionaryPage.setCrc(pageHeader.getCrc());
            }
            break;
          case DATA_PAGE:
            DataPageHeader dataHeaderV1 = pageHeader.getData_page_header();
            pageBytes = this.readAsBytesInput(compressedPageSize);
            if (options.usePageChecksumVerification() && pageHeader.isSetCrc()) {
              verifyCrc(pageHeader.getCrc(), pageBytes.toByteArray(),
                "could not verify page integrity, CRC checksum verification failed");
            }
            DataPageV1 dataPageV1 = new DataPageV1(
              pageBytes,
              dataHeaderV1.getNum_values(),
              uncompressedPageSize,
              converter.fromParquetStatistics(
                getFileMetaData().getCreatedBy(),
                dataHeaderV1.getStatistics(),
                type),
              converter.getEncoding(dataHeaderV1.getRepetition_level_encoding()),
              converter.getEncoding(dataHeaderV1.getDefinition_level_encoding()),
              converter.getEncoding(dataHeaderV1.getEncoding()));
            // Copy crc to new page, used for testing
            if (pageHeader.isSetCrc()) {
              dataPageV1.setCrc(pageHeader.getCrc());
            }
            pagesInChunk.add(dataPageV1);
            valuesCountReadSoFar += dataHeaderV1.getNum_values();
            ++dataPageCountReadSoFar;
            break;
          case DATA_PAGE_V2:
            DataPageHeaderV2 dataHeaderV2 = pageHeader.getData_page_header_v2();
            int dataSize = compressedPageSize - dataHeaderV2.getRepetition_levels_byte_length() - dataHeaderV2.getDefinition_levels_byte_length();
            pagesInChunk.add(
                new DataPageV2(
                    dataHeaderV2.getNum_rows(),
                    dataHeaderV2.getNum_nulls(),
                    dataHeaderV2.getNum_values(),
                    this.readAsBytesInput(dataHeaderV2.getRepetition_levels_byte_length()),
                    this.readAsBytesInput(dataHeaderV2.getDefinition_levels_byte_length()),
                    converter.getEncoding(dataHeaderV2.getEncoding()),
                    this.readAsBytesInput(dataSize),
                    uncompressedPageSize,
                    converter.fromParquetStatistics(
                        getFileMetaData().getCreatedBy(),
                        dataHeaderV2.getStatistics(),
                        type),
                    dataHeaderV2.isIs_compressed()
                    ));
            valuesCountReadSoFar += dataHeaderV2.getNum_values();
            ++dataPageCountReadSoFar;
            break;
          default:
            LOG.debug("skipping page of type {} of size {}", pageHeader.getType(), compressedPageSize);
            stream.skipFully(compressedPageSize);
            break;
        }
      }
      if (offsetIndex == null && valuesCountReadSoFar != descriptor.metadata.getValueCount()) {
        // Would be nice to have a CorruptParquetFileException or something as a subclass?
        throw new IOException(
            "Expected " + descriptor.metadata.getValueCount() + " values in column chunk at " +
            getPath() + " offset " + descriptor.metadata.getFirstDataPageOffset() +
            " but got " + valuesCountReadSoFar + " values instead over " + pagesInChunk.size()
            + " pages ending at file offset " + (descriptor.fileOffset + stream.position()));
      }
      BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(descriptor.metadata.getCodec());
      return new ColumnChunkPageReader(decompressor, pagesInChunk, dictionaryPage, offsetIndex,
          blocks.get(currentBlock).getRowCount());
    }

    private boolean hasMorePages(long valuesCountReadSoFar, int dataPageCountReadSoFar) {
      return offsetIndex == null ? valuesCountReadSoFar < descriptor.metadata.getValueCount()
          : dataPageCountReadSoFar < offsetIndex.getPageCount();
    }

    /**
     * @param size the size of the page
     * @return the page
     * @throws IOException if there is an error while reading from the file stream
     */
    public BytesInput readAsBytesInput(int size) throws IOException {
      return BytesInput.from(stream.sliceBuffers(size));
    }

  }

  /**
   * Deals with a now-fixed bug where compressedLength was missing a few bytes.
   */
  private class WorkaroundChunk extends Chunk {

    private final SeekableInputStream f;

    /**
     * @param descriptor the descriptor of the chunk
     * @param buffers ByteBuffers that contain the chunk
     * @param f the file stream positioned at the end of this chunk
     * @param offsetIndex the offset index for this column; might be null
     */
    private WorkaroundChunk(ChunkDescriptor descriptor, List<ByteBuffer> buffers, SeekableInputStream f, OffsetIndex offsetIndex) {
      super(descriptor, buffers, offsetIndex);
      this.f = f;
    }

    @Override
    protected PageHeader readPageHeader() throws IOException {
      PageHeader pageHeader;
      stream.mark(8192); // headers should not be larger than 8k
      try {
        pageHeader = Util.readPageHeader(stream);
      } catch (IOException e) {
        // this is to workaround a bug where the compressedLength
        // of the chunk is missing the size of the header of the dictionary
        // to allow reading older files (using dictionary) we need this.
        // usually 13 to 19 bytes are missing
        // if the last page is smaller than this, the page header itself is truncated in the buffer.
        stream.reset(); // resetting the buffer to the position before we got the error
        LOG.info("completing the column chunk to read the page header");
        pageHeader = Util.readPageHeader(new SequenceInputStream(stream, f)); // trying again from the buffer + remainder of the stream.
      }
      return pageHeader;
    }

    @Override
    public BytesInput readAsBytesInput(int size) throws IOException {
      int available = stream.available();
      if (size > available) {
        // this is to workaround a bug where the compressedLength
        // of the chunk is missing the size of the header of the dictionary
        // to allow reading older files (using dictionary) we need this.
        // usually 13 to 19 bytes are missing
        int missingBytes = size - available;
        LOG.info("completed the column chunk with {} bytes", missingBytes);

        List<ByteBuffer> buffers = new ArrayList<>();
        buffers.addAll(stream.sliceBuffers(available));

        ByteBuffer lastBuffer = ByteBuffer.allocate(missingBytes);
        f.readFully(lastBuffer);
        buffers.add(lastBuffer);

        return BytesInput.from(buffers);
      }

      return super.readAsBytesInput(size);
    }

  }


  /**
   * Information needed to read a column chunk or a part of it.
   */
  private static class ChunkDescriptor {

    private final ColumnDescriptor col;
    private final ColumnChunkMetaData metadata;
    private final long fileOffset;
    private final int size;

    /**
     * @param col column this chunk is part of
     * @param metadata metadata for the column
     * @param fileOffset offset in the file where this chunk starts
     * @param size size of the chunk
     */
    private ChunkDescriptor(
        ColumnDescriptor col,
        ColumnChunkMetaData metadata,
        long fileOffset,
        int size) {
      super();
      this.col = col;
      this.metadata = metadata;
      this.fileOffset = fileOffset;
      this.size = size;
    }

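    // equality is intentionally based on the column only, so that ChunkListBuilder can merge the buffers of all
    // parts that belong to the same column chunk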
    @Override
    public int hashCode() {
      return col.hashCode();
    }

    @Override
    public boolean equals(Object obj) {
      if (this == obj) {
        return true;
      } else if (obj instanceof ChunkDescriptor) {
        return col.equals(((ChunkDescriptor) obj).col);
      } else {
        return false;
      }
    }
  }

  /**
   * Describes a list of consecutive parts to be read at once. A consecutive part may contain whole column chunks or
   * only parts of them (some pages).
   */
  private class ConsecutivePartList {

    private final long offset;
    private int length;
    private final List<ChunkDescriptor> chunks = new ArrayList<ChunkDescriptor>();

    /**
     * @param offset where the first chunk starts
     */
    ConsecutivePartList(long offset) {
      this.offset = offset;
    }

    /**
     * Adds a chunk to the list.
     * It must be consecutive to the previously added chunk.
     * @param descriptor a chunk descriptor
     */
    public void addChunk(ChunkDescriptor descriptor) {
      chunks.add(descriptor);
      length += descriptor.size;
    }

    /**
     * @param f file to read the chunks from
     * @param builder the builder that collects the chunks from which the pages of the different columns are read
     * @throws IOException if there is an error while reading from the stream
     */
    public void readAll(SeekableInputStream f, ChunkListBuilder builder) throws IOException {
      List<Chunk> result = new ArrayList<Chunk>(chunks.size());
      f.seek(offset);

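      // split the consecutive range into buffers of at most the configured max allocation size to avoid one huge allocation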
      int fullAllocations = length / options.getMaxAllocationSize();
      int lastAllocationSize = length % options.getMaxAllocationSize();

      int numAllocations = fullAllocations + (lastAllocationSize > 0 ? 1 : 0);
      List<ByteBuffer> buffers = new ArrayList<>(numAllocations);

      for (int i = 0; i < fullAllocations; i += 1) {
        buffers.add(options.getAllocator().allocate(options.getMaxAllocationSize()));
      }

      if (lastAllocationSize > 0) {
        buffers.add(options.getAllocator().allocate(lastAllocationSize));
      }

      for (ByteBuffer buffer : buffers) {
        f.readFully(buffer);
        buffer.flip();
      }

      // report in a counter the data we just scanned
      BenchmarkCounter.incrementBytesRead(length);
      ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffers);
      for (int i = 0; i < chunks.size(); i++) {
        ChunkDescriptor descriptor = chunks.get(i);
        builder.add(descriptor, stream.sliceBuffers(descriptor.size), f);
      }
    }

    /**
     * @return the position following the last byte of these chunks
     */
    public long endPos() {
      return offset + length;
    }

  }

}