org.apache.parquet.hadoop.CodecFactory Java Examples

The following examples show how to use org.apache.parquet.hadoop.CodecFactory. Each example is taken from an open-source project; the source file and originating project are noted above each snippet.
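All of the usage patterns below boil down to two entry points: the CodecFactory(Configuration, int) constructor, which compresses into heap buffers, and the static CodecFactory.createDirectCodecFactory(Configuration, ByteBufferAllocator, int) helper, which compresses into direct buffers obtained from the supplied allocator. As a quick orientation, here is a minimal round-trip sketch that is not taken from any of the projects below; the gzip codec, the page-size hint, and the sample string are arbitrary choices:

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.bytes.BytesInput;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.hadoop.CodecFactory;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class CodecFactoryRoundTrip {
  public static void main(String[] args) throws Exception {
    // Heap-backed factory; the int argument is a page-size hint used to size internal buffers.
    final CompressionCodecFactory factory = new CodecFactory(new Configuration(), 64 * 1024);
    try {
      final byte[] raw = "hello parquet codecs".getBytes(StandardCharsets.UTF_8);

      // Compress a page's worth of bytes with gzip (chosen because it does not require native Hadoop codecs).
      final BytesInput compressed = factory.getCompressor(CompressionCodecName.GZIP)
          .compress(BytesInput.from(raw));
      final byte[] compressedBytes = compressed.toByteArray();

      // Decompress; the original uncompressed length must be supplied.
      final BytesInput restored = factory.getDecompressor(CompressionCodecName.GZIP)
          .decompress(BytesInput.from(compressedBytes), raw.length);

      System.out.println(new String(restored.toByteArray(), StandardCharsets.UTF_8));
    } finally {
      factory.release(); // returns pooled compressors/decompressors
    }
  }
}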
Example #1
Source File: GlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0
public static void main(String[] args) {
  try (final BufferAllocator bufferAllocator = new RootAllocator(VM.getMaxDirectMemory())) {
    final Path tableDir = Path.of(args[0]);
    final Configuration conf = new Configuration();
    final CompressionCodecFactory codecFactory = CodecFactory.createDirectCodecFactory(conf, new ParquetDirectByteBufferAllocator(bufferAllocator), 0);
    final FileSystem fs = HadoopFileSystem.get(tableDir, conf);
    if (fs.exists(tableDir) && fs.isDirectory(tableDir)) {
      Map<ColumnDescriptor, Path> dictionaryEncodedColumns = createGlobalDictionaries(codecFactory, fs, tableDir, bufferAllocator).getColumnsToDictionaryFiles();
      long version = getDictionaryVersion(fs, tableDir);
      Path dictionaryRootDir = getDictionaryVersionedRootPath(fs, tableDir, version);
      for (ColumnDescriptor columnDescriptor: dictionaryEncodedColumns.keySet()) {
        final VectorContainer data = readDictionary(fs, dictionaryRootDir, columnDescriptor, bufferAllocator);
        System.out.println("Dictionary for column [" + columnDescriptor.toString() + " size " + data.getRecordCount());
        BatchPrinter.printBatch(data);
        data.clear();
      }
    }
  } catch (IOException ioe) {
    logger.error("Failed ", ioe);
  }
}
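This main program expects the table directory as args[0]; it builds the global dictionaries for that Parquet table and then prints the contents of each per-column dictionary it finds.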
 
Example #2
Source File: ParquetFormatPlugin.java    From dremio-oss with Apache License 2.0
public PreviewReader(
    OperatorContext context,
    FileSystem fs,
    FileAttributes attributes
    ) throws IOException {
  super();
  this.context = context;
  this.fs = fs;
  this.attributes = attributes;
  final long maxFooterLen = context.getOptions().getOption(ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR);
  this.streamProvider = new SingleStreamProvider(fs, attributes.getPath(), attributes.size(), maxFooterLen, false, null);
  this.footer = this.streamProvider.getFooter();
  boolean autoCorrectCorruptDates = context.getOptions().getOption(ExecConstants.PARQUET_AUTO_CORRECT_DATES_VALIDATOR) &&
    getConfig().autoCorrectCorruptDates;
  this.dateStatus = ParquetReaderUtility.detectCorruptDates(footer, GroupScan.ALL_COLUMNS, autoCorrectCorruptDates);
  this.schemaHelper = SchemaDerivationHelper.builder()
      .readInt96AsTimeStamp(context.getOptions().getOption(ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP_VALIDATOR))
      .dateCorruptionStatus(dateStatus)
      .build();
  this.codec = CodecFactory.createDirectCodecFactory(new Configuration(), new ParquetDirectByteBufferAllocator(context.getAllocator()), 0);
}
 
Example #3
Source File: TestGlobalDictionaryPlan.java    From dremio-oss with Apache License 2.0
@BeforeClass
public static void setup() throws Exception {
  testRootAllocator = RootAllocatorFactory.newRoot(config);
  testAllocator = testRootAllocator.newChildAllocator("test-glb-dict", 0, testRootAllocator.getLimit());

  testNoResult("alter session set \"store.parquet.enable_dictionary_encoding_binary_type\"=true");
  testNoResult("CREATE TABLE dfs_test.globaldictionary AS SELECT * FROM cp.\"globaldictionary.json\"");
  testNoResult("CREATE TABLE dfs_test.places AS SELECT * FROM cp.\"places.json\"");
  final Configuration conf = new Configuration();
  codec = CodecFactory.createDirectCodecFactory(conf, new ParquetDirectByteBufferAllocator(testAllocator), 0);

  fs = HadoopFileSystem.getLocal(conf);

  tableDirPath1 = Path.of(getDfsTestTmpSchemaLocation() + "/globaldictionary");
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, tableDirPath1, testAllocator);

  tableDirPath2 = Path.of(getDfsTestTmpSchemaLocation() + "/places");
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, tableDirPath2, testAllocator);
}
 
Example #4
Source File: TestGlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0
@Test
public void testLocalDictionaries() throws IOException {
  try (final BufferAllocator bufferAllocator = allocatorRule.newAllocator("test-global-dictionary-builder", 0, Long.MAX_VALUE)) {
    final CompressionCodecFactory codecFactory = CodecFactory.createDirectCodecFactory(conf, new ParquetDirectByteBufferAllocator(bufferAllocator), 0);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries1 =
      LocalDictionariesReader.readDictionaries(fs, tableDirPath.resolve("phonebook1.parquet"), codecFactory);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries2 =
      LocalDictionariesReader.readDictionaries(fs, tableDirPath.resolve("phonebook2.parquet"), codecFactory);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries3 =
      LocalDictionariesReader.readDictionaries(fs, tableDirPath.resolve("phonebook3.parquet"), codecFactory);
    Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries4 =
      LocalDictionariesReader.readDictionaries(fs, partitionDirPath.resolve("phonebook4.parquet"), codecFactory);

    assertEquals(2, dictionaries1.getKey().size()); // name and kind have dictionaries
    assertEquals(1, dictionaries2.getKey().size());
    assertEquals(1, dictionaries3.getKey().size());
    assertEquals(1, dictionaries4.getKey().size());

    assertEquals(0, dictionaries1.getValue().size());
    assertEquals(1, dictionaries2.getValue().size()); // skip name
    assertEquals(1, dictionaries3.getValue().size()); // skip name
    assertEquals(1, dictionaries4.getValue().size()); // skip name
  }
}
 
Example #5
Source File: ParquetRecordReader.java    From Bats with Apache License 2.0
public ParquetRecordReader(FragmentContext fragmentContext,
    Path path,
    int rowGroupIndex,
    FileSystem fs,
    CodecFactory codecFactory,
    ParquetMetadata footer,
    List<SchemaPath> columns,
    ParquetReaderUtility.DateCorruptionStatus dateCorruptionStatus) {
  this(fragmentContext, footer.getBlocks().get(rowGroupIndex).getRowCount(), path, rowGroupIndex, fs, codecFactory,
      footer, columns, dateCorruptionStatus);
}
 
Example #6
Source File: ParquetRecordReader.java    From Bats with Apache License 2.0
public ParquetRecordReader(
    FragmentContext fragmentContext,
    long numRecordsToRead,
    Path path,
    int rowGroupIndex,
    FileSystem fs,
    CodecFactory codecFactory,
    ParquetMetadata footer,
    List<SchemaPath> columns,
    ParquetReaderUtility.DateCorruptionStatus dateCorruptionStatus) {

  this.name = path;
  this.hadoopPath = path;
  this.fileSystem = fs;
  this.codecFactory = codecFactory;
  this.rowGroupIndex = rowGroupIndex;
  this.footer = footer;
  this.dateCorruptionStatus = dateCorruptionStatus;
  this.fragmentContext = fragmentContext;
  this.numRecordsToRead = initNumRecordsToRead(numRecordsToRead, rowGroupIndex, footer);
  this.useAsyncColReader = fragmentContext.getOptions().getOption(ExecConstants.PARQUET_COLUMNREADER_ASYNC).bool_val;
  this.useAsyncPageReader = fragmentContext.getOptions().getOption(ExecConstants.PARQUET_PAGEREADER_ASYNC).bool_val;
  this.useBufferedReader = fragmentContext.getOptions().getOption(ExecConstants.PARQUET_PAGEREADER_USE_BUFFERED_READ).bool_val;
  this.bufferedReadSize = fragmentContext.getOptions().getOption(ExecConstants.PARQUET_PAGEREADER_BUFFER_SIZE).num_val.intValue();
  this.useFadvise = fragmentContext.getOptions().getOption(ExecConstants.PARQUET_PAGEREADER_USE_FADVISE).bool_val;
  this.readQueueSize = fragmentContext.getOptions().getOption(ExecConstants.PARQUET_PAGEREADER_QUEUE_SIZE).num_val;
  this.enforceTotalSize = fragmentContext.getOptions().getOption(ExecConstants.PARQUET_PAGEREADER_ENFORCETOTALSIZE).bool_val;
  this.useBulkReader = fragmentContext.getOptions().getOption(ExecConstants.PARQUET_FLAT_READER_BULK).bool_val;

  setColumns(columns);
}
 
Example #7
Source File: ParquetRecordReader.java    From Bats with Apache License 2.0
public ParquetRecordReader(FragmentContext fragmentContext,
    Path path,
    int rowGroupIndex,
    long numRecordsToRead,
    FileSystem fs,
    CodecFactory codecFactory,
    ParquetMetadata footer,
    List<SchemaPath> columns,
    ParquetReaderUtility.DateCorruptionStatus dateCorruptionStatus) {
  this(fragmentContext, numRecordsToRead, path, rowGroupIndex, fs, codecFactory, footer, columns, dateCorruptionStatus);
}
 
Example #8
Source File: ParquetRecordWriter.java    From Bats with Apache License 2.0
public ParquetRecordWriter(FragmentContext context, ParquetWriter writer) throws OutOfMemoryException {
  this.oContext = context.newOperatorContext(writer);
  this.codecFactory = CodecFactory.createDirectCodecFactory(writer.getFormatPlugin().getFsConf(),
      new ParquetDirectByteBufferAllocator(oContext.getAllocator()), pageSize);
  this.partitionColumns = writer.getPartitionColumns();
  this.hasPartitions = partitionColumns != null && partitionColumns.size() > 0;
  this.extraMetaData.put(DRILL_VERSION_PROPERTY, DrillVersionInfo.getVersion());
  this.extraMetaData.put(WRITER_VERSION_PROPERTY, String.valueOf(ParquetWriter.WRITER_VERSION));
  this.storageStrategy = writer.getStorageStrategy() == null ? StorageStrategy.DEFAULT : writer.getStorageStrategy();
  this.cleanUpLocations = Lists.newArrayList();
  this.conf = new Configuration(writer.getFormatPlugin().getFsConf());
}
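Note the size hint passed to createDirectCodecFactory: the writer-side constructors here and in Example #15 pass the Parquet page size, whereas the reader-side examples pass 0. The hint appears to be used to pre-size compression output buffers, which only matter when pages are being written, so readers can safely pass 0.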
 
Example #9
Source File: LocalDictionariesReader.java    From dremio-oss with Apache License 2.0
public static void main(String[] args) {
  try (final BufferAllocator bufferAllocator = new RootAllocator(VM.getMaxDirectMemory())) {
    final Configuration fsConf = new Configuration();
    final FileSystem fs = HadoopFileSystem.getLocal(fsConf);
    final Path filePath = Path.of(args[0]);
    final CompressionCodecFactory codecFactory = CodecFactory.createDirectCodecFactory(fsConf, new ParquetDirectByteBufferAllocator(bufferAllocator), 0);
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> dictionaries = readDictionaries(fs, filePath, codecFactory);
    for (Map.Entry<ColumnDescriptor, Dictionary> entry : dictionaries.getLeft().entrySet()) {
      printDictionary(entry.getKey(), entry.getValue());
    }
    System.out.println("Binary columns which are not dictionary encoded: " + dictionaries.getRight());
  } catch (IOException ioe) {
    logger.error("Failed ", ioe);
  }
}
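Here args[0] is the path of a single Parquet file; the program prints every local (per-file) dictionary and then lists the binary columns that are not dictionary encoded.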
 
Example #10
Source File: UnifiedParquetReader.java    From dremio-oss with Apache License 2.0
public UnifiedParquetReader(
    OperatorContext context,
    ParquetReaderFactory readerFactory,
    BatchSchema tableSchema,
    ParquetScanProjectedColumns projectedColumns,
    Map<String, GlobalDictionaryFieldInfo> globalDictionaryFieldInfoMap,
    List<ParquetFilterCondition> filterConditions,
    ParquetFilterCreator filterCreator,
    ParquetDictionaryConvertor dictionaryConvertor,
    ParquetDatasetSplitScanXAttr readEntry,
    FileSystem fs,
    ParquetMetadata footer,
    GlobalDictionaries dictionaries,
    SchemaDerivationHelper schemaHelper,
    boolean vectorize,
    boolean enableDetailedTracing,
    boolean supportsColocatedReads,
    InputStreamProvider inputStreamProvider) {
  super();
  this.context = context;
  this.readerFactory = readerFactory;
  this.globalDictionaryFieldInfoMap = globalDictionaryFieldInfoMap;
  this.filterConditions = filterConditions;
  this.filterCreator = filterCreator;
  this.dictionaryConvertor = dictionaryConvertor;
  this.fs = fs;
  this.footer = footer;
  this.readEntry = readEntry;
  this.vectorize = vectorize;
  this.tableSchema = tableSchema;
  this.projectedColumns = projectedColumns;
  this.columnResolver = null;
  this.dictionaries = dictionaries;
  this.codecFactory = CodecFactory.createDirectCodecFactory(new Configuration(), new ParquetDirectByteBufferAllocator(context.getAllocator()), 0);
  this.enableDetailedTracing = enableDetailedTracing;
  this.inputStreamProvider = inputStreamProvider;
  this.schemaHelper = schemaHelper;
  this.supportsColocatedReads = supportsColocatedReads;
  this.ignoreSchemaLearning = false;
}
 
Example #11
Source File: TestTpchDistributedWithGlobalDictionaries.java    From dremio-oss with Apache License 2.0
@BeforeClass
public static void setup() throws Exception {
  testRootAllocator = RootAllocatorFactory.newRoot(config);
  testAllocator = testRootAllocator.newChildAllocator("test-tpch-distrib", 0, testRootAllocator.getLimit());

  testNoResult("alter session set \"store.parquet.enable_dictionary_encoding_binary_type\"=true");
  final Configuration conf = new Configuration();
  final CompressionCodecFactory codec = CodecFactory.createDirectCodecFactory(conf, new ParquetDirectByteBufferAllocator(testAllocator), 0);

  fs = HadoopFileSystem.getLocal(conf);
  testNoResult("CREATE TABLE dfs_test.tpch_lineitem_gd AS SELECT * FROM cp.\"tpch/lineitem.parquet\"");
  testNoResult("CREATE TABLE dfs_test.tpch_customer_gd AS SELECT * FROM cp.\"tpch/customer.parquet\"");
  testNoResult("CREATE TABLE dfs_test.tpch_part_gd AS SELECT * FROM cp.\"tpch/part.parquet\"");
  testNoResult("CREATE TABLE dfs_test.tpch_partsupp_gd AS SELECT * FROM cp.\"tpch/partsupp.parquet\"");
  testNoResult("CREATE TABLE dfs_test.tpch_region_gd AS SELECT * FROM cp.\"tpch/region.parquet\"");
  testNoResult("CREATE TABLE dfs_test.tpch_nation_gd AS SELECT * FROM cp.\"tpch/nation.parquet\"");
  testNoResult("CREATE TABLE dfs_test.tpch_supplier_gd AS SELECT * FROM cp.\"tpch/supplier.parquet\"");
  testNoResult("CREATE TABLE dfs_test.tpch_orders_gd AS SELECT * FROM cp.\"tpch/orders.parquet\"");

  lineitem = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_lineitem_gd");
  customer = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_customer_gd");
  part = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_part_gd");
  partsupp = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_partsupp_gd");
  region = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_region_gd");
  nation = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_nation_gd");
  supplier = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_supplier_gd");
  orders = Path.of(getDfsTestTmpSchemaLocation() + "/tpch_orders_gd");

  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, lineitem, testAllocator);
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, customer, testAllocator);
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, part, testAllocator);
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, partsupp, testAllocator);
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, region, testAllocator);
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, nation, testAllocator);
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, supplier, testAllocator);
  GlobalDictionaryBuilder.createGlobalDictionaries(codec, fs, orders, testAllocator);
  disableGlobalDictionary();
}
 
Example #12
Source File: HadoopCodecs.java    From parquet-mr with Apache License 2.0
public static CompressionCodecFactory newDirectFactory(Configuration conf, ByteBufferAllocator allocator, int sizeHint) {
  return CodecFactory.createDirectCodecFactory(conf, allocator, sizeHint);
}
 
Example #13
Source File: HadoopCodecs.java    From parquet-mr with Apache License 2.0
public static CompressionCodecFactory newFactory(Configuration conf, int sizeHint) {
  return new CodecFactory(conf, sizeHint);
}
 
Example #14
Source File: HadoopCodecs.java    From parquet-mr with Apache License 2.0
public static CompressionCodecFactory newFactory(int sizeHint) {
  return new CodecFactory(new Configuration(), sizeHint);
}
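The three HadoopCodecs helpers above are thin wrappers around the same two entry points used throughout this page: newFactory delegates to the heap-buffer CodecFactory constructor, while newDirectFactory delegates to createDirectCodecFactory so that buffers come from the caller-supplied ByteBufferAllocator.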
 
Example #15
Source File: ParquetRecordWriter.java    From dremio-oss with Apache License 2.0
public ParquetRecordWriter(OperatorContext context, ParquetWriter writer, ParquetFormatConfig config) throws OutOfMemoryException {
  this.context = context;
  this.codecAllocator = context.getAllocator().newChildAllocator("ParquetCodecFactory", 0, Long.MAX_VALUE);
  this.columnEncoderAllocator = context.getAllocator().newChildAllocator("ParquetColEncoder", 0, Long.MAX_VALUE);
  this.codecFactory = CodecFactory.createDirectCodecFactory(new Configuration(),
      new ParquetDirectByteBufferAllocator(codecAllocator), pageSize);
  this.extraMetaData.put(DREMIO_VERSION_PROPERTY, DremioVersionInfo.getVersion());
  this.extraMetaData.put(IS_DATE_CORRECT_PROPERTY, "true");

  this.plugin = writer.getFormatPlugin().getFsPlugin();
  this.queryUser = writer.getProps().getUserName();

  FragmentHandle handle = context.getFragmentHandle();
  String fragmentId = String.format("%d_%d", handle.getMajorFragmentId(), handle.getMinorFragmentId());

  this.location = writer.getLocation();
  this.prefix = fragmentId;
  this.extension = config.outputExtension;
  if (writer.getOptions() != null) {
    this.partitionColumns = writer.getOptions().getPartitionColumns();
    this.isIcebergWriter = (writer.getOptions().getIcebergWriterOperation() != WriterOptions.IcebergWriterOperation.NONE);
  } else {
    this.partitionColumns = null;
    this.isIcebergWriter = false;
  }

  if (this.isIcebergWriter && writer.getOptions().getExtendedProperty() != null) {
    initIcebergColumnIDList(writer.getOptions().getExtendedProperty());
  }

  memoryThreshold = (int) context.getOptions().getOption(ExecConstants.PARQUET_MEMORY_THRESHOLD_VALIDATOR);
  blockSize = (int) context.getOptions().getOption(ExecConstants.PARQUET_BLOCK_SIZE_VALIDATOR);
  pageSize = (int) context.getOptions().getOption(ExecConstants.PARQUET_PAGE_SIZE_VALIDATOR);
  final String codecName = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_COMPRESSION_TYPE_VALIDATOR).toLowerCase();
  switch(codecName) {
  case "snappy":
    codec = CompressionCodecName.SNAPPY;
    break;
  case "lzo":
    codec = CompressionCodecName.LZO;
    break;
  case "gzip":
    codec = CompressionCodecName.GZIP;
    break;
  case "none":
  case "uncompressed":
    codec = CompressionCodecName.UNCOMPRESSED;
    break;
  default:
    throw new UnsupportedOperationException(String.format("Unknown compression type: %s", codecName));
  }

  enableDictionary = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING_VALIDATOR);
  enableDictionaryForBinary = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING_BINARY_TYPE_VALIDATOR);
  maxPartitions = context.getOptions().getOption(ExecConstants.PARQUET_MAXIMUM_PARTITIONS_VALIDATOR);
  minRecordsForFlush = context.getOptions().getOption(ExecConstants.PARQUET_MIN_RECORDS_FOR_FLUSH_VALIDATOR);
  parquetFileWriteTimeThresholdMilliSecs = (int)context.getOptions().getOption(ExecConstants.PARQUET_WRITE_TIME_THRESHOLD_MILLI_SECS_VALIDATOR);
  parquetFileWriteIoRateThresholdMbps = context.getOptions().getOption(ExecConstants.PARQUET_WRITE_IO_RATE_THRESHOLD_MBPS_VALIDATOR);
}
 
Example #16
Source File: ParquetFormatDatasetAccessor.java    From dremio-oss with Apache License 2.0
/**
 * Read the records in the first parquet file to generate schema for selected parquet files
 *
 * @param selection parquet file selection
 * @param fs        file system wrapper
 * @return schema of selected parquet files
 */
private BatchSchema getBatchSchemaFromReader(final FileSelection selection, final FileSystem fs) throws Exception {
  final SabotContext context = ((ParquetFormatPlugin) formatPlugin).getContext();

  try (
      BufferAllocator sampleAllocator = context.getAllocator().newChildAllocator("sample-alloc", 0, Long.MAX_VALUE);
      OperatorContextImpl operatorContext = new OperatorContextImpl(context.getConfig(), sampleAllocator, context.getOptionManager(), 1000);
      SampleMutator mutator = new SampleMutator(sampleAllocator)
  ) {
    final CompressionCodecFactory codec = CodecFactory.createDirectCodecFactory(new Configuration(),
        new ParquetDirectByteBufferAllocator(operatorContext.getAllocator()), 0);
    for (FileAttributes firstFile : selection.getFileAttributesList()) {
      ParquetMetadata footer = SingletonParquetFooterCache.readFooter(fsPlugin.getSystemUserFS(), firstFile, ParquetMetadataConverter.NO_FILTER,
        context.getOptionManager().getOption(ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR));

      if (footer.getBlocks().size() == 0) {
        continue;
      }

      final boolean autoCorrectCorruptDates = context.getOptionManager().getOption(ExecConstants.PARQUET_AUTO_CORRECT_DATES_VALIDATOR) &&
        ((ParquetFormatPlugin) formatPlugin).getConfig().autoCorrectCorruptDates;
      final ParquetReaderUtility.DateCorruptionStatus dateStatus = ParquetReaderUtility.detectCorruptDates(footer, GroupScan.ALL_COLUMNS,
          autoCorrectCorruptDates);
      final SchemaDerivationHelper schemaHelper = SchemaDerivationHelper.builder()
          .readInt96AsTimeStamp(operatorContext.getOptions().getOption(PARQUET_READER_INT96_AS_TIMESTAMP).getBoolVal())
          .dateCorruptionStatus(dateStatus)
          .build();

      boolean isAccelerator = fsPlugin.getId().getName().equals(ACCELERATOR_STORAGEPLUGIN_NAME);

      final ImplicitFilesystemColumnFinder finder = new ImplicitFilesystemColumnFinder(context.getOptionManager(), fs, GroupScan.ALL_COLUMNS, isAccelerator);

      final long maxFooterLen = context.getOptionManager().getOption(ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR);
      try (InputStreamProvider streamProvider = new SingleStreamProvider(fs, firstFile.getPath(), firstFile.size(), maxFooterLen, false, null);
          RecordReader reader = new AdditionalColumnsRecordReader(new ParquetRowiseReader(operatorContext, footer, 0,
               firstFile.getPath().toString(), ParquetScanProjectedColumns.fromSchemaPaths(GroupScan.ALL_COLUMNS),
               fs, schemaHelper, streamProvider, codec, true), finder.getImplicitFieldsForSample(selection))) {

        reader.setup(mutator);

        mutator.allocate(100);
        // Read the parquet file to populate inner list types
        reader.next();

        mutator.getContainer().buildSchema(BatchSchema.SelectionVectorMode.NONE);
        return mutator.getContainer().getSchema();
      }
    }
  } catch (Exception e) {
    throw e;
  }

  throw UserException.dataReadError().message("Only empty parquet files found.").build(logger);
}
 
Example #17
Source File: ParquetRecordReader.java    From Bats with Apache License 2.0
public CodecFactory getCodecFactory() {
  return codecFactory;
}
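The exposed factory lets downstream column readers obtain a decompressor per codec and decompress page bytes, along the lines of the round-trip sketch near the top of this page.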