org.apache.parquet.hadoop.api.ReadSupport Java Examples

The following examples show how to use org.apache.parquet.hadoop.api.ReadSupport. Each example is drawn from an open-source project; the source file, project, and license are noted above the code.
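Before the project-specific examples, here is a minimal, self-contained sketch of how a ReadSupport implementation is typically consumed: the stock GroupReadSupport is handed to a ParquetReader, and a column projection is requested through ReadSupport.PARQUET_READ_SCHEMA. The file path and the projection schema below are illustrative assumptions, not taken from any of the projects listed here.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.hadoop.example.GroupReadSupport;

public class ReadSupportUsageSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Request only a subset of the file's columns (projection push-down);
    // the projected fields must exist in the file schema.
    conf.set(ReadSupport.PARQUET_READ_SCHEMA,
        "message projection { optional binary name; optional int32 id; }");

    // GroupReadSupport materializes each record as a generic Group.
    try (ParquetReader<Group> reader = ParquetReader
        .builder(new GroupReadSupport(), new Path("/tmp/users.parquet"))
        .withConf(conf)
        .build()) {
      Group record;
      while ((record = reader.read()) != null) {
        System.out.println(record);
      }
    }
  }
}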
Example #1
Source File: PentahoApacheInputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public IPentahoRecordReader createRecordReader( IPentahoInputSplit split ) throws Exception {
  return inClassloader( () -> {
    PentahoInputSplitImpl pentahoInputSplit = (PentahoInputSplitImpl) split;
    InputSplit inputSplit = pentahoInputSplit.getInputSplit();

    ReadSupport<RowMetaAndData> readSupport = new PentahoParquetReadSupport();

    ParquetRecordReader<RowMetaAndData> nativeRecordReader =
      new ParquetRecordReader<>( readSupport, ParquetInputFormat.getFilter( job
        .getConfiguration() ) );
    TaskAttemptContextImpl task = new TaskAttemptContextImpl( job.getConfiguration(), new TaskAttemptID() );
    nativeRecordReader.initialize( inputSplit, task );

    return new PentahoParquetRecordReader( nativeRecordReader );
  } );
}
 
Example #2
Source File: ParquetReader.java    From tajo with Apache License 2.0
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
  this.footersIterator = footers.iterator();

  for (Footer footer : footers) {
    for(BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      totalRowCount += block.getRowCount();
    }
  }
}
 
Example #3
Source File: InternalParquetRecordReader.java    From tajo with Apache License 2.0
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  // capture the file schema first so the InitContext below sees it
  this.fileSchema = parquetFileMetadata.getSchema();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
 
Example #4
Source File: ThriftReadSupport.java    From parquet-mr with Apache License 2.0
@Override
public RecordMaterializer<T> prepareForRead(Configuration configuration,
    Map<String, String> keyValueMetaData, MessageType fileSchema,
    org.apache.parquet.hadoop.api.ReadSupport.ReadContext readContext) {
  ThriftMetaData thriftMetaData = ThriftMetaData.fromExtraMetaData(keyValueMetaData);
  try {
    initThriftClass(thriftMetaData, configuration);
  } catch (ClassNotFoundException e) {
    throw new RuntimeException("Cannot find Thrift object class for metadata: " + thriftMetaData, e);
  }

  // if there was no metadata in the file, get it from the requested class
  if (thriftMetaData == null) {
    thriftMetaData = ThriftMetaData.fromThriftClass(thriftClass);
  }

  String converterClassName = configuration.get(RECORD_CONVERTER_CLASS_KEY, RECORD_CONVERTER_DEFAULT);
  return getRecordConverterInstance(converterClassName, thriftClass,
      readContext.getRequestedSchema(), thriftMetaData.getDescriptor(),
      configuration);
}
 
Example #5
Source File: InternalParquetRecordReader.java    From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
 
Example #6
Source File: GroupReadSupport.java    From iow-hadoop-streaming with Apache License 2.0
@Override
public ReadContext init(
        Configuration configuration, Map<String, String> keyValueMetaData,
        MessageType fileSchema) {

    String partialSchemaString;
    String partialSchemaFile = configuration.get(PARQUET_READ_SCHEMA_FILE, "");
    if (!partialSchemaFile.isEmpty()) {
        StringBuilder r = new StringBuilder();
        try {
            BufferedReader br = new BufferedReader(new FileReader(new File(partialSchemaFile)));
            String line;
            while ((line = br.readLine()) != null)
                r.append(line);
        } catch (Exception e) {
            throw new RuntimeException("Can't read schema from file " + partialSchemaFile + ": " + e.getMessage());
        }

        partialSchemaString = r.toString();
    }
    else
        partialSchemaString = configuration.get(ReadSupport.PARQUET_READ_SCHEMA);

    return new ReadContext(getSchemaForRead(fileSchema, partialSchemaString));
}
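A brief usage sketch for this variant follows. It is a sketch only: the PARQUET_READ_SCHEMA_FILE constant is the one referenced in the code above, and its visibility, the paths, and the schema string are assumptions for illustration.

Configuration conf = new Configuration();

// Either point the read support at a local file holding the partial read schema
// (constant visibility and the path are assumed)...
conf.set(GroupReadSupport.PARQUET_READ_SCHEMA_FILE, "/tmp/projection.schema");

// ...or, as the fallback branch above shows, supply the schema string directly:
// conf.set(ReadSupport.PARQUET_READ_SCHEMA, "message projection { optional binary first_name; }");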
 
Example #7
Source File: GroupReadSupportTest.java    From parquet-mr with Apache License 2.0
@Test
public void testInitWithPartialSchema() {
  GroupReadSupport s = new GroupReadSupport();
  Configuration configuration = new Configuration();
  Map<String, String> keyValueMetaData = new HashMap<String, String>();
  MessageType fileSchema = MessageTypeParser.parseMessageType(fullSchemaStr);
  MessageType partialSchema = MessageTypeParser.parseMessageType(partialSchemaStr);
  configuration.set(ReadSupport.PARQUET_READ_SCHEMA, partialSchemaStr);

  ReadSupport.ReadContext context = s.init(configuration, keyValueMetaData, fileSchema);
  assertEquals(context.getRequestedSchema(), partialSchema);
}
 
Example #8
Source File: ParquetAsTextInputFormat.java    From iow-hadoop-streaming with Apache License 2.0
public TextRecordReaderWrapper(ParquetInputFormat<SimpleGroup> newInputFormat,
                           InputSplit oldSplit,
                           JobConf oldJobConf,
                           Reporter reporter) throws IOException {

    splitLen = oldSplit.getLength();

    try {
        ReadSupport<SimpleGroup> rs = ParquetInputFormat.getReadSupportInstance(oldJobConf);
        realReader = new ParquetRecordReader<>(rs);
        realReader.initialize(((StreamingParquetInputSplitWrapper)oldSplit).realSplit, oldJobConf, reporter);

        oldJobConf.set("map.input.file",((StreamingParquetInputSplitWrapper)oldSplit).realSplit.getPath().toString());
        oldJobConf.set("mapreduce.map.input.file",((StreamingParquetInputSplitWrapper)oldSplit).realSplit.getPath().toString());

        // read once to gain access to key and value objects
        if (realReader.nextKeyValue()) {

          firstRecord = true;
          valueContainer = new Container<>();
          SimpleGroup v = realReader.getCurrentValue();
          valueContainer.set(v);
          ls = groupToStrings(v);
        } else {

          eof = true;
        }
    } catch (InterruptedException e) {
        Thread.interrupted();
        throw new IOException(e);
    }
}
 
Example #9
Source File: ThriftReadSupport.java    From parquet-mr with Apache License 2.0
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
  final Configuration configuration = context.getConfiguration();
  final MessageType fileMessageType = context.getFileSchema();
  MessageType requestedProjection = fileMessageType;
  String partialSchemaString = configuration.get(ReadSupport.PARQUET_READ_SCHEMA);

  FieldProjectionFilter projectionFilter = getFieldProjectionFilter(configuration);

  if (partialSchemaString != null && projectionFilter != null) {
    throw new ThriftProjectionException(
        String.format("You cannot provide both a partial schema and field projection filter."
                + "Only one of (%s, %s, %s) should be set.",
            PARQUET_READ_SCHEMA, STRICT_THRIFT_COLUMN_FILTER_KEY, THRIFT_COLUMN_FILTER_KEY));
  }

  //set requestedProjections only when it's specified
  if (partialSchemaString != null) {
    requestedProjection = getSchemaForRead(fileMessageType, partialSchemaString);
  } else if (projectionFilter != null) {
    try {
      initThriftClassFromMultipleFiles(context.getKeyValueMetadata(), configuration);
      requestedProjection =  getProjectedSchema(projectionFilter);
    } catch (ClassNotFoundException e) {
      throw new ThriftProjectionException("can not find thriftClass from configuration", e);
    }
  }

  MessageType schemaForRead = getSchemaForRead(fileMessageType, requestedProjection);
  return new ReadContext(schemaForRead);
}
 
Example #10
Source File: ThriftParquetReader.java    From parquet-mr with Apache License 2.0
public ParquetReader<T> build() throws IOException {
  ReadSupport<T> readSupport;

  if (thriftClass != null) {
    readSupport = new ThriftReadSupport<T>(thriftClass);
  } else {
    readSupport = new ThriftReadSupport<T>();
  }

  return ParquetReader.builder(readSupport, file).withConf(conf).withFilter(filter).build();
}
 
Example #11
Source File: ParquetRecordReader.java    From flink with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration) {
	this.reader = reader;
	FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
	// real schema of parquet file
	this.fileSchema = parquetFileMetadata.getSchema();
	Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
	ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
		configuration, toSetMultiMap(fileMetadata), readSchema));

	this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
	this.recordMaterializer = readSupport.prepareForRead(
		configuration, fileMetadata, readSchema, readContext);
	this.numTotalRecords = reader.getRecordCount();
}
 
Example #12
Source File: TestParquetToThriftReadWriteAndProjection.java    From parquet-mr with Apache License 2.0
@Test
public void testThriftOptionalFieldsWithReadProjectionUsingParquetSchema() throws Exception {
  // test with projection
  Configuration conf = new Configuration();
  final String readProjectionSchema = "message AddressBook {\n" +
          "  optional group persons {\n" +
          "    repeated group persons_tuple {\n" +
          "      required group name {\n" +
          "        optional binary first_name;\n" +
          "        optional binary last_name;\n" +
          "      }\n" +
          "      optional int32 id;\n" +
          "    }\n" +
          "  }\n" +
          "}";
  conf.set(ReadSupport.PARQUET_READ_SCHEMA, readProjectionSchema);
  TBase toWrite = new AddressBook(
          Arrays.asList(
                  new Person(
                          new Name("Bob", "Roberts"),
                          0,
                          "[email protected]",
                          Arrays.asList(new PhoneNumber("1234567890")))));

  TBase toRead = new AddressBook(
          Arrays.asList(
                  new Person(
                          new Name("Bob", "Roberts"),
                          0,
                          null,
                          null)));
  shouldDoProjection(conf, toWrite, toRead, AddressBook.class);
}
 
Example #13
Source File: TestColumnIndexFiltering.java    From parquet-mr with Apache License 2.0
private List<User> readUsersWithProjection(Filter filter, MessageType schema, boolean useOtherFiltering, boolean useColumnIndexFilter) throws IOException {
  return PhoneBookWriter.readUsers(ParquetReader.builder(new GroupReadSupport(), file)
      .withFilter(filter)
      .useDictionaryFilter(useOtherFiltering)
      .useStatsFilter(useOtherFiltering)
      .useRecordFilter(useOtherFiltering)
      .useColumnIndexFilter(useColumnIndexFilter)
      .set(ReadSupport.PARQUET_READ_SCHEMA, schema.toString()));
}
 
Example #14
Source File: DataWritableReadSupport.java    From parquet-mr with Apache License 2.0
/**
 *
 * It creates the readContext for Parquet side with the requested schema during the init phase.
 *
 * @param configuration needed to get the wanted columns
 * @param keyValueMetaData // unused
 * @param fileSchema parquet file schema
 * @return the parquet ReadContext
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(final Configuration configuration,
    final Map<String, String> keyValueMetaData, final MessageType fileSchema) {
  final String columns = configuration.get(IOConstants.COLUMNS);
  final Map<String, String> contextMetadata = new HashMap<String, String>();
  if (columns != null) {
    final List<String> listColumns = getColumns(columns);

    final List<Type> typeListTable = new ArrayList<Type>();
    for (final String col : listColumns) {
      // listColumns contains partition columns which are metadata only
      if (fileSchema.containsField(col)) {
        typeListTable.add(fileSchema.getType(col));
      } else {
        // below allows schema evolution
        typeListTable.add(new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, col));
      }
    }
    MessageType tableSchema = new MessageType(TABLE_SCHEMA, typeListTable);
    contextMetadata.put(HIVE_SCHEMA_KEY, tableSchema.toString());

    MessageType requestedSchemaByUser = tableSchema;
    final List<Integer> indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);

    final List<Type> typeListWanted = new ArrayList<Type>();
    for (final Integer idx : indexColumnsWanted) {
      typeListWanted.add(tableSchema.getType(listColumns.get(idx)));
    }
    requestedSchemaByUser = resolveSchemaAccess(new MessageType(fileSchema.getName(),
            typeListWanted), fileSchema, configuration);

    return new ReadContext(requestedSchemaByUser, contextMetadata);
  } else {
    contextMetadata.put(HIVE_SCHEMA_KEY, fileSchema.toString());
    return new ReadContext(fileSchema, contextMetadata);
  }
}
 
Example #15
Source File: AvroParquetReader.java    From parquet-mr with Apache License 2.0
@Override
protected ReadSupport<T> getReadSupport() {
  if (isReflect) {
    conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, false);
  } else {
    conf.setBoolean(AvroReadSupport.AVRO_COMPATIBILITY, enableCompatibility);
  }
  return new AvroReadSupport<T>(model);
}
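As a usage note, a model-specific ReadSupport such as AvroReadSupport is normally consumed through AvroParquetReader rather than instantiated directly. A minimal sketch, assuming an Avro-written Parquet file at an illustrative path (requires org.apache.avro.generic.GenericRecord, org.apache.parquet.avro.AvroParquetReader, org.apache.parquet.hadoop.ParquetReader, and org.apache.hadoop.fs.Path):

static void dumpAvroRecords() throws java.io.IOException {
  // AvroParquetReader supplies an AvroReadSupport to the underlying ParquetReader.
  try (ParquetReader<GenericRecord> reader =
           AvroParquetReader.<GenericRecord>builder(new Path("/tmp/users.parquet")).build()) {
    GenericRecord record;
    while ((record = reader.read()) != null) {
      System.out.println(record);
    }
  }
}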
 
Example #16
Source File: DataWritableReadSupport.java    From parquet-mr with Apache License 2.0
/**
 *
 * It creates the hive read support to interpret data from parquet to hive
 *
 * @param configuration // unused
 * @param keyValueMetaData string map of metadata
 * @param fileSchema // unused
 * @param readContext containing the requested schema and the schema of the hive table
 * @return Record Materialize for Hive
 */
@Override
public RecordMaterializer<ArrayWritable> prepareForRead(final Configuration configuration,
    final Map<String, String> keyValueMetaData, final MessageType fileSchema,
        final org.apache.parquet.hadoop.api.ReadSupport.ReadContext readContext) {
  final Map<String, String> metadata = readContext.getReadSupportMetadata();
  if (metadata == null) {
    throw new IllegalStateException("ReadContext not initialized properly. " +
      "Don't know the Hive Schema.");
  }
  final MessageType tableSchema = resolveSchemaAccess(MessageTypeParser.
      parseMessageType(metadata.get(HIVE_SCHEMA_KEY)), fileSchema, configuration);

  return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema);
}
 
Example #17
Source File: DeprecatedInputFormatTest.java    From parquet-mr with Apache License 2.0
private void runMapReduceJob(CompressionCodecName codec) throws IOException, ClassNotFoundException, InterruptedException {

    final FileSystem fileSystem = parquetPath.getFileSystem(conf);
    fileSystem.delete(parquetPath, true);
    fileSystem.delete(outputPath, true);
    {
      writeJob = new Job(conf, "write");
      TextInputFormat.addInputPath(writeJob, inputPath);
      writeJob.setInputFormatClass(TextInputFormat.class);
      writeJob.setNumReduceTasks(0);
      ExampleOutputFormat.setCompression(writeJob, codec);
      ExampleOutputFormat.setOutputPath(writeJob, parquetPath);
      writeJob.setOutputFormatClass(ExampleOutputFormat.class);
      writeJob.setMapperClass(ReadMapper.class);
      ExampleOutputFormat.setSchema(
              writeJob,
              MessageTypeParser.parseMessageType(
                      writeSchema));
      writeJob.submit();
      waitForJob(writeJob);
    }
    {
      jobConf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
      jobConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, GroupReadSupport.class.getCanonicalName());
      jobConf.setInputFormat(MyDeprecatedInputFormat.class);
      MyDeprecatedInputFormat.setInputPaths(jobConf, parquetPath);
      jobConf.setOutputFormat(org.apache.hadoop.mapred.TextOutputFormat.class);
      org.apache.hadoop.mapred.TextOutputFormat.setOutputPath(jobConf, outputPath);
      jobConf.setMapperClass(DeprecatedWriteMapper.class);
      jobConf.setNumReduceTasks(0);
      mapRedJob = JobClient.runJob(jobConf);
    }
  }
 
Example #18
Source File: TajoReadSupport.java    From tajo with Apache License 2.0
/**
 * Initializes the ReadSupport.
 *
 * @param context The InitContext.
 * @return A ReadContext that defines how to read the file.
 */
@Override
public ReadSupport.ReadContext init(InitContext context) {
  if (requestedSchema == null) {
    throw new RuntimeException("requestedSchema is null.");
  }
  MessageType requestedParquetSchema =
    new TajoSchemaConverter().convert(requestedSchema);
  LOG.debug("Reading data with projection:\n" + requestedParquetSchema);
  return new ReadContext(requestedParquetSchema);
}
 
Example #19
Source File: GroupReadSupportTest.java    From parquet-mr with Apache License 2.0
@Test
public void testInitWithoutSpecifyingRequestSchema() throws Exception {
  GroupReadSupport s = new GroupReadSupport();
  Configuration configuration = new Configuration();
  Map<String, String> keyValueMetaData = new HashMap<String, String>();
  MessageType fileSchema = MessageTypeParser.parseMessageType(fullSchemaStr);

  ReadSupport.ReadContext context = s.init(configuration, keyValueMetaData, fileSchema);
  assertEquals(context.getRequestedSchema(), fileSchema);
}
 
Example #20
Source File: ParquetReader.java    From parquet-mr with Apache License 2.0
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      FilterCompat.Filter filter) throws IOException {
  this(Collections.singletonList((InputFile) HadoopInputFile.fromPath(file, conf)),
      HadoopReadOptions.builder(conf)
          .withRecordFilter(Objects.requireNonNull(filter, "filter cannot be null"))
          .build(),
      readSupport);
}
 
Example #21
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
/**
 * @param readSupportClass to instantiate
 * @param <T> the Java type of objects created by the ReadSupport
 * @return the configured read support
 */
@SuppressWarnings("unchecked")
static <T> ReadSupport<T> getReadSupportInstance(
    Class<? extends ReadSupport<T>> readSupportClass){
  try {
    return readSupportClass.newInstance();
  } catch (InstantiationException | IllegalAccessException e) {
    throw new BadConfigurationException("could not instantiate read support class", e);
  }
}
 
Example #22
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0
public ParquetReader.Builder<Group> createReaderBuilder() throws IOException {
  ReadConfigurator readConfigurator = getReadConfigurator();
  return readConfigurator.configureBuilder(
      new ParquetReader.Builder<Group>(HadoopInputFile.fromPath(file, new Configuration())) {
        @Override
        protected ReadSupport<Group> getReadSupport() {
          return new GroupReadSupport();
        }
      }.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, SCHEMA.toString()));
}
 
Example #23
Source File: GroupReadSupport.java    From parquet-mr with Apache License 2.0
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(
    Configuration configuration, Map<String, String> keyValueMetaData,
    MessageType fileSchema) {
  String partialSchemaString = configuration.get(ReadSupport.PARQUET_READ_SCHEMA);
  MessageType requestedProjection = getSchemaForRead(fileSchema, partialSchemaString);
  return new ReadContext(requestedProjection);
}
 
Example #24
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
/**
 * @param configuration to find the configuration for the read support
 * @return the configured read support
 * @deprecated use getReadSupportInstance static methods instead
 */
@Deprecated
@SuppressWarnings("unchecked")
ReadSupport<T> getReadSupport(Configuration configuration){
  return getReadSupportInstance(readSupportClass == null ?
      (Class<? extends ReadSupport<T>>) getReadSupportClass(configuration) :
      readSupportClass);
}
 
Example #25
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public RecordReader<Void, T> createRecordReader(
    InputSplit inputSplit,
    TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
  Configuration conf = ContextUtil.getConfiguration(taskAttemptContext);
  ReadSupport<T> readSupport = getReadSupport(conf);
  return new ParquetRecordReader<T>(readSupport, getFilter(conf));
}
 
Example #26
Source File: InternalParquetRecordReader.java    From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
 
Example #27
Source File: ParquetReader.java    From parquet-mr with Apache License 2.0
@Deprecated
private Builder(ReadSupport<T> readSupport, Path path) {
  this.readSupport = Objects.requireNonNull(readSupport, "readSupport cannot be null");
  this.file = null;
  this.path = Objects.requireNonNull(path, "path cannot be null");
  this.conf = new Configuration();
  this.optionsBuilder = HadoopReadOptions.builder(conf);
}
 
Example #28
Source File: ParquetReader.java    From parquet-mr with Apache License 2.0
private ParquetReader(List<InputFile> files,
                      ParquetReadOptions options,
                      ReadSupport<T> readSupport) throws IOException {
  this.readSupport = readSupport;
  this.options = options;
  this.filesIterator = files.iterator();
}
 
Example #29
Source File: ParquetReader.java    From parquet-mr with Apache License 2.0
public static <T> Builder<T> builder(ReadSupport<T> readSupport, Path path) {
  return new Builder<>(readSupport, path);
}