org.apache.parquet.hadoop.api.InitContext Java Examples

The following examples show how to use org.apache.parquet.hadoop.api.InitContext. They are drawn from several open source projects; the source file, project, and license are noted above each example.
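All of these implementations follow the same contract: ReadSupport.init receives an InitContext carrying the Configuration, the file schema, and the footer key/value metadata, and returns a ReadContext naming the (possibly projected) schema to read. As a baseline before the project-specific variants, here is a minimal pass-through sketch; the class name PassThroughReadSupport is made up for illustration, and the Group materializer mirrors the one in parquet-mr's example module:

import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.api.InitContext;
import org.apache.parquet.hadoop.api.ReadSupport;
import org.apache.parquet.io.api.RecordMaterializer;
import org.apache.parquet.schema.MessageType;

public class PassThroughReadSupport extends ReadSupport<Group> {

  @Override
  public ReadContext init(InitContext context) {
    // Returning the file schema unchanged requests every column;
    // returning a pruned MessageType instead would project a subset.
    MessageType fileSchema = context.getFileSchema();
    return new ReadContext(fileSchema);
  }

  @Override
  public RecordMaterializer<Group> prepareForRead(Configuration configuration,
      Map<String, String> keyValueMetaData, MessageType fileSchema,
      ReadContext readContext) {
    // Materialize rows as generic Group records using the schema that
    // init() requested.
    return new GroupRecordConverter(readContext.getRequestedSchema());
  }
}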
Example #1
Source File: PentahoParquetReadSupport.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public ReadContext init( InitContext context ) {
  String schemaStr = context.getConfiguration().get( ParquetConverter.PARQUET_SCHEMA_CONF_KEY );
  if ( schemaStr == null ) {
    throw new RuntimeException( "Schema not defined in the PentahoParquetSchema key" );
  }

  ParquetInputFieldList schema = ParquetInputFieldList.unmarshall( schemaStr );
  converter = new ParquetConverter( schema.getFields() );

  // select from the file's schema only the fields that were requested
  MessageType fileSchema = context.getFileSchema();
  List<Type> newFields = new ArrayList<>();
  for ( IParquetInputField f : schema ) {
    Type origField = fileSchema.getFields().get( fileSchema.getFieldIndex( f.getFormatFieldName() ) );
    newFields.add( origField );
  }
  if ( newFields.isEmpty() ) {
    throw new RuntimeException( "Fields should be declared" );
  }
  MessageType newSchema = new MessageType( fileSchema.getName(), newFields );

  return new ReadContext( newSchema, new HashMap<>() );
}
 
Example #2
Source File: InternalParquetRecordReader.java    From tajo with Apache License 2.0
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  // the file schema must be assigned before it is passed to ReadSupport.init
  this.fileSchema = parquetFileMetadata.getSchema();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
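The toSetMultiMap helper above is private to the reader and not shown in these excerpts. The InitContext constructor takes the footer metadata as a Map<String, Set<String>> because split planning can merge the footers of several files, so one key may carry several values. A sketch consistent with how the helper is used here, assuming each project's private version behaves the same way:

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

// Wrap each metadata value in a singleton set to match the
// Map<String, Set<String>> shape expected by InitContext.
private static <K, V> Map<K, Set<V>> toSetMultiMap(Map<K, V> map) {
  Map<K, Set<V>> setMultiMap = new HashMap<>();
  for (Map.Entry<K, V> entry : map.entrySet()) {
    setMultiMap.put(entry.getKey(), Collections.singleton(entry.getValue()));
  }
  return Collections.unmodifiableMap(setMultiMap);
}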
 
Example #3
Source File: InternalParquetRecordReader.java    From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
 
Example #4
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
/**
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException if there is an error while reading
 * @deprecated split planning using file footers will be removed
 */
@Deprecated
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers) throws IOException {
  boolean strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
  final long minSplitSize = Math.max(getFormatMinSplitSize(), configuration.getLong("mapred.min.split.size", 0L));
  if (maxSplitSize < 0 || minSplitSize < 0) {
    throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = " + maxSplitSize + "; minSplitSize = " + minSplitSize);
  }
  GlobalMetaData globalMetaData = ParquetFileWriter.getGlobalMetaData(footers, strictTypeChecking);
  ReadContext readContext = getReadSupport(configuration).init(new InitContext(
      configuration,
      globalMetaData.getKeyValueMetaData(),
      globalMetaData.getSchema()));

  return new ClientSideMetadataSplitStrategy().getSplits(
      configuration, footers, maxSplitSize, minSplitSize, readContext);
}
 
Example #5
Source File: TestTupleRecordConsumer.java    From parquet-mr with Apache License 2.0
private RecordMaterializer<Tuple> newPigRecordConsumer(String pigSchemaString) throws ParserException {
  TupleReadSupport tupleReadSupport = new TupleReadSupport();
  final Configuration configuration = new Configuration(false);
  MessageType parquetSchema = getMessageType(pigSchemaString);
  final Map<String, String> pigMetaData = pigMetaData(pigSchemaString);
  Map<String, Set<String>> globalMetaData = new HashMap<String, Set<String>>();
  for (Entry<String, String> entry : pigMetaData.entrySet()) {
    globalMetaData.put(entry.getKey(), new HashSet<String>(Arrays.asList(entry.getValue())));
  }
  configuration.set(PARQUET_PIG_SCHEMA, pigSchemaString);
  final ReadContext init = tupleReadSupport.init(new InitContext(configuration, globalMetaData, parquetSchema));
  return tupleReadSupport.prepareForRead(configuration, pigMetaData, parquetSchema, init);
}
 
Example #6
Source File: ParquetReadSupport.java    From iceberg with Apache License 2.0
@Override
@SuppressWarnings("deprecation")
public ReadContext init(Configuration configuration, Map<String, String> keyValueMetaData, MessageType fileSchema) {
  // Columns are selected from the Parquet file by taking the read context's message type and
  // matching to the file's columns by full path, so this must select columns by using the path
  // in the file's schema.

  MessageType projection = hasIds(fileSchema) ?
    pruneColumns(fileSchema, expectedSchema) :
    pruneColumnsFallback(fileSchema, expectedSchema);

  // override some known backward-compatibility options
  configuration.set("parquet.strict.typing", "false");
  configuration.set("parquet.avro.add-list-element-records", "false");
  configuration.set("parquet.avro.write-old-list-structure", "false");

  // set Avro schemas in case the reader is Avro
  AvroReadSupport.setRequestedProjection(configuration,
      AvroSchemaUtil.convert(expectedSchema, projection.getName()));
  org.apache.avro.Schema avroReadSchema = AvroSchemaUtil.buildAvroProjection(
      AvroSchemaUtil.convert(ParquetSchemaUtil.convert(projection), projection.getName()),
      expectedSchema, ImmutableMap.of());
  AvroReadSupport.setAvroReadSchema(configuration, ParquetAvro.parquetAvroSchema(avroReadSchema));

  // let the context set up read support metadata, but always use the correct projection
  ReadContext context = null;
  if (callInit) {
    try {
      context = wrapped.init(configuration, keyValueMetaData, projection);
    } catch (UnsupportedOperationException e) {
      // try the InitContext version
      context = wrapped.init(new InitContext(
          configuration, makeMultimap(keyValueMetaData), projection));
    }
  }

  return new ReadContext(projection,
      context != null ? context.getReadSupportMetadata() : ImmutableMap.of());
}
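The try/catch above works because parquet-mr's ReadSupport base class bridges its two init signatures: the deprecated three-argument overload throws UnsupportedOperationException unless a subclass overrides it, so a wrapped support that only implements the InitContext overload lands in the catch block. Roughly (a paraphrased sketch, not the verbatim parquet-mr source):

// From org.apache.parquet.hadoop.api.ReadSupport, paraphrased:
@Deprecated
public ReadContext init(Configuration configuration,
    Map<String, String> keyValueMetaData, MessageType fileSchema) {
  // default: steer implementors toward the InitContext overload
  throw new UnsupportedOperationException("Override init(InitContext)");
}

public ReadContext init(InitContext context) {
  // default: delegate to the legacy overload for old implementations
  return init(context.getConfiguration(),
      context.getMergedKeyValueMetaData(), context.getFileSchema());
}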
 
Example #7
Source File: TajoReadSupport.java    From tajo with Apache License 2.0
/**
 * Initializes the ReadSupport.
 *
 * @param context The InitContext.
 * @return A ReadContext that defines how to read the file.
 */
@Override
public ReadSupport.ReadContext init(InitContext context) {
  if (requestedSchema == null) {
    throw new RuntimeException("requestedSchema is null.");
  }
  MessageType requestedParquetSchema =
    new TajoSchemaConverter().convert(requestedSchema);
  LOG.debug("Reading data with projection:\n" + requestedParquetSchema);
  return new ReadContext(requestedParquetSchema);
}
 
Example #8
Source File: ParquetRecordReader.java    From flink with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration) {
	this.reader = reader;
	FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
	// real schema of parquet file
	this.fileSchema = parquetFileMetadata.getSchema();
	Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
	ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
		configuration, toSetMultiMap(fileMetadata), readSchema));

	this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
	this.recordMaterializer = readSupport.prepareForRead(
		configuration, fileMetadata, readSchema, readContext);
	this.numTotalRecords = reader.getRecordCount();
}
 
Example #9
Source File: TupleReadSupport.java    From parquet-mr with Apache License 2.0
@Override
public ReadContext init(InitContext initContext) {
  Schema pigSchema = getPigSchema(initContext.getConfiguration());
  RequiredFieldList requiredFields = getRequiredFields(initContext.getConfiguration());
  boolean columnIndexAccess = initContext.getConfiguration().getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);

  if (pigSchema == null) {
    return new ReadContext(initContext.getFileSchema());
  } else {
    // project the file schema according to the requested Pig schema
    MessageType parquetRequestedSchema =
        new PigSchemaConverter(columnIndexAccess).filter(initContext.getFileSchema(), pigSchema, requiredFields);
    return new ReadContext(parquetRequestedSchema);
  }
}
 
Example #10
Source File: ProtoReadSupport.java    From parquet-mr with Apache License 2.0
@Override
public ReadContext init(InitContext context) {
  String requestedProjectionString = context.getConfiguration().get(PB_REQUESTED_PROJECTION);

  if (requestedProjectionString != null && !requestedProjectionString.trim().isEmpty()) {
    MessageType requestedProjection = getSchemaForRead(context.getFileSchema(), requestedProjectionString);
    LOG.debug("Reading data with projection {}", requestedProjection);
    return new ReadContext(requestedProjection);
  } else {
    MessageType fileSchema = context.getFileSchema();
    LOG.debug("Reading data with schema {}", fileSchema);
    return new ReadContext(fileSchema);
  }
}
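For the projection branch above to run, the job client must have stored a Parquet schema string under PB_REQUESTED_PROJECTION before the read starts. A hedged usage sketch; setRequestedProjection is parquet-protobuf's helper for writing that key, and the schema string and column name are made up for illustration:

Configuration conf = new Configuration();
// Request only the "name" column; the value is Parquet schema DDL and
// must describe a subset of the file's schema.
ProtoReadSupport.setRequestedProjection(conf,
    "message Projection { optional binary name (UTF8); }");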
 
Example #11
Source File: InternalParquetRecordReader.java    From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
 
Example #12
Source File: ThriftReadSupport.java    From parquet-mr with Apache License 2.0
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
  final Configuration configuration = context.getConfiguration();
  final MessageType fileMessageType = context.getFileSchema();
  MessageType requestedProjection = fileMessageType;
  String partialSchemaString = configuration.get(ReadSupport.PARQUET_READ_SCHEMA);

  FieldProjectionFilter projectionFilter = getFieldProjectionFilter(configuration);

  if (partialSchemaString != null && projectionFilter != null) {
    throw new ThriftProjectionException(
        String.format("You cannot provide both a partial schema and a field projection filter. "
                + "Only one of (%s, %s, %s) should be set.",
            PARQUET_READ_SCHEMA, STRICT_THRIFT_COLUMN_FILTER_KEY, THRIFT_COLUMN_FILTER_KEY));
  }

  // set requestedProjection only when a projection is specified
  if (partialSchemaString != null) {
    requestedProjection = getSchemaForRead(fileMessageType, partialSchemaString);
  } else if (projectionFilter != null) {
    try {
      initThriftClassFromMultipleFiles(context.getKeyValueMetadata(), configuration);
      requestedProjection = getProjectedSchema(projectionFilter);
    } catch (ClassNotFoundException e) {
      throw new ThriftProjectionException("cannot find thriftClass from configuration", e);
    }
  }

  MessageType schemaForRead = getSchemaForRead(fileMessageType, requestedProjection);
  return new ReadContext(schemaForRead);
}
 
Example #13
Source File: SimpleReadSupport.java    From parquet-mr with Apache License 2.0
@Override
public ReadContext init(InitContext context) {
  return new ReadContext(context.getFileSchema());
}
 
Example #14
Source File: TestInputOutputFormat.java    From parquet-mr with Apache License 2.0
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
  Set<String> counts = context.getKeyValueMetadata().get("my.count");
  assertTrue("counts: " + counts, counts.size() > 0);
  return super.init(context);
}
 
Example #15
Source File: ParquetHdfsDataWriterTest.java    From incubator-gobblin with Apache License 2.0
@Override
public ReadContext init(InitContext context) {
  return new ReadContext(context.getFileSchema());
}
 
Example #16
Source File: RowReadSupport.java    From flink with Apache License 2.0
@Override
public ReadContext init(InitContext initContext) {
	checkNotNull(initContext, "initContext");
	returnTypeInfo = ParquetSchemaConverter.fromParquetType(initContext.getFileSchema());
	return new ReadContext(initContext.getFileSchema());
}
 
Example #17
Source File: ParquetReadSupport.java    From iceberg with Apache License 2.0
@Override
@SuppressWarnings("deprecation")
public ReadContext init(Configuration configuration, Map<String, String> keyValueMetaData, MessageType fileSchema) {
  // Columns are selected from the Parquet file by taking the read context's message type and
  // matching to the file's columns by full path, so this must select columns by using the path
  // in the file's schema.

  MessageType projection;
  if (ParquetSchemaUtil.hasIds(fileSchema)) {
    projection = ParquetSchemaUtil.pruneColumns(fileSchema, expectedSchema);
  } else if (nameMapping != null) {
    MessageType typeWithIds = ParquetSchemaUtil.applyNameMapping(fileSchema, nameMapping);
    projection = ParquetSchemaUtil.pruneColumns(typeWithIds, expectedSchema);
  } else {
    projection = ParquetSchemaUtil.pruneColumnsFallback(fileSchema, expectedSchema);
  }

  // override some known backward-compatibility options
  configuration.set("parquet.strict.typing", "false");
  configuration.set("parquet.avro.add-list-element-records", "false");
  configuration.set("parquet.avro.write-old-list-structure", "false");

  // set Avro schemas in case the reader is Avro
  AvroReadSupport.setRequestedProjection(configuration,
      AvroSchemaUtil.convert(expectedSchema, projection.getName()));
  org.apache.avro.Schema avroReadSchema = AvroSchemaUtil.buildAvroProjection(
      AvroSchemaUtil.convert(ParquetSchemaUtil.convert(projection), projection.getName()),
      expectedSchema, ImmutableMap.of());
  AvroReadSupport.setAvroReadSchema(configuration, ParquetAvro.parquetAvroSchema(avroReadSchema));

  // let the context set up read support metadata, but always use the correct projection
  ReadContext context = null;
  if (callInit) {
    try {
      context = wrapped.init(configuration, keyValueMetaData, projection);
    } catch (UnsupportedOperationException e) {
      // try the InitContext version
      context = wrapped.init(new InitContext(
          configuration, makeMultimap(keyValueMetaData), projection));
    }
  }

  return new ReadContext(projection,
      context != null ? context.getReadSupportMetadata() : ImmutableMap.of());
}
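Finally, a hedged end-to-end sketch of how a ReadSupport like the ones above plugs into Parquet's reader, reusing the illustrative PassThroughReadSupport from the top of this page (the file path is a placeholder):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;

Configuration conf = new Configuration();
// ParquetReader calls ReadSupport.init(InitContext) and prepareForRead
// internally before handing back materialized records.
try (ParquetReader<Group> reader =
    ParquetReader.builder(new PassThroughReadSupport(), new Path("/tmp/data.parquet"))
        .withConf(conf)
        .build()) {
  Group record;
  while ((record = reader.read()) != null) {
    System.out.println(record);
  }
}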
 