Java Code Examples for org.apache.parquet.hadoop.api.InitContext

The following examples show how to use org.apache.parquet.hadoop.api.InitContext. These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source Project: tajo   Source File: InternalParquetRecordReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Prepares this record reader to read the given row groups of one Parquet file.
 *
 * <p>The file schema is taken from the footer metadata <em>before</em> the read support
 * is initialized, so the {@code InitContext} handed to {@code readSupport.init} carries
 * the schema of the file being opened rather than whatever the {@code fileSchema} field
 * held before this call.
 *
 * @param parquetFileMetadata footer metadata of the file (schema, key/value metadata, created-by)
 * @param file path of the file to read
 * @param blocks row groups to read; their row counts are added to {@code total}
 * @param configuration configuration passed to the read support and to the file reader
 * @throws IOException declared for the underlying file reader set-up
 */
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // Resolve the file schema first: it is an input to the InitContext built below.
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;

  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);

  // Only the columns of the requested (projected) schema are handed to the file reader.
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);

  // NOTE(review): total is added to, not reset, so a second initialize() call on the same
  // instance would double count - confirm this method is only invoked once per reader.
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
 
Example 2
Source Project: parquet-mr   Source File: InternalParquetRecordReader.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Binds this record reader to an already opened Parquet file reader.
 *
 * <p>Reads the footer metadata, lets the read support decide on the requested schema,
 * installs that projection on the file reader and then captures the filtered record count.
 *
 * @param reader the open file reader to read from
 * @param configuration configuration passed to the read support and used for reader flags
 * @throws IOException declared for the underlying reader operations
 */
public void initialize(ParquetFileReader reader, Configuration configuration)
    throws IOException {
  this.reader = reader;

  // Footer-derived inputs for the read support.
  FileMetaData footerMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = footerMetadata.getSchema();
  Map<String, String> keyValueMetadata = footerMetadata.getKeyValueMetaData();

  // initialize a ReadContext for this file
  InitContext initContext =
      new InitContext(configuration, toSetMultiMap(keyValueMetadata), this.fileSchema);
  ReadSupport.ReadContext context = readSupport.init(initContext);

  this.columnIOFactory = new ColumnIOFactory(footerMetadata.getCreatedBy());
  this.requestedSchema = context.getRequestedSchema();
  this.columnCount = this.requestedSchema.getPaths().size();

  // The projection must be installed on the reader before any filtering runs
  // (e.g. getting the filtered record count), because projection impacts filtering.
  reader.setRequestedSchema(this.requestedSchema);

  this.recordConverter =
      readSupport.prepareForRead(configuration, keyValueMetadata, this.fileSchema, context);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter =
      new UnmaterializableRecordCounter(configuration, this.total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  LOG.info("RecordReader initialized will read a total of {} records.", this.total);
}
 
Example 3
Source Project: parquet-mr   Source File: ParquetInputFormat.java    License: Apache License 2.0 6 votes vote down vote up
/**
 * Plans input splits from file footers that have already been read.
 *
 * <p>The split size bounds come from {@code mapred.max.split.size} and
 * {@code mapred.min.split.size}; the minimum is never lower than the format's own minimum.
 *
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException if there is an error while reading
 * @deprecated split planning using file footers will be removed
 */
@Deprecated
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers) throws IOException {
  boolean strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);

  final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
  final long formatMinSplitSize = getFormatMinSplitSize();
  final long configuredMinSplitSize = configuration.getLong("mapred.min.split.size", 0L);
  final long minSplitSize = Math.max(formatMinSplitSize, configuredMinSplitSize);

  boolean boundsAreValid = maxSplitSize >= 0 && minSplitSize >= 0;
  if (!boundsAreValid) {
    throw new ParquetDecodingException(
        "maxSplitSize or minSplitSize should not be negative: maxSplitSize = " + maxSplitSize
            + "; minSplitSize = " + minSplitSize);
  }

  // Merge all footers into one global view and let the read support pick the projection.
  GlobalMetaData mergedMetaData = ParquetFileWriter.getGlobalMetaData(footers, strictTypeChecking);
  InitContext initContext = new InitContext(
      configuration, mergedMetaData.getKeyValueMetaData(), mergedMetaData.getSchema());
  ReadContext readContext = getReadSupport(configuration).init(initContext);

  ClientSideMetadataSplitStrategy strategy = new ClientSideMetadataSplitStrategy();
  return strategy.getSplits(configuration, footers, maxSplitSize, minSplitSize, readContext);
}
 
Example 4
/**
 * Builds the read context from the field list stored in the job configuration.
 *
 * <p>The serialized field list is read from {@code ParquetConverter.PARQUET_SCHEMA_CONF_KEY},
 * a converter is created for it, and the requested schema is assembled from the file schema's
 * own types for exactly those fields, in the order the field list declares them.
 *
 * @param context the init context providing the configuration and the file schema
 * @return a read context whose requested schema contains only the declared fields
 * @throws RuntimeException if the configuration key is unset or the field list is empty
 */
@Override
public ReadContext init( InitContext context ) {
  String serializedFields = context.getConfiguration().get( ParquetConverter.PARQUET_SCHEMA_CONF_KEY );
  if ( serializedFields == null ) {
    throw new RuntimeException( "Schema not defined in the PentahoParquetSchema key" );
  }

  ParquetInputFieldList requestedFields = ParquetInputFieldList.unmarshall( serializedFields );
  converter = new ParquetConverter( requestedFields.getFields() );

  // pick, from the file's schema, the type of every requested field
  MessageType fileSchema = context.getFileSchema();
  List<Type> projectedTypes = new ArrayList<>();
  for ( IParquetInputField requested : requestedFields ) {
    int position = fileSchema.getFieldIndex( requested.getFormatFieldName() );
    projectedTypes.add( fileSchema.getFields().get( position ) );
  }
  if ( projectedTypes.isEmpty() ) {
    throw new RuntimeException( "Fields should be declared" );
  }

  MessageType projectedSchema = new MessageType( fileSchema.getName(), projectedTypes );
  return new ReadContext( projectedSchema, new HashMap<>() );
}
 
Example 5
Source Project: flink   Source File: ParquetRecordReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Attaches this record reader to an open Parquet file reader.
 *
 * <p>The schema stored in the file is remembered in {@code fileSchema}, while the read
 * support is initialized and prepared with {@code readSchema}.
 *
 * @param reader the open file reader to read from
 * @param configuration configuration passed to the read support
 */
public void initialize(ParquetFileReader reader, Configuration configuration) {
	this.reader = reader;

	FileMetaData footerMetadata = reader.getFooter().getFileMetaData();
	// real schema of parquet file
	this.fileSchema = footerMetadata.getSchema();

	Map<String, String> keyValueMetadata = footerMetadata.getKeyValueMetaData();
	InitContext initContext =
		new InitContext(configuration, toSetMultiMap(keyValueMetadata), readSchema);
	ReadSupport.ReadContext readContext = readSupport.init(initContext);

	this.columnIOFactory = new ColumnIOFactory(footerMetadata.getCreatedBy());
	this.recordMaterializer =
		readSupport.prepareForRead(configuration, keyValueMetadata, readSchema, readContext);
	this.numTotalRecords = reader.getRecordCount();
}
 
Example 6
Source Project: iceberg   Source File: ParquetReadSupport.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Computes the column projection for a file and configures Avro-based reading accordingly.
 *
 * <p>The projection is pruned from the file schema (by ids when the file schema has them,
 * otherwise through the fallback pruning). Several configuration keys are overwritten on the
 * passed-in configuration, and the wrapped read support is optionally initialized so that its
 * read support metadata can be returned alongside the projection.
 *
 * @param configuration configuration to update and pass to the wrapped read support
 * @param keyValueMetaData key/value metadata of the file
 * @param fileSchema schema stored in the file
 * @return a read context that always uses the computed projection
 */
@Override
@SuppressWarnings("deprecation")
public ReadContext init(Configuration configuration, Map<String, String> keyValueMetaData, MessageType fileSchema) {
  // Columns are selected from the Parquet file by taking the read context's message type and
  // matching to the file's columns by full path, so this must select columns by using the path
  // in the file's schema.
  MessageType projection;
  if (hasIds(fileSchema)) {
    projection = pruneColumns(fileSchema, expectedSchema);
  } else {
    projection = pruneColumnsFallback(fileSchema, expectedSchema);
  }

  // override some known backward-compatibility options
  configuration.set("parquet.strict.typing", "false");
  configuration.set("parquet.avro.add-list-element-records", "false");
  configuration.set("parquet.avro.write-old-list-structure", "false");

  // set Avro schemas in case the reader is Avro
  org.apache.avro.Schema requestedAvroProjection =
      AvroSchemaUtil.convert(expectedSchema, projection.getName());
  AvroReadSupport.setRequestedProjection(configuration, requestedAvroProjection);

  org.apache.avro.Schema avroReadSchema = AvroSchemaUtil.buildAvroProjection(
      AvroSchemaUtil.convert(ParquetSchemaUtil.convert(projection), projection.getName()),
      expectedSchema, ImmutableMap.of());
  AvroReadSupport.setAvroReadSchema(configuration, ParquetAvro.parquetAvroSchema(avroReadSchema));

  // let the context set up read support metadata, but always use the correct projection
  ReadContext wrappedContext = null;
  if (callInit) {
    try {
      wrappedContext = wrapped.init(configuration, keyValueMetaData, projection);
    } catch (UnsupportedOperationException unsupported) {
      // try the InitContext version
      InitContext initContext =
          new InitContext(configuration, makeMultimap(keyValueMetaData), projection);
      wrappedContext = wrapped.init(initContext);
    }
  }

  Map<String, String> readSupportMetadata;
  if (wrappedContext != null) {
    readSupportMetadata = wrappedContext.getReadSupportMetadata();
  } else {
    readSupportMetadata = ImmutableMap.of();
  }
  return new ReadContext(projection, readSupportMetadata);
}
 
Example 7
Source Project: tajo   Source File: TajoReadSupport.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds the read context from the requested Tajo schema held by this read support.
 *
 * @param context The InitContext; this implementation does not read anything from it.
 * @return A ReadContext whose requested schema is the Parquet conversion of the requested schema.
 * @throws RuntimeException if no requested schema has been set
 */
@Override
public ReadSupport.ReadContext init(InitContext context) {
  if (requestedSchema != null) {
    TajoSchemaConverter schemaConverter = new TajoSchemaConverter();
    MessageType projection = schemaConverter.convert(requestedSchema);
    LOG.debug("Reading data with projection:\n" + projection);
    return new ReadContext(projection);
  }
  throw new RuntimeException("requestedSchema is null.");
}
 
Example 8
Source Project: flink   Source File: ParquetRecordReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Wires this record reader to an open Parquet file reader.
 *
 * <p>Keeps the file's stored schema in {@code fileSchema}; the read support itself is
 * initialized and prepared against {@code readSchema}.
 *
 * @param reader the open file reader to read from
 * @param configuration configuration passed to the read support
 */
public void initialize(ParquetFileReader reader, Configuration configuration) {
	this.reader = reader;

	FileMetaData footer = reader.getFooter().getFileMetaData();
	// real schema of parquet file
	this.fileSchema = footer.getSchema();

	Map<String, String> footerKeyValues = footer.getKeyValueMetaData();
	InitContext initContext =
		new InitContext(configuration, toSetMultiMap(footerKeyValues), readSchema);
	ReadSupport.ReadContext readContext = readSupport.init(initContext);

	this.columnIOFactory = new ColumnIOFactory(footer.getCreatedBy());
	this.recordMaterializer =
		readSupport.prepareForRead(configuration, footerKeyValues, readSchema, readContext);
	this.numTotalRecords = reader.getRecordCount();
}
 
Example 9
Source Project: parquet-mr   Source File: InternalParquetRecordReader.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Binds this record reader to an already opened Parquet file reader using read options.
 *
 * <p>A Hadoop {@code Configuration} is derived from the options: the one returned by
 * {@code HadoopReadOptions.getConf()} when the options are Hadoop-backed, otherwise a new
 * one. Every property of the options is then set on that configuration before it is handed
 * to the read support.
 *
 * @param reader the open file reader to read from
 * @param options read options supplying properties and reader flags
 */
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // Only build a fresh Configuration when the options do not already carry one; the
  // previous unconditional "new Configuration()" was discarded for Hadoop-backed options.
  final Configuration conf;
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  } else {
    conf = new Configuration();
  }

  // copy custom configuration to the Configuration passed to the ReadSupport
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
 
Example 10
Source Project: parquet-mr   Source File: ThriftReadSupport.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Decides which schema to request when reading Thrift-backed Parquet data.
 *
 * <p>The projection can come from a partial schema string ({@code PARQUET_READ_SCHEMA}) or
 * from a field projection filter, but not both. When neither is configured the file schema
 * itself is used as the requested projection.
 *
 * @param context the init context providing configuration, file schema and key/value metadata
 * @return a read context built from the schema resolved for reading
 * @throws ThriftProjectionException if both a partial schema and a projection filter are
 *     configured, or if the Thrift class cannot be found while applying a projection filter
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
  final Configuration configuration = context.getConfiguration();
  final MessageType fileMessageType = context.getFileSchema();
  MessageType requestedProjection = fileMessageType;
  String partialSchemaString = configuration.get(ReadSupport.PARQUET_READ_SCHEMA);

  FieldProjectionFilter projectionFilter = getFieldProjectionFilter(configuration);

  if (partialSchemaString != null && projectionFilter != null) {
    // The two literals are concatenated into one sentence pair, so the first must end
    // with a space; otherwise the message reads "...filter.Only one of...".
    throw new ThriftProjectionException(
        String.format("You cannot provide both a partial schema and field projection filter. "
                + "Only one of (%s, %s, %s) should be set.",
            PARQUET_READ_SCHEMA, STRICT_THRIFT_COLUMN_FILTER_KEY, THRIFT_COLUMN_FILTER_KEY));
  }

  //set requestedProjections only when it's specified
  if (partialSchemaString != null) {
    requestedProjection = getSchemaForRead(fileMessageType, partialSchemaString);
  } else if (projectionFilter != null) {
    try {
      // the Thrift class has to be resolved before the filter can be applied to it
      initThriftClassFromMultipleFiles(context.getKeyValueMetadata(), configuration);
      requestedProjection =  getProjectedSchema(projectionFilter);
    } catch (ClassNotFoundException e) {
      throw new ThriftProjectionException("can not find thriftClass from configuration", e);
    }
  }

  MessageType schemaForRead = getSchemaForRead(fileMessageType, requestedProjection);
  return new ReadContext(schemaForRead);
}
 
Example 11
Source Project: parquet-mr   Source File: ProtoReadSupport.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Chooses the schema to read: the configured projection when one is set, else the file schema.
 *
 * <p>The projection string is read from {@code PB_REQUESTED_PROJECTION}; a missing value or
 * one that is empty after trimming means no projection.
 *
 * @param context the init context providing the configuration and the file schema
 * @return a read context for either the projected schema or the full file schema
 */
@Override
public ReadContext init(InitContext context) {
  String projectionText = context.getConfiguration().get(PB_REQUESTED_PROJECTION);

  boolean noProjectionRequested = projectionText == null || projectionText.trim().isEmpty();
  if (noProjectionRequested) {
    MessageType fileSchema = context.getFileSchema();
    LOG.debug("Reading data with schema {}", fileSchema);
    return new ReadContext(fileSchema);
  }

  MessageType projectedSchema = getSchemaForRead(context.getFileSchema(), projectionText);
  LOG.debug("Reading data with projection {}", projectedSchema);
  return new ReadContext(projectedSchema);
}
 
Example 12
Source Project: parquet-mr   Source File: TupleReadSupport.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Builds the read context for Pig: the file schema as-is when no Pig schema is configured,
 * otherwise the file schema filtered down to the requested Pig schema and required fields.
 *
 * @param initContext the init context providing the configuration and the file schema
 * @return a read context for the full or the projected schema
 */
@Override
public ReadContext init(InitContext initContext) {
  Schema requestedPigSchema = getPigSchema(initContext.getConfiguration());
  RequiredFieldList requiredFieldList = getRequiredFields(initContext.getConfiguration());
  boolean useColumnIndexAccess =
      initContext.getConfiguration().getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);

  MessageType fileSchema = initContext.getFileSchema();
  if (requestedPigSchema == null) {
    return new ReadContext(fileSchema);
  }

  // project the file schema according to the requested Pig schema
  PigSchemaConverter schemaConverter = new PigSchemaConverter(useColumnIndexAccess);
  MessageType projectedSchema =
      schemaConverter.filter(fileSchema, requestedPigSchema, requiredFieldList);
  return new ReadContext(projectedSchema);
}
 
Example 13
Source Project: parquet-mr   Source File: TestTupleRecordConsumer.java    License: Apache License 2.0 5 votes vote down vote up
/**
 * Creates a record materializer for the given Pig schema via a fresh {@code TupleReadSupport}.
 *
 * <p>The Pig metadata is passed to {@code init} as single-value sets per key and to
 * {@code prepareForRead} in its original string-to-string form.
 *
 * @param pigSchemaString the Pig schema, also stored under {@code PARQUET_PIG_SCHEMA}
 * @return the materializer returned by {@code prepareForRead}
 * @throws ParserException declared for the Pig schema handling
 */
private RecordMaterializer<Tuple> newPigRecordConsumer(String pigSchemaString) throws ParserException {
  TupleReadSupport readSupport = new TupleReadSupport();
  final Configuration conf = new Configuration(false);
  MessageType parquetSchema = getMessageType(pigSchemaString);
  final Map<String, String> pigMetaData = pigMetaData(pigSchemaString);

  // InitContext expects a set of values per key; wrap each single value in its own set.
  Map<String, Set<String>> keyValueSets = new HashMap<String, Set<String>>();
  for (Entry<String, String> metaEntry : pigMetaData.entrySet()) {
    Set<String> singleValue = new HashSet<String>();
    singleValue.add(metaEntry.getValue());
    keyValueSets.put(metaEntry.getKey(), singleValue);
  }

  conf.set(PARQUET_PIG_SCHEMA, pigSchemaString);
  InitContext initContext = new InitContext(conf, keyValueSets, parquetSchema);
  final ReadContext readContext = readSupport.init(initContext);
  return readSupport.prepareForRead(conf, pigMetaData, parquetSchema, readContext);
}
 
Example 14
Source Project: flink   Source File: RowReadSupport.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Initializes this read support from the schema stored in the file.
 *
 * <p>Records the type information converted from the file schema in {@code returnTypeInfo}
 * and requests the file schema unchanged (no projection is applied here).
 *
 * @param initContext the init context providing the file schema; checked with
 *     {@code checkNotNull} (presumably rejecting {@code null} - confirm against that helper)
 * @return a read context whose requested schema is the file schema
 */
@Override
public ReadContext init(InitContext initContext) {
	checkNotNull(initContext, "initContext");
	// derive the returned type information from the schema stored in the file
	returnTypeInfo = ParquetSchemaConverter.fromParquetType(initContext.getFileSchema());
	// request the file schema as-is
	return new ReadContext(initContext.getFileSchema());
}
 
Example 15
Source Project: iceberg   Source File: ParquetReadSupport.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Computes the column projection for a file and configures Avro-based reading accordingly.
 *
 * <p>The projection is pruned by ids when the file schema has them, or after applying the
 * name mapping when one is set; otherwise the fallback pruning is used. Several configuration
 * keys are overwritten on the passed-in configuration, and the wrapped read support is
 * optionally initialized so that its read support metadata can be returned.
 *
 * @param configuration configuration to update and pass to the wrapped read support
 * @param keyValueMetaData key/value metadata of the file
 * @param fileSchema schema stored in the file
 * @return a read context that always uses the computed projection
 */
@Override
@SuppressWarnings("deprecation")
public ReadContext init(Configuration configuration, Map<String, String> keyValueMetaData, MessageType fileSchema) {
  // Columns are selected from the Parquet file by taking the read context's message type and
  // matching to the file's columns by full path, so this must select columns by using the path
  // in the file's schema.
  MessageType projection;
  boolean fileHasIds = ParquetSchemaUtil.hasIds(fileSchema);
  if (!fileHasIds && nameMapping == null) {
    // no ids available from the file or from a name mapping
    projection = ParquetSchemaUtil.pruneColumnsFallback(fileSchema, expectedSchema);
  } else {
    MessageType typeWithIds = fileHasIds
        ? fileSchema
        : ParquetSchemaUtil.applyNameMapping(fileSchema, nameMapping);
    projection = ParquetSchemaUtil.pruneColumns(typeWithIds, expectedSchema);
  }

  // override some known backward-compatibility options
  configuration.set("parquet.strict.typing", "false");
  configuration.set("parquet.avro.add-list-element-records", "false");
  configuration.set("parquet.avro.write-old-list-structure", "false");

  // set Avro schemas in case the reader is Avro
  org.apache.avro.Schema requestedAvroProjection =
      AvroSchemaUtil.convert(expectedSchema, projection.getName());
  AvroReadSupport.setRequestedProjection(configuration, requestedAvroProjection);

  org.apache.avro.Schema avroReadSchema = AvroSchemaUtil.buildAvroProjection(
      AvroSchemaUtil.convert(ParquetSchemaUtil.convert(projection), projection.getName()),
      expectedSchema, ImmutableMap.of());
  AvroReadSupport.setAvroReadSchema(configuration, ParquetAvro.parquetAvroSchema(avroReadSchema));

  // let the context set up read support metadata, but always use the correct projection
  ReadContext wrappedContext = null;
  if (callInit) {
    try {
      wrappedContext = wrapped.init(configuration, keyValueMetaData, projection);
    } catch (UnsupportedOperationException unsupported) {
      // try the InitContext version
      InitContext initContext =
          new InitContext(configuration, makeMultimap(keyValueMetaData), projection);
      wrappedContext = wrapped.init(initContext);
    }
  }

  Map<String, String> readSupportMetadata;
  if (wrappedContext != null) {
    readSupportMetadata = wrappedContext.getReadSupportMetadata();
  } else {
    readSupportMetadata = ImmutableMap.of();
  }
  return new ReadContext(projection, readSupportMetadata);
}
 
Example 16
Source Project: flink   Source File: RowReadSupport.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Initializes this read support from the schema stored in the file.
 *
 * <p>Stores the type information converted from the file schema in {@code returnTypeInfo}
 * and returns a read context that requests the file schema unchanged.
 *
 * @param initContext the init context providing the file schema; passed through
 *     {@code checkNotNull} first (presumably failing on {@code null} - confirm)
 * @return a read context whose requested schema is the file schema
 */
@Override
public ReadContext init(InitContext initContext) {
	checkNotNull(initContext, "initContext");
	// convert the file's Parquet schema into the type information exposed by this class
	returnTypeInfo = ParquetSchemaConverter.fromParquetType(initContext.getFileSchema());
	// no projection: read with the schema found in the file
	return new ReadContext(initContext.getFileSchema());
}
 
Example 17
/**
 * Requests the file's own schema for reading, without any projection.
 *
 * @param context the init context providing the file schema
 * @return a read context whose requested schema is the file schema
 */
@Override
public ReadContext init(InitContext context) {
  return new ReadContext(context.getFileSchema());
}
 
Example 18
Source Project: parquet-mr   Source File: SimpleReadSupport.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Reads every column: the requested schema is simply the schema found in the file.
 *
 * @param context the init context providing the file schema
 * @return a read context whose requested schema is the file schema
 */
@Override
public ReadContext init(InitContext context) {
  return new ReadContext(context.getFileSchema());
}
 
Example 19
Source Project: parquet-mr   Source File: TestInputOutputFormat.java    License: Apache License 2.0 4 votes vote down vote up
/**
 * Asserts that the file metadata carries at least one {@code my.count} value, then defers
 * to the superclass initialization.
 *
 * @param context the init context whose key/value metadata is checked
 * @return the read context produced by {@code super.init(context)}
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
  Set<String> myCountValues = context.getKeyValueMetadata().get("my.count");
  assertTrue("counts: " + myCountValues, !myCountValues.isEmpty());
  return super.init(context);
}