org.apache.parquet.hadoop.metadata.FileMetaData#getKeyValueMetaData

Source File: InternalParquetRecordReader.java From tajo with Apache License 2.0

6 votes

/**
 * Prepares this reader for the given row groups of a Parquet file.
 *
 * <p>The file schema is read from the footer metadata before anything else so that
 * the {@code InitContext} handed to {@code readSupport.init} and the later
 * {@code prepareForRead} call both receive the schema of the file being opened.
 *
 * @param parquetFileMetadata footer metadata supplying the schema, the key/value
 *                            metadata and the "created by" string
 * @param file                path of the Parquet file
 * @param blocks              row groups to read; their row counts are added to {@code total}
 * @param configuration       configuration passed to the read support and the file reader
 * @throws IOException if one of the calls made during initialization fails with an I/O error
 */
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // Assign the schema first: the InitContext built below reads this field.
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;

  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);

  // Only the columns of the requested (projected) schema are handed to the file reader.
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);

  // Sum the row counts of all row groups that will be read.
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}

Source File: MetadataUtils.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Prints file-level metadata: the creator string, every key/value metadata entry
 * (if the map is present), the schema name, and then the schema details.
 *
 * @param out  writer that receives the formatted output
 * @param meta file metadata to describe
 */
public static void showDetails(PrettyPrintWriter out, FileMetaData meta) {
  out.format("creator: %s%n", meta.getCreatedBy());

  // Fetch the map once and iterate the same instance that was null-checked.
  Map<String,String> extra = meta.getKeyValueMetaData();
  if (extra != null) {
    for (Map.Entry<String,String> entry : extra.entrySet()) {
      out.print("extra: ");
      out.incrementTabLevel();
      out.format("%s = %s%n", entry.getKey(), entry.getValue());
      out.decrementTabLevel();
    }
  }

  out.println();
  out.format("file schema: %s%n", meta.getSchema().getName());
  out.rule('-');
  showDetails(out, meta.getSchema());
}

Source File: MetadataUtils.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Prints file-level metadata: the creator string, every key/value metadata entry
 * (if the map is present), the schema name, and then the schema details.
 *
 * @param out               writer that receives the formatted output
 * @param meta              file metadata to describe
 * @param showOriginalTypes forwarded unchanged to the schema-level {@code showDetails}
 */
static void showDetails(PrettyPrintWriter out, FileMetaData meta, boolean showOriginalTypes) {
  out.format("creator: %s%n", meta.getCreatedBy());

  // Fetch the map once and iterate the same instance that was null-checked.
  Map<String,String> extra = meta.getKeyValueMetaData();
  if (extra != null) {
    for (Map.Entry<String,String> entry : extra.entrySet()) {
      out.print("extra: ");
      out.incrementTabLevel();
      out.format("%s = %s%n", entry.getKey(), entry.getValue());
      out.decrementTabLevel();
    }
  }

  out.println();
  out.format("file schema: %s%n", meta.getSchema().getName());
  out.rule('-');
  showDetails(out, meta.getSchema(), showOriginalTypes);
}

Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Binds this record reader to an already opened file reader.
 *
 * <p>Reads the schema and key/value metadata from the footer, initializes the read
 * support, pushes the requested schema down to the file reader and records the
 * filtered record count as the total to read.
 *
 * @param reader        file reader whose footer describes the file
 * @param configuration configuration passed to the read support and queried for the
 *                      strict-type-checking and record-filtering flags
 * @throws IOException declared for the calls made during initialization
 */
public void initialize(ParquetFileReader reader, Configuration configuration)
    throws IOException {
  this.reader = reader;

  // Footer contents: file schema and user key/value metadata.
  FileMetaData footerMeta = reader.getFooter().getFileMetaData();
  this.fileSchema = footerMeta.getSchema();
  Map<String, String> keyValues = footerMeta.getKeyValueMetaData();

  // initialize a ReadContext for this file
  InitContext initContext = new InitContext(configuration, toSetMultiMap(keyValues), fileSchema);
  ReadSupport.ReadContext context = readSupport.init(initContext);
  this.columnIOFactory = new ColumnIOFactory(footerMeta.getCreatedBy());
  this.requestedSchema = context.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();

  // The projection schema has to be set before any filtering runs (e.g. getting the
  // filtered record count) because projection impacts filtering.
  reader.setRequestedSchema(requestedSchema);

  this.recordConverter =
      readSupport.prepareForRead(configuration, keyValues, fileSchema, context);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}

Source File: ParquetRecordReader.java From flink with Apache License 2.0

5 votes

/**
 * Binds this record reader to an opened Parquet file reader.
 *
 * <p>The file's own schema is stored in {@code fileSchema}, while the read support is
 * initialized and prepared against {@code readSchema}.
 *
 * @param reader        file reader whose footer supplies the metadata and record count
 * @param configuration configuration passed to the read support
 */
public void initialize(ParquetFileReader reader, Configuration configuration) {
	this.reader = reader;

	FileMetaData footerMetadata = reader.getFooter().getFileMetaData();
	// schema actually stored in the parquet file
	this.fileSchema = footerMetadata.getSchema();
	Map<String, String> keyValueMetadata = footerMetadata.getKeyValueMetaData();

	InitContext initContext =
		new InitContext(configuration, toSetMultiMap(keyValueMetadata), readSchema);
	ReadSupport.ReadContext readContext = readSupport.init(initContext);

	this.columnIOFactory = new ColumnIOFactory(footerMetadata.getCreatedBy());
	this.recordMaterializer =
		readSupport.prepareForRead(configuration, keyValueMetadata, readSchema, readContext);
	this.numTotalRecords = reader.getRecordCount();
}

Source File: ParquetRecordReader.java From flink with Apache License 2.0

5 votes

/**
 * Attaches this record reader to the given Parquet file reader.
 *
 * <p>Stores the schema found in the file footer, initializes the read support with
 * {@code readSchema}, and records the reader's record count as the total.
 *
 * @param reader        file reader providing the footer and the record count
 * @param configuration configuration handed to the read support
 */
public void initialize(ParquetFileReader reader, Configuration configuration) {
	this.reader = reader;

	FileMetaData footer = reader.getFooter().getFileMetaData();
	// real schema of parquet file
	this.fileSchema = footer.getSchema();
	Map<String, String> extraMetadata = footer.getKeyValueMetaData();

	InitContext context =
		new InitContext(configuration, toSetMultiMap(extraMetadata), readSchema);
	ReadSupport.ReadContext readContext = readSupport.init(context);

	this.columnIOFactory = new ColumnIOFactory(footer.getCreatedBy());
	this.recordMaterializer =
		readSupport.prepareForRead(configuration, extraMetadata, readSchema, readContext);
	this.numTotalRecords = reader.getRecordCount();
}

Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Binds this record reader to an opened file reader, driven by {@link ParquetReadOptions}.
 *
 * <p>A Hadoop {@code Configuration} is obtained for the read support: the one carried by
 * {@code HadoopReadOptions} when available, otherwise a new one. Every property exposed
 * by the options is then set on it.
 *
 * @param reader  file reader whose footer describes the file
 * @param options read options supplying properties, strict type checking and the
 *                record-filter flag
 */
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // Only build a fresh Configuration when the options do not already carry one;
  // this avoids allocating an instance that is immediately discarded.
  Configuration conf;
  if (options instanceof HadoopReadOptions) {
    // NOTE(review): the properties below are set on whatever getConf() returns,
    // presumably the caller's own Configuration; confirm this is intended.
    conf = ((HadoopReadOptions) options).getConf();
  } else {
    conf = new Configuration();
  }

  // copy custom configuration to the Configuration passed to the ReadSupport
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}

Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Checks one file for corrupt binary column statistics.
 *
 * <p>Validation only runs when {@code CorruptStatistics.shouldIgnoreStatistics} reports
 * that statistics written by the file's creator would be ignored for BINARY columns.
 * In that case the file is re-read with substitute metadata whose "created by" value
 * is {@code Version.FULL_VERSION}, and every row group is validated.
 *
 * @param file location of the file to check
 * @return the message of the {@code BadStatsException} raised during validation,
 *         or {@code null} when no validation was needed or none failed
 * @throws IOException if reading the footer or the row groups fails, or closing the reader fails
 */
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), path, ParquetMetadataConverter.NO_FILTER);

  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(
        meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);

    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(
        meta.getSchema().getColumns(),
        new Predicate<ColumnDescriptor>() {
          @Override
          public boolean apply(@Nullable ColumnDescriptor input) {
            return input != null && input.getType() == BINARY;
          }
        }));

    // now check to see if the data is actually corrupt; try-with-resources closes
    // the reader on success, on BadStatsException and on any other failure
    try (ParquetFileReader reader = new ParquetFileReader(getConf(),
        fakeMeta, path, footer.getBlocks(), columns)) {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null;
           pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }

  return null;
}

Java Code Examples for org.apache.parquet.hadoop.metadata.FileMetaData#getKeyValueMetaData()