Java Code Examples for org.apache.parquet.hadoop.metadata.FileMetaData#getSchema()

The following examples show how to use org.apache.parquet.hadoop.metadata.FileMetaData#getSchema() . You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ParquetHdfsFileSink.java    From components with Apache License 2.0 6 votes vote down vote up
@Override
protected void mergeOutput(FileSystem fs, String sourceFolder, String targetFile) throws IOException {
    FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
    List<Path> sourceFiles = new ArrayList<>();
    for (FileStatus sourceStatus : sourceStatuses) {
        sourceFiles.add(sourceStatus.getPath());
    }
    FileMetaData mergedMeta = ParquetFileWriter.mergeMetadataFiles(sourceFiles, fs.getConf()).getFileMetaData();
    ParquetFileWriter writer = new ParquetFileWriter(fs.getConf(), mergedMeta.getSchema(), new Path(targetFile),
            ParquetFileWriter.Mode.CREATE);
    writer.start();
    for (Path input : sourceFiles) {
        writer.appendFile(fs.getConf(), input);
    }
    writer.end(mergedMeta.getKeyValueMetaData());
}
 
Example 2
Source File: InternalParquetRecordReader.java    From tajo with Apache License 2.0 6 votes vote down vote up
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
 
Example 3
Source File: InternalParquetRecordReader.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
public void initialize(ParquetFileReader reader, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
 
Example 4
Source File: ParquetRecordReader.java    From flink with Apache License 2.0 5 votes vote down vote up
public void initialize(ParquetFileReader reader, Configuration configuration) {
	this.reader = reader;
	FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
	// real schema of parquet file
	this.fileSchema = parquetFileMetadata.getSchema();
	Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
	ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
		configuration, toSetMultiMap(fileMetadata), readSchema));

	this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
	this.recordMaterializer = readSupport.prepareForRead(
		configuration, fileMetadata, readSchema, readContext);
	this.numTotalRecords = reader.getRecordCount();
}
 
Example 5
Source File: ParquetFileAccessor.java    From pxf with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the original schema from the parquet file.
 *
 * @param parquetFile the path to the parquet file
 * @param fileSplit   the file split we are accessing
 * @return the original schema from the parquet file
 * @throws IOException when there's an IOException while reading the schema
 */
private MessageType getSchema(Path parquetFile, FileSplit fileSplit) throws IOException {

    final long then = System.nanoTime();
    ParquetMetadataConverter.MetadataFilter filter = ParquetMetadataConverter.range(
            fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength());
    ParquetReadOptions parquetReadOptions = HadoopReadOptions
            .builder(configuration)
            .withMetadataFilter(filter)
            .build();
    HadoopInputFile inputFile = HadoopInputFile.fromPath(parquetFile, configuration);
    try (ParquetFileReader parquetFileReader =
                 ParquetFileReader.open(inputFile, parquetReadOptions)) {
        FileMetaData metadata = parquetFileReader.getFileMetaData();
        if (LOG.isDebugEnabled()) {
            LOG.debug("{}-{}: Reading file {} with {} records in {} RowGroups",
                    context.getTransactionId(), context.getSegmentId(),
                    parquetFile.getName(), parquetFileReader.getRecordCount(),
                    parquetFileReader.getRowGroups().size());
        }
        final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - then);
        LOG.debug("{}-{}: Read schema in {} ms", context.getTransactionId(),
                context.getSegmentId(), millis);
        return metadata.getSchema();
    } catch (Exception e) {
        throw new IOException(e);
    }
}
 
Example 6
Source File: ParquetRecordReader.java    From flink with Apache License 2.0 5 votes vote down vote up
public void initialize(ParquetFileReader reader, Configuration configuration) {
	this.reader = reader;
	FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
	// real schema of parquet file
	this.fileSchema = parquetFileMetadata.getSchema();
	Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
	ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
		configuration, toSetMultiMap(fileMetadata), readSchema));

	this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
	this.recordMaterializer = readSupport.prepareForRead(
		configuration, fileMetadata, readSchema, readContext);
	this.numTotalRecords = reader.getRecordCount();
}
 
Example 7
Source File: PruneColumnsCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
@Override
public void execute(CommandLine options) throws Exception {
  List<String> args = options.getArgList();
  Path inputFile = new Path(args.get(0));
  Path outputFile = new Path(args.get(1));
  List<String> cols = args.subList(2, args.size());

  Set<ColumnPath> prunePaths = convertToColumnPaths(cols);

  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, inputFile, ParquetMetadataConverter.NO_FILTER);
  FileMetaData metaData = pmd.getFileMetaData();
  MessageType schema = metaData.getSchema();
  List<String> paths = new ArrayList<>();
  getPaths(schema, paths, null);

  for (String col : cols) {
    if (!paths.contains(col)) {
      LOG.warn("Input column name {} doesn't show up in the schema of file {}", col, inputFile.getName());
    }
  }

  ParquetFileWriter writer = new ParquetFileWriter(conf,
    pruneColumnsInSchema(schema, prunePaths), outputFile, ParquetFileWriter.Mode.CREATE);

  writer.start();
  writer.appendFile(HadoopInputFile.fromPath(inputFile, conf));
  writer.end(metaData.getKeyValueMetaData());
}
 
Example 8
Source File: MergeCommand.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
@Override
public void execute(CommandLine options) throws Exception {
  // Prepare arguments
  List<String> args = options.getArgList();
  List<Path> inputFiles = getInputFiles(args.subList(0, args.size() - 1));
  Path outputFile = new Path(args.get(args.size() - 1));

  // Merge schema and extraMeta
  FileMetaData mergedMeta = mergedMetadata(inputFiles);
  PrintWriter out = new PrintWriter(Main.out, true);

  // Merge data
  ParquetFileWriter writer = new ParquetFileWriter(conf,
          mergedMeta.getSchema(), outputFile, ParquetFileWriter.Mode.CREATE);
  writer.start();
  boolean tooSmallFilesMerged = false;
  for (Path input: inputFiles) {
    if (input.getFileSystem(conf).getFileStatus(input).getLen() < TOO_SMALL_FILE_THRESHOLD) {
      out.format("Warning: file %s is too small, length: %d\n",
        input,
        input.getFileSystem(conf).getFileStatus(input).getLen());
      tooSmallFilesMerged = true;
    }

    writer.appendFile(HadoopInputFile.fromPath(input, conf));
  }

  if (tooSmallFilesMerged) {
    out.println("Warning: you merged too small files. " +
      "Although the size of the merged file is bigger, it STILL contains small row groups, thus you don't have the advantage of big row groups, " +
      "which usually leads to bad query performance!");
  }
  writer.end(mergedMeta.getKeyValueMetaData());
}
 
Example 9
Source File: InternalParquetRecordReader.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
 
Example 10
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
static GlobalMetaData mergeInto(
    FileMetaData toMerge,
    GlobalMetaData mergedMetadata,
    boolean strict) {
  MessageType schema = null;
  Map<String, Set<String>> newKeyValues = new HashMap<String, Set<String>>();
  Set<String> createdBy = new HashSet<String>();
  if (mergedMetadata != null) {
    schema = mergedMetadata.getSchema();
    newKeyValues.putAll(mergedMetadata.getKeyValueMetaData());
    createdBy.addAll(mergedMetadata.getCreatedBy());
  }
  if ((schema == null && toMerge.getSchema() != null)
      || (schema != null && !schema.equals(toMerge.getSchema()))) {
    schema = mergeInto(toMerge.getSchema(), schema, strict);
  }
  for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
    Set<String> values = newKeyValues.get(entry.getKey());
    if (values == null) {
      values = new LinkedHashSet<String>();
      newKeyValues.put(entry.getKey(), values);
    }
    values.add(entry.getValue());
  }
  createdBy.add(toMerge.getCreatedBy());
  return new GlobalMetaData(
      schema,
      newKeyValues,
      createdBy);
}
 
Example 11
Source File: CheckParquet251Command.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), path, ParquetMetadataConverter.NO_FILTER);

  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(
        meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);

    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(
        meta.getSchema().getColumns(),
        new Predicate<ColumnDescriptor>() {
          @Override
          public boolean apply(@Nullable ColumnDescriptor input) {
            return input != null && input.getType() == BINARY;
          }
        }));

    // now check to see if the data is actually corrupt
    ParquetFileReader reader = new ParquetFileReader(getConf(),
        fakeMeta, path, footer.getBlocks(), columns);

    try {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null;
           pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }

  return null;
}
 
Example 12
Source File: ColumnIndexValidator.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
  List<ContractViolation> violations = new ArrayList<>();
  try (ParquetFileReader reader = ParquetFileReader.open(file)) {
    FileMetaData meta = reader.getFooter().getFileMetaData();
    MessageType schema = meta.getSchema();
    List<ColumnDescriptor> columns = schema.getColumns();

    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    int rowGroupNumber = 0;
    PageReadStore rowGroup = reader.readNextRowGroup();
    while (rowGroup != null) {
      ColumnReadStore columnReadStore = new ColumnReadStoreImpl(rowGroup,
          new DummyRecordConverter(schema).getRootConverter(), schema, null);
      List<ColumnChunkMetaData> columnChunks = blocks.get(rowGroupNumber).getColumns();
      assert (columnChunks.size() == columns.size());
      for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
        ColumnDescriptor column = columns.get(columnNumber);
        ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
        ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
        if (columnIndex == null) {
          continue;
        }
        ColumnPath columnPath = columnChunk.getPath();
        OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
        List<ByteBuffer> minValues = columnIndex.getMinValues();
        List<ByteBuffer> maxValues = columnIndex.getMaxValues();
        BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
        List<Long> nullCounts = columnIndex.getNullCounts();
        List<Boolean> nullPages = columnIndex.getNullPages();
        long rowNumber = 0;
        ColumnReader columnReader = columnReadStore.getColumnReader(column);
        ByteBuffer prevMinValue = null;
        ByteBuffer prevMaxValue = null;
        for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
          boolean isNullPage = nullPages.get(pageNumber);
          ByteBuffer minValue = minValues.get(pageNumber);
          ByteBuffer maxValue = maxValues.get(pageNumber);
          PageValidator pageValidator = new PageValidator(
              column.getPrimitiveType(),
              rowGroupNumber, columnNumber, columnPath, pageNumber,
              violations, columnReader,
              minValue,
              maxValue,
              prevMinValue,
              prevMaxValue,
              boundaryOrder,
              nullCounts.get(pageNumber),
              isNullPage);
          if (!isNullPage) {
            prevMinValue = minValue;
            prevMaxValue = maxValue;
          }
          long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
          while (rowNumber <= lastRowNumberInPage) {
            pageValidator.validateValuesBelongingToRow();
            ++rowNumber;
          }
          pageValidator.finishPage();
        }
      }
      rowGroup = reader.readNextRowGroup();
      rowGroupNumber++;
    }
  }
  return violations;
}