Java Code Examples for org.apache.parquet.hadoop.ParquetFileReader#readFooter()

The following examples show how to use org.apache.parquet.hadoop.ParquetFileReader#readFooter(). Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
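Most of the examples below follow the same basic pattern: call readFooter() to obtain a ParquetMetadata footer, then pull the schema or key/value properties out of its FileMetaData. Here is a minimal, self-contained sketch of that pattern, assuming a placeholder file path; note that in newer parquet-mr releases this readFooter() overload is deprecated in favor of opening the file (for example via ParquetFileReader.open(...)) and calling getFooter().

import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

public class ReadFooterSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/example.parquet"); // placeholder path

    // NO_FILTER reads the complete footer; SKIP_ROW_GROUPS is cheaper when only
    // file-level metadata (schema, key/value properties) is needed.
    ParquetMetadata footer =
        ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);

    MessageType schema = footer.getFileMetaData().getSchema();
    Map<String, String> keyValueMetaData = footer.getFileMetaData().getKeyValueMetaData();

    System.out.println("Created by: " + footer.getFileMetaData().getCreatedBy());
    System.out.println("Schema:\n" + schema);
    System.out.println("Properties: " + keyValueMetaData);
  }
}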
Example 1
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
    metadata.getFileMetaData(),
    super.fsPath,
    metadata.getBlocks(),
    metadata.getFileMetaData().getSchema().getColumns());

  PageStatsValidator validator = new PageStatsValidator();

  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
 
Example 2
Source File: Schemas.java    From kite with Apache License 2.0
public static Schema fromParquet(FileSystem fs, Path location) throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), location);

  String schemaString = footer.getFileMetaData()
      .getKeyValueMetaData().get("parquet.avro.schema");
  if (schemaString == null) {
    // try the older property
    schemaString = footer.getFileMetaData()
        .getKeyValueMetaData().get("avro.schema");
  }

  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter()
        .convert(footer.getFileMetaData().getSchema());
  }
}
 
Example 3
Source File: TransCompressionCommand.java    From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  List<String> args = options.getArgList();
  Path inPath = new Path(args.get(0));
  Path outPath = new Path(args.get(1));
  CompressionCodecName codecName = CompressionCodecName.valueOf(args.get(2));

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
 
Example 4
Source File: CompressionConveterTest.java    From parquet-mr with Apache License 2.0
private void convertCompression(Configuration conf, String inputFile, String outputFile, String codec) throws IOException {
  Path inPath = new Path(inputFile);
  Path outPath = new Path(outputFile);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
 
Example 5
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0
@Test
public void testPruneNestedParentColumn() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove parent column. All of its children will be removed.
  String cargs[] = {inputFile, outputFile, "Links"};
  executeCommandLine(cargs);

  // Verify the schema is not changed for the columns not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 3);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Name");
  assertEquals(fields.get(2).getName(), "Gender");

  // Verify the data are not changed for the columns not pruned
  List<String> prunePaths = Arrays.asList("Links");
  validateColumns(inputFile, prunePaths);
}
 
Example 6
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0
@Test
public void testPruneMultiColumns() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove columns
  String cargs[] = {inputFile, outputFile, "Name", "Gender"};
  executeCommandLine(cargs);

  // Verify the schema is not changed for the columns not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 2);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Links");
  List<Type> subFields = fields.get(1).asGroupType().getFields();
  assertEquals(subFields.size(), 2);
  assertEquals(subFields.get(0).getName(), "Backward");
  assertEquals(subFields.get(1).getName(), "Forward");

  // Verify the data are not changed for the columns not pruned
  List<String> prunePaths = Arrays.asList("Name", "Gender");
  validateColumns(inputFile, prunePaths);
}
 
Example 7
Source File: TestThriftToParquetFileWriter.java    From parquet-mr with Apache License 2.0
private ParquetReader<Group> createRecordReader(Path parquetFilePath) throws IOException {
  Configuration configuration = new Configuration(true);

  GroupReadSupport readSupport = new GroupReadSupport();
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, parquetFilePath);
  MessageType schema = readFooter.getFileMetaData().getSchema();

  readSupport.init(configuration, null, schema);
  return new ParquetReader<Group>(parquetFilePath, readSupport);
}
 
Example 8
Source File: TestParquetWriter.java    From dremio-oss with Apache License 2.0
public void runTestAndValidate(String selection, String validationSelection, String inputTable, String outputFile, boolean sort) throws Exception {
  try {
    deleteTableIfExists(outputFile);
    test("use dfs_test");
//    test("ALTER SESSION SET \"planner.add_producer_consumer\" = false");
    String query = select(selection, inputTable, sort);
    System.out.println(outputFile);
    String create = "CREATE TABLE " + outputFile + " AS " + query;
    String validateQuery = select(validationSelection, outputFile, sort);
    test(create);
    test(validateQuery); // TODO: remove
    testBuilder()
        .unOrdered()
        .sqlQuery(validateQuery)
        .sqlBaselineQuery(query)
        .go();

    Configuration hadoopConf = new Configuration();
    Path output = new Path(getDfsTestTmpSchemaLocation(), outputFile);
    FileSystem fs = output.getFileSystem(hadoopConf);
    for (FileStatus file : fs.listStatus(output)) {
      ParquetMetadata footer = ParquetFileReader.readFooter(hadoopConf, file, SKIP_ROW_GROUPS);
      String version = footer.getFileMetaData().getKeyValueMetaData().get(DREMIO_VERSION_PROPERTY);
      assertEquals(DremioVersionInfo.getVersion(), version);
      PageHeaderUtil.validatePageHeaders(file.getPath(), footer);
    }
  } finally {
    deleteTableIfExists(outputFile);
  }
}
 
Example 9
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0
private static List<PageReadStore> readBlocksFromFile(Path file) throws IOException {
  List<PageReadStore> rowGroups = new ArrayList<PageReadStore>();

  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration, file, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader fileReader = new ParquetFileReader(configuration, metadata.getFileMetaData(), file, metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());

  PageReadStore group;
  while ((group = fileReader.readNextRowGroup()) != null) {
    rowGroups.add(group);
  }

  return rowGroups;
}
 
Example 10
Source File: ParquetRecordReaderWrapper.java    From parquet-mr with Apache License 2.0
/**
 * gets a ParquetInputSplit corresponding to a split given by Hive
 *
 * @param oldSplit The split given by Hive
 * @param conf The JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
protected ParquetInputSplit getSplit(
    final InputSplit oldSplit,
    final JobConf conf
    ) throws IOException {
  if (oldSplit instanceof FileSplit) {
    FileSplit fileSplit = (FileSplit) oldSplit;
    final long splitStart = fileSplit.getStart();
    final long splitLength = fileSplit.getLength();
    final Path finalPath = fileSplit.getPath();
    final JobConf cloneJob = hiveBinding.pushProjectionsAndFilters(conf, finalPath.getParent());

    final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath, SKIP_ROW_GROUPS);
    final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    final ReadContext readContext =
        new DataWritableReadSupport()
          .init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());

    schemaSize = MessageTypeParser.parseMessageType(
        readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)
    ).getFieldCount();
    return new ParquetInputSplit(
        finalPath,
        splitStart,
        splitStart + splitLength,
        splitLength,
        fileSplit.getLocations(),
        null);
  } else {
    throw new IllegalArgumentException("Unknown split type: " + oldSplit);
  }
}
 
Example 11
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0
@Test
public void testPruneNestedColumn() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove nested column
  String cargs[] = {inputFile, outputFile, "Links.Backward"};
  executeCommandLine(cargs);

  // Verify the schema is not changed for the columns not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 4);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Name");
  assertEquals(fields.get(2).getName(), "Gender");
  assertEquals(fields.get(3).getName(), "Links");
  List<Type> subFields = fields.get(3).asGroupType().getFields();
  assertEquals(subFields.size(), 1);
  assertEquals(subFields.get(0).getName(), "Forward");

  // Verify the data are not changed for the columns not pruned
  List<String> prunePaths = Arrays.asList("Links.Backward");
  validateColumns(inputFile, prunePaths);
}
 
Example 12
Source File: CheckParquet251Command.java    From parquet-mr with Apache License 2.0
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), path, ParquetMetadataConverter.NO_FILTER);

  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(
        meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);

    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(
        meta.getSchema().getColumns(),
        new Predicate<ColumnDescriptor>() {
          @Override
          public boolean apply(@Nullable ColumnDescriptor input) {
            return input != null && input.getType() == BINARY;
          }
        }));

    // now check to see if the data is actually corrupt
    ParquetFileReader reader = new ParquetFileReader(getConf(),
        fakeMeta, path, footer.getBlocks(), columns);

    try {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null;
           pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }

  return null;
}
 
Example 13
Source File: CatCommand.java    From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];

  ParquetReader<SimpleRecord> reader = null;
  try {
    PrintWriter writer = new PrintWriter(Main.out, true);
    reader = ParquetReader.builder(new SimpleReadSupport(), new Path(input)).build();
    ParquetMetadata metadata = ParquetFileReader.readFooter(new Configuration(), new Path(input));
    JsonRecordFormatter.JsonGroupFormatter formatter = JsonRecordFormatter.fromSchema(metadata.getFileMetaData().getSchema());

    for (SimpleRecord value = reader.read(); value != null; value = reader.read()) {
      if (options.hasOption('j')) {
        writer.write(formatter.formatRecord(value));
      } else {
        value.prettyPrint(writer);
      }
      writer.println();
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
      }
    }
  }
}
 
Example 14
Source File: PruneColumnsCommand.java    From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  List<String> args = options.getArgList();
  Path inputFile = new Path(args.get(0));
  Path outputFile = new Path(args.get(1));
  List<String> cols = args.subList(2, args.size());

  Set<ColumnPath> prunePaths = convertToColumnPaths(cols);

  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, inputFile, ParquetMetadataConverter.NO_FILTER);
  FileMetaData metaData = pmd.getFileMetaData();
  MessageType schema = metaData.getSchema();
  List<String> paths = new ArrayList<>();
  getPaths(schema, paths, null);

  for (String col : cols) {
    if (!paths.contains(col)) {
      LOG.warn("Input column name {} doesn't show up in the schema of file {}", col, inputFile.getName());
    }
  }

  ParquetFileWriter writer = new ParquetFileWriter(conf,
    pruneColumnsInSchema(schema, prunePaths), outputFile, ParquetFileWriter.Mode.CREATE);

  writer.start();
  writer.appendFile(HadoopInputFile.fromPath(inputFile, conf));
  writer.end(metaData.getKeyValueMetaData());
}
 
Example 15
Source File: ColumnSizeCommand.java    From parquet-mr with Apache License 2.0
public Map<String, Long> getColumnSizeInBytes(Path inputFile) throws IOException {
  Map<String, Long> colSizes = new HashMap<>();
  ParquetMetadata pmd = ParquetFileReader.readFooter(new Configuration(), inputFile, ParquetMetadataConverter.NO_FILTER);

  for (BlockMetaData block : pmd.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      String colName = column.getPath().toDotString();
      colSizes.put(colName, column.getTotalSize() + colSizes.getOrDefault(colName, 0L));
    }
  }

  return colSizes;
}
 
Example 16
Source File: ParquetMetadataCommand.java    From parquet-mr with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1,
      "Cannot process multiple Parquet files.");

  String source = targets.get(0);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER);

  console.info("\nFile path:  {}", source);
  console.info("Created by: {}", footer.getFileMetaData().getCreatedBy());

  Map<String, String> kv = footer.getFileMetaData().getKeyValueMetaData();
  if (kv != null && !kv.isEmpty()) {
    console.info("Properties:");
    String format = "  %" + maxSize(kv.keySet()) + "s: %s";
    for (Map.Entry<String, String> entry : kv.entrySet()) {
      console.info(String.format(format, entry.getKey(), entry.getValue()));
    }
  } else {
    console.info("Properties: (none)");
  }

  MessageType schema = footer.getFileMetaData().getSchema();
  console.info("Schema:\n{}", schema);

  List<BlockMetaData> rowGroups = footer.getBlocks();
  for (int index = 0, n = rowGroups.size(); index < n; index += 1) {
    printRowGroup(console, index, rowGroups.get(index), schema);
  }

  console.info("");

  return 0;
}
 
Example 17
Source File: ParquetUtils.java    From hudi with Apache License 2.0
public static ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) {
  ParquetMetadata footer;
  try {
    // TODO(vc): Should we use the parallel reading version here?
    footer = ParquetFileReader.readFooter(FSUtils.getFs(parquetFilePath.toString(), conf).getConf(), parquetFilePath);
  } catch (IOException e) {
    throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e);
  }
  return footer;
}
 
Example 18
Source File: TableSchemaResolver.java    From hudi with Apache License 2.0
/**
 * Read the parquet schema from a parquet File.
 */
public MessageType readSchemaFromBaseFile(Path parquetFilePath) throws IOException {
  LOG.info("Reading schema from " + parquetFilePath);

  FileSystem fs = metaClient.getRawFs();
  if (!fs.exists(parquetFilePath)) {
    throw new IllegalArgumentException(
        "Failed to read schema from data file " + parquetFilePath + ". File does not exist.");
  }
  ParquetMetadata fileFooter =
      ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER);
  return fileFooter.getFileMetaData().getSchema();
}
 
Example 19
Source File: AllAvroTypesIT.java    From datacollector with Apache License 2.0
@Test
public void testType() throws Exception {
  File inputFile = new File(getInputDir(), "input.avro");

  Schema avroSchema = Schema.parse("{" +
    "\"type\": \"record\", " +
    "\"name\": \"RandomRecord\", " +
    "\"fields\": [" +
    "{\"name\": \"value\", \"type\": " + type + " }" +
    "]" +
    "}");

  List<Map<String, Object>> data = ImmutableList.of(Collections.singletonMap("value", value));

  generateAvroFile(avroSchema, inputFile, data);

  AvroConversionCommonConfig commonConfig = new AvroConversionCommonConfig();
  AvroParquetConfig conf = new AvroParquetConfig();
  commonConfig.inputFile = inputFile.getAbsolutePath();
  commonConfig.outputDirectory = getOutputDir();

  MapReduceExecutor executor = generateExecutor(commonConfig, conf, Collections.emptyMap());

  ExecutorRunner runner = new ExecutorRunner.Builder(MapReduceDExecutor.class, executor)
    .setOnRecordError(OnRecordError.TO_ERROR)
    .build();
  runner.runInit();

  Record record = RecordCreator.create();
  record.set(Field.create(Collections.<String, Field>emptyMap()));

  runner.runWrite(ImmutableList.of(record));
  Assert.assertEquals(0, runner.getErrorRecords().size());
  runner.runDestroy();

  Path outputFile = new Path(getOutputDir(), "input.parquet");

  // Ensure that the parquet file has proper structure - not sure how to get required information including logical
  // types from Parquet APIs easily, so we're doing string matching.
  ParquetMetadata readFooter = ParquetFileReader.readFooter(new Configuration(), outputFile);
  String metadataString = readFooter.toString();
  for(String expectedFragment : expectedSchemaStrings) {
    Assert.assertTrue(Utils.format("Missing fragment {} in schema string {}", expectedFragment, metadataString), metadataString.contains(expectedFragment));
  }

  // Validate that content is the same as input
  // Skip for TIMESTAMP_MILLIS as it is not yet supported by Avro to Parquet hadoop libraries (it is supported for parquet in both Hive and Impala when directly using parquet in hadoop libs)
  // TODO: Remove if condition to run if body when timestamp fully supported between avro and parquet
  if (!TIMESTAMP_MILLIS.toString().equals(type)) {
    validateParquetFile(outputFile, data);
  }
}
 
Example 20
Source File: ParquetReader.java    From reef with Apache License 2.0
/**
 * Retrieve avro schema from parquet file.
 * @param configuration Hadoop configuration.
 * @param filter Filter for Avro metadata.
 * @return avro schema from parquet file.
 * @throws IOException if the Avro schema couldn't be parsed from the parquet file.
 */
private Schema createAvroSchema(final Configuration configuration, final MetadataFilter filter) throws IOException {
  final ParquetMetadata footer = ParquetFileReader.readFooter(configuration, parquetFilePath, filter);
  final AvroSchemaConverter converter = new AvroSchemaConverter();
  final MessageType schema = footer.getFileMetaData().getSchema();
  return converter.convert(schema);
}