Java Code Examples for org.apache.parquet.hadoop.metadata.ParquetMetadata

The following examples show how to use org.apache.parquet.hadoop.metadata.ParquetMetadata. These examples are extracted from open source projects; the source project, source file, and license are noted above each example where available.
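Most examples on this page start from a ParquetMetadata footer that has already been read from a file. The sketch below shows one way to obtain it, assuming a Parquet file reachable through a Hadoop Configuration; the wrapper class and the path argument are illustrative only, and several examples below use the older, deprecated ParquetFileReader.readFooter variants instead.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class ReadFooterSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path(args[0]); // path to an existing Parquet file

    // Open the file and read only its footer: file metadata, schema, and row group metadata.
    try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
      ParquetMetadata footer = reader.getFooter();
      System.out.println("schema: " + footer.getFileMetaData().getSchema());
      for (BlockMetaData block : footer.getBlocks()) {
        System.out.println("row group with " + block.getRowCount() + " rows");
      }
    }
  }
}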
Example 1
Source Project: Bats   Source File: ParquetReaderUtility.java    License: Apache License 2.0
/**
 * Map full schema paths in format `a`.`b`.`c` to respective SchemaElement objects.
 *
 * @param footer Parquet file metadata
 * @return       schema full path to SchemaElement map
 */
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  Map<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);

  Iterator<SchemaElement> iter = fileMetaData.getSchema().iterator();

  // The first element in the collection is the default `root` element. We skip it to keep keys in `a` format instead of `root`.`a`,
  // and thus avoid having to strip the prefix again when comparing with the SchemaPath string representation
  if (iter.hasNext()) {
    iter.next();
  }
  while (iter.hasNext()) {
    addSchemaElementMapping(iter, new StringBuilder(), schemaElements);
  }
  return schemaElements;
}
 
Example 2
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration conf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, conf );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), conf );
    FileStatus fileStatus = fs.getFileStatus( filePath );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.buildInputFields( schema );
    }
  } );
}
 
Example 3
Source Project: parquet-mr   Source File: CompressionConverter.java    License: Apache License 2.0
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta, MessageType schema,
                           String createdBy, CompressionCodecName codecName) throws IOException {
  int blockIndex = 0;
  PageReadStore store = reader.readNextRowGroup();
  while (store != null) {
    writer.startBlock(store.getRowCount());
    BlockMetaData blockMetaData = meta.getBlocks().get(blockIndex);
    List<ColumnChunkMetaData> columnsInOrder = blockMetaData.getColumns();
    Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(
      Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
      ColumnChunkMetaData chunk = columnsInOrder.get(i);
      ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
      ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
      writer.startColumn(columnDescriptor, crstore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
      processChunk(reader, writer, chunk, createdBy, codecName);
      writer.endColumn();
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
  }
}
 
Example 4
@Before
public void setup() throws IOException {
    protoParquetWriterWithOffset = mock(ProtoParquetWriterWithOffset.class);
    hiveClient = mock(HiveClient.class);

    when(protoParquetWriterWithOffset.getEventName()).thenReturn(eventName);
    when(protoParquetWriterWithOffset.getFinalHdfsDir()).thenReturn(finalPath);
    ProtoParquetWriter<Message> writerMock = mock(ProtoParquetWriter.class);
    when(protoParquetWriterWithOffset.getWriter()).thenReturn(writerMock);
    ParquetMetadata parquetMetadata = mock(ParquetMetadata.class);
    when(writerMock.getFooter()).thenReturn(parquetMetadata);

    PrimitiveType appId = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.BINARY, "app_id");
    schema = new MessageType("fs", appId);
    FileMetaData fileMetaData = new FileMetaData(schema, new HashMap<String, String>(), "test");
    when(parquetMetadata.getFileMetaData()).thenReturn(fileMetaData);

    when(protoParquetWriterWithOffset.getDayStartTime()).thenReturn(LocalDateTime.of(2019, 9, 10, 10, 10, 10));
}
 
Example 5
Source Project: dremio-oss   Source File: ParquetReaderUtility.java    License: Apache License 2.0
/**
 * Get the list of row group numbers for the given file input split. The logic used here is the same as how Hive's
 * Parquet input format finds the row group numbers for an input split.
 */
public static List<Integer> getRowGroupNumbersFromFileSplit(final long splitStart, final long splitLength,
                                                             final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();
  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
 
Example 6
Source Project: parquet-mr   Source File: TestParquetWriterAppendBlocks.java    License: Apache License 2.0
@Test
public void testFailDroppingColumns() throws IOException {
  MessageType droppedColumnSchema = Types.buildMessage()
      .required(BINARY).as(UTF8).named("string")
      .named("AppendTest");

  final ParquetMetadata footer = ParquetFileReader.readFooter(
      CONF, file1, NO_FILTER);
  final FSDataInputStream incoming = file1.getFileSystem(CONF).open(file1);

  Path droppedColumnFile = newTemp();
  final ParquetFileWriter writer = new ParquetFileWriter(
      CONF, droppedColumnSchema, droppedColumnFile);
  writer.start();

  TestUtils.assertThrows("Should complain that id column is dropped",
      IllegalArgumentException.class,
    (Callable<Void>) () -> {
      writer.appendRowGroups(incoming, footer.getBlocks(), false);
      return null;
    });
}
 
Example 7
Source Project: dremio-oss   Source File: TestParquetReader.java    License: Apache License 2.0
@Test
public void testArrowSchema210InFooter() throws Exception {
  URL parquet210 = getClass().getResource("/dremio-region-210.parquet");
  Path filePath210 = Path.of(parquet210.toURI());
  ParquetMetadata parquetMetadata210 =
    SingletonParquetFooterCache.readFooter(localFs, filePath210, ParquetMetadataConverter.NO_FILTER,
      ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  Map<String, String> metadata210 = parquetMetadata210.getFileMetaData().getKeyValueMetaData();

  // should not have DREMIO_ARROW_SCHEMA field, but should have DREMIO_ARROW_SCHEMA_2_1
  assertFalse(metadata210.containsKey(DREMIO_ARROW_SCHEMA));
  assertTrue(metadata210.containsKey(DREMIO_ARROW_SCHEMA_2_1));

  Schema schema210 = DremioArrowSchema.fromMetaData(metadata210);

  assertNotNull(schema210);
}
 
Example 8
Source Project: dremio-oss   Source File: TestParquetReader.java    License: Apache License 2.0
@Test
public void testArrowSchemaOldInFooter() throws Exception {
  URL badparquet = getClass().getResource("/types.parquet");

  Path filePathBad = Path.of(badparquet.toURI());
  ParquetMetadata parquetMetadataBad =
    SingletonParquetFooterCache.readFooter(localFs, filePathBad, ParquetMetadataConverter.NO_FILTER,
      ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  Map<String, String> metadataBad = parquetMetadataBad.getFileMetaData().getKeyValueMetaData();

  // should have DREMIO_ARROW_SCHEMA field, but no DREMIO_ARROW_SCHEMA_2_1
  assertTrue(metadataBad.containsKey(DREMIO_ARROW_SCHEMA));
  assertFalse(metadataBad.containsKey(DREMIO_ARROW_SCHEMA_2_1));

  try {
    DremioArrowSchema.fromMetaData(metadataBad);
    fail("Should not be able to process arrow schema");
  } catch (Exception e) {
    // ok
  }
}
 
Example 9
Source Project: parquet-mr   Source File: Schemas.java    License: Apache License 2.0
public static Schema fromParquet(Configuration conf, URI location) throws IOException {
  Path path = new Path(location);
  FileSystem fs = path.getFileSystem(conf);

  ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), path);

  String schemaString = footer.getFileMetaData()
      .getKeyValueMetaData().get("parquet.avro.schema");
  if (schemaString == null) {
    // try the older property
    schemaString = footer.getFileMetaData()
        .getKeyValueMetaData().get("avro.schema");
  }

  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter()
        .convert(footer.getFileMetaData().getSchema());
  }
}
 
Example 10
Source Project: parquet-mr   Source File: TestStatistics.java    License: Apache License 2.0
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
    metadata.getFileMetaData(),
    super.fsPath,
    metadata.getBlocks(),
    metadata.getFileMetaData().getSchema().getColumns());

  PageStatsValidator validator = new PageStatsValidator();

  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
 
Example 11
Source Project: parquet-mr   Source File: DumpCommand.java    License: Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
    super.execute(options);

    String[] args = options.getArgs();
    String input = args[0];

    Configuration conf = new Configuration();
    Path inpath = new Path(input);

    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath, NO_FILTER);
    MessageType schema = metaData.getFileMetaData().getSchema();

    boolean showmd = !options.hasOption('m');
    boolean showdt = !options.hasOption('d');
    boolean cropoutput = !options.hasOption('n');

    Set<String> showColumns = null;
    if (options.hasOption('c')) {
        String[] cols = options.getOptionValues('c');
        showColumns = new HashSet<String>(Arrays.asList(cols));
    }

    PrettyPrintWriter out = prettyPrintWriter(cropoutput);
    dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
 
Example 12
Source Project: parquet-mr   Source File: TransCompressionCommand.java    License: Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  List<String> args = options.getArgList();
  Path inPath = new Path(args.get(0));
  Path outPath = new Path(args.get(1));
  CompressionCodecName codecName = CompressionCodecName.valueOf(args.get(2));

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
 
Example 13
Source Project: parquet-mr   Source File: ParquetFileWriter.java    License: Apache License 2.0
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata.
 * Requires that the schemas be compatible and the extraMetadata be exactly equal.
 * @param files a list of files to merge metadata from
 * @param conf a configuration
 * @return merged parquet metadata for the files
 * @throws IOException if there is an error while writing
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files,  Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");

  GlobalMetaData globalMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();

  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }

  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(), blocks);
}
 
Example 14
Source Project: parquet-mr   Source File: TestPruneColumnsCommand.java    License: Apache License 2.0
@Test
public void testPruneMultiColumns() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove columns
  String cargs[] = {inputFile, outputFile, "Name", "Gender"};
  executeCommandLine(cargs);

  // Verify the schema are not changed for the columns not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 2);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Links");
  List<Type> subFields = fields.get(1).asGroupType().getFields();
  assertEquals(subFields.size(), 2);
  assertEquals(subFields.get(0).getName(), "Backward");
  assertEquals(subFields.get(1).getName(), "Forward");

  // Verify the data are not changed for the columns not pruned
  List<String> prunePaths = Arrays.asList("Name", "Gender");
  validateColumns(inputFile, prunePaths);
}
 
Example 15
Source Project: parquet-mr   Source File: TestParquetMetadataConverter.java    License: Apache License 2.0
@Test
public void testParquetMetadataConverterWithoutDictionary()
  throws IOException {
  ParquetMetadata parquetMetaData =
    createParquetMetaData(null, Encoding.PLAIN);

  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData);

  // Flag should be false
  fmd1.row_groups.forEach(rowGroup -> rowGroup.columns.forEach(column -> {
    assertFalse(column.meta_data.isSetDictionary_page_offset());
  }));

  ByteArrayOutputStream metaDataOutputStream = new ByteArrayOutputStream();
  Util.writeFileMetaData(fmd1, metaDataOutputStream);
  ByteArrayInputStream metaDataInputStream =
    new ByteArrayInputStream(metaDataOutputStream.toByteArray());
  FileMetaData fmd2 = Util.readFileMetaData(metaDataInputStream);
  ParquetMetadata pmd2 = converter.fromParquetMetadata(fmd2);

  long dicOffsetConverted =
    pmd2.getBlocks().get(0).getColumns().get(0).getDictionaryPageOffset();

  Assert.assertEquals(0, dicOffsetConverted);
}
 
Example 16
Source Project: parquet-mr   Source File: PrintFooter.java    License: Apache License 2.0
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++ blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
 
Example 17
Source Project: incubator-pinot   Source File: ParquetUtils.java    License: Apache License 2.0
/**
 * Returns the schema for the given Parquet file path.
 */
public static Schema getParquetSchema(Path path)
    throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(getConfiguration(), path, ParquetMetadataConverter.NO_FILTER);
  Map<String, String> metaData = footer.getFileMetaData().getKeyValueMetaData();
  String schemaString = metaData.get("parquet.avro.schema");
  if (schemaString == null) {
    // Try the older property
    schemaString = metaData.get("avro.schema");
  }
  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter().convert(footer.getFileMetaData().getSchema());
  }
}
 
Example 18
Source Project: Bats   Source File: ParquetReaderUtility.java    License: Apache License 2.0
/**
 * Map full column paths to all ColumnDescriptors in file schema
 *
 * @param footer Parquet file metadata
 * @return       column full path to ColumnDescriptor object map
 */
public static Map<String, ColumnDescriptor> getColNameToColumnDescriptorMapping(ParquetMetadata footer) {
  Map<String, ColumnDescriptor> colDescMap = new HashMap<>();
  List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();

  for (ColumnDescriptor column : columns) {
    colDescMap.put(getFullColumnPath(column), column);
  }
  return colDescMap;
}
 
Example 19
Source Project: parquet-mr   Source File: TestParquetWriterAppendBlocks.java    License: Apache License 2.0
@Test
public void testAllowDroppingColumns() throws IOException {
  MessageType droppedColumnSchema = Types.buildMessage()
      .required(BINARY).as(UTF8).named("string")
      .named("AppendTest");

  Path droppedColumnFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(
      CONF, droppedColumnSchema, droppedColumnFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  LinkedList<Group> expected = new LinkedList<Group>();
  expected.addAll(file1content);
  expected.addAll(file2content);

  ParquetMetadata footer = ParquetFileReader.readFooter(
      CONF, droppedColumnFile, NO_FILTER);
  for (BlockMetaData rowGroup : footer.getBlocks()) {
    Assert.assertEquals("Should have only the string column",
        1, rowGroup.getColumns().size());
  }

  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), droppedColumnFile)
      .build();

  Group next;
  while ((next = reader.read()) != null) {
    Group expectedNext = expected.removeFirst();
    Assert.assertEquals("Each string should match",
        expectedNext.getString("string", 0), next.getString("string", 0));
  }

  Assert.assertEquals("All records should be present", 0, expected.size());
}
 
Example 20
Source Project: Bats   Source File: ParquetRecordReader.java    License: Apache License 2.0
public ParquetRecordReader(FragmentContext fragmentContext,
    Path path,
    int rowGroupIndex,
    long numRecordsToRead,
    FileSystem fs,
    CodecFactory codecFactory,
    ParquetMetadata footer,
    List<SchemaPath> columns,
    ParquetReaderUtility.DateCorruptionStatus dateCorruptionStatus) {
  this(fragmentContext, numRecordsToRead, path, rowGroupIndex, fs, codecFactory, footer, columns, dateCorruptionStatus);
}
 
Example 21
Source Project: Bats   Source File: ParquetRecordReader.java    License: Apache License 2.0
public ParquetRecordReader(FragmentContext fragmentContext,
    Path path,
    int rowGroupIndex,
    FileSystem fs,
    CodecFactory codecFactory,
    ParquetMetadata footer,
    List<SchemaPath> columns,
    ParquetReaderUtility.DateCorruptionStatus dateCorruptionStatus) {
  this(fragmentContext, footer.getBlocks().get(rowGroupIndex).getRowCount(), path, rowGroupIndex, fs, codecFactory,
      footer, columns, dateCorruptionStatus);
}
 
Example 22
Source Project: Bats   Source File: ParquetRecordReader.java    License: Apache License 2.0
private int initNumRecordsToRead(long numRecordsToRead, int rowGroupIndex, ParquetMetadata footer) {
  // Callers can pass -1 if they want to read all rows.
  if (numRecordsToRead == NUM_RECORDS_TO_READ_NOT_SPECIFIED) {
    return (int) footer.getBlocks().get(rowGroupIndex).getRowCount();
  } else {
    assert (numRecordsToRead >= 0);
    return (int) Math.min(numRecordsToRead, footer.getBlocks().get(rowGroupIndex).getRowCount());
  }
}
 
Example 23
Source Project: Bats   Source File: FooterGatherer.java    License: Apache License 2.0
/**
 * An updated footer reader that tries to read the entire footer without knowing the length.
 * This should reduce the amount of seek/read roundtrips in most workloads.
 * @param config Hadoop configuration used to resolve the file system
 * @param status file status of the Parquet file to read
 * @return footer containing the parsed Parquet metadata
 * @throws IOException if the footer cannot be read
 */
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
  final FileSystem fs = status.getPath().getFileSystem(config);
  try(FSDataInputStream file = fs.open(status.getPath())) {

    final long fileLength = status.getLen();
    Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());

    int len = (int) Math.min( fileLength, (long) DEFAULT_READ_SIZE);
    byte[] footerBytes = new byte[len];
    readFully(file, fileLength - len, footerBytes, 0, len);

    checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
    final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);

    if(size > footerBytes.length - FOOTER_METADATA_SIZE){
      // if the footer is larger than our initial read, we need to read the rest.
      byte[] origFooterBytes = footerBytes;
      int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;

      footerBytes = new byte[size];

      readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
      System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
    }else{
      int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
      footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
    }

    final ByteArrayInputStream from = new ByteArrayInputStream(footerBytes);
    ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(from, NO_FILTER);
    Footer footer = new Footer(status.getPath(), metadata);
    return footer;
  }
}
 
Example 24
Source Project: parquet-mr   Source File: TestDataPageV1Checksums.java    License: Apache License 2.0
/** Construct ParquetFileReader for input file and columns */
private ParquetFileReader getParquetFileReader(Path path, Configuration conf,
                                               List<ColumnDescriptor> columns)
  throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(conf, path);
  return new ParquetFileReader(conf, footer.getFileMetaData(), path,
    footer.getBlocks(), columns);
}
 
Example 25
Source Project: parquet-mr   Source File: TestMergeMetadataFiles.java    License: Apache License 2.0
@Test
public void testMergeMetadataFiles() throws Exception {
  WrittenFileInfo info = writeFiles(false);

  ParquetMetadata commonMeta1 = ParquetFileReader.readFooter(info.conf, info.commonMetaPath1, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata commonMeta2 = ParquetFileReader.readFooter(info.conf, info.commonMetaPath2, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata meta1 = ParquetFileReader.readFooter(info.conf, info.metaPath1, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata meta2 = ParquetFileReader.readFooter(info.conf, info.metaPath2, ParquetMetadataConverter.NO_FILTER);

  assertTrue(commonMeta1.getBlocks().isEmpty());
  assertTrue(commonMeta2.getBlocks().isEmpty());
  assertEquals(commonMeta1.getFileMetaData().getSchema(), commonMeta2.getFileMetaData().getSchema());

  assertFalse(meta1.getBlocks().isEmpty());
  assertFalse(meta2.getBlocks().isEmpty());
  assertEquals(meta1.getFileMetaData().getSchema(), meta2.getFileMetaData().getSchema());


  assertEquals(commonMeta1.getFileMetaData().getKeyValueMetaData(), commonMeta2.getFileMetaData().getKeyValueMetaData());
  assertEquals(meta1.getFileMetaData().getKeyValueMetaData(), meta2.getFileMetaData().getKeyValueMetaData());

  // test file serialization
  Path mergedOut = new Path(new File(temp.getRoot(), "merged_meta").getAbsolutePath());
  Path mergedCommonOut = new Path(new File(temp.getRoot(), "merged_common_meta").getAbsolutePath());
  ParquetFileWriter.writeMergedMetadataFile(Arrays.asList(info.metaPath1, info.metaPath2), mergedOut, info.conf);
  ParquetFileWriter.writeMergedMetadataFile(Arrays.asList(info.commonMetaPath1, info.commonMetaPath2), mergedCommonOut, info.conf);

  ParquetMetadata mergedMeta = ParquetFileReader.readFooter(info.conf, mergedOut, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata mergedCommonMeta = ParquetFileReader.readFooter(info.conf, mergedCommonOut, ParquetMetadataConverter.NO_FILTER);

  // ideally we'd assert equality here, but BlockMetaData and its references don't implement equals
  assertEquals(meta1.getBlocks().size() + meta2.getBlocks().size(), mergedMeta.getBlocks().size());
  assertTrue(mergedCommonMeta.getBlocks().isEmpty());

  assertEquals(meta1.getFileMetaData().getSchema(), mergedMeta.getFileMetaData().getSchema());
  assertEquals(commonMeta1.getFileMetaData().getSchema(), mergedCommonMeta.getFileMetaData().getSchema());

  assertEquals(meta1.getFileMetaData().getKeyValueMetaData(), mergedMeta.getFileMetaData().getKeyValueMetaData());
  assertEquals(commonMeta1.getFileMetaData().getKeyValueMetaData(), mergedCommonMeta.getFileMetaData().getKeyValueMetaData());
}
 
Example 26
Source Project: parquet-mr   Source File: ParquetFileReader.java    License: Apache License 2.0
static ParquetMetadata readSummaryMetadata(Configuration configuration, Path basePath, boolean skipRowGroups) throws IOException {
  Path metadataFile = new Path(basePath, PARQUET_METADATA_FILE);
  Path commonMetaDataFile = new Path(basePath, PARQUET_COMMON_METADATA_FILE);
  FileSystem fileSystem = basePath.getFileSystem(configuration);
  if (skipRowGroups && fileSystem.exists(commonMetaDataFile)) {
    // reading the summary file that does not contain the row groups
    LOG.info("reading summary file: {}", commonMetaDataFile);
    return readFooter(configuration, commonMetaDataFile, filter(skipRowGroups));
  } else if (fileSystem.exists(metadataFile)) {
    LOG.info("reading summary file: {}", metadataFile);
    return readFooter(configuration, metadataFile, filter(skipRowGroups));
  } else {
    return null;
  }
}
 
Example 27
Source Project: dremio-oss   Source File: LocalDictionariesReader.java    License: Apache License 2.0
/**
 * Return dictionary per row group for all binary columns in given parquet file.
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @return pair of dictionaries found for binary fields and list of binary fields which are not dictionary encoded.
 * @throws IOException
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CompressionCodecFactory codecFactory) throws IOException {
  // Passing the max footer length is not required in this case as the parquet reader would already have failed.
  final ParquetMetadata parquetMetadata = SingletonParquetFooterCache.readFooter(fs, filePath, ParquetMetadataConverter.NO_FILTER,
    ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  if (parquetMetadata.getBlocks().size() > 1) {
    throw new IOException(
      format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
        parquetMetadata.getBlocks().size(), filePath));
  }
  final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
  final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();

  for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
    columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
  }

  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
  final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
  try(final FSInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
      if (isBinaryType(columnChunkMetaData.getType())) {
        final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
        // if first page is dictionary encoded then load dictionary, otherwise skip this column.
        final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
        if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
          dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
        } else {
          columnsToSkip.add(column);
        }
      }
    }
  }
  return new ImmutablePair<>(dictionaries, columnsToSkip);
}
 
Example 28
Source Project: dremio-oss   Source File: ParquetRowiseReader.java    License: Apache License 2.0
public ParquetRowiseReader(OperatorContext context, ParquetMetadata footer, int rowGroupIndex, String path,
                            ParquetScanProjectedColumns projectedColumns, FileSystem fileSystem, SchemaDerivationHelper schemaHelper,
                            SimpleIntVector deltas, InputStreamProvider inputStreamProvider, CompressionCodecFactory codec,
                            boolean readEvenIfSchemaChanges) {
  super(context, projectedColumns.getBatchSchemaProjectedColumns(), deltas);
  this.footer = footer;
  this.fileSystem = fileSystem;
  this.rowGroupIndex = rowGroupIndex;
  this.path = path;
  this.schemaHelper = schemaHelper;
  this.inputStreamProvider = inputStreamProvider;
  this.codec = codec;
  this.projectedColumns = projectedColumns;
  this.readEvenIfSchemaChanges = readEvenIfSchemaChanges;
}
 
Example 29
Source Project: parquet-mr   Source File: TestParquetMetadataConverter.java    License: Apache License 2.0
@Test
public void testMetadataToJson() {
  ParquetMetadata metadata = new ParquetMetadata(null, null);
  assertEquals("{\"fileMetaData\":null,\"blocks\":null}", ParquetMetadata.toJSON(metadata));
  assertEquals("{\n" +
          "  \"fileMetaData\" : null,\n" +
          "  \"blocks\" : null\n" +
          "}", ParquetMetadata.toPrettyJSON(metadata));
}
 
Example 30
Source Project: dremio-oss   Source File: ParquetReaderUtility.java    License: Apache License 2.0
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  HashMap<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
  for (SchemaElement se : fileMetaData.getSchema()) {
    schemaElements.put(se.getName(), se);
  }
  return schemaElements;
}