org.apache.parquet.hadoop.metadata.ParquetMetadata Java Examples

The following examples show how to use org.apache.parquet.hadoop.metadata.ParquetMetadata. They are drawn from several open source projects; the originating project and source file are noted above each example.
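Before the project-specific examples, here is a hedged, minimal sketch of how a ParquetMetadata footer is typically obtained and inspected. The file path is illustrative, and imports are omitted as in the examples below; readFooter is deprecated in recent parquet-mr releases in favour of ParquetFileReader.open(...).getFooter(), as used in Example #20.

// Uses org.apache.parquet.hadoop.ParquetFileReader and
// org.apache.parquet.format.converter.ParquetMetadataConverter.
Configuration conf = new Configuration();
Path path = new Path("/tmp/example.parquet");                                          // illustrative path
ParquetMetadata footer = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
MessageType schema = footer.getFileMetaData().getSchema();                             // file schema
List<BlockMetaData> rowGroups = footer.getBlocks();                                    // row group metadata
Map<String, String> keyValueMetaData = footer.getFileMetaData().getKeyValueMetaData(); // key/value metadata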
Example #1
Source File: ParquetReaderUtility.java    From Bats with Apache License 2.0
/**
 * Map full schema paths in format `a`.`b`.`c` to respective SchemaElement objects.
 *
 * @param footer Parquet file metadata
 * @return       schema full path to SchemaElement map
 */
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  Map<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);

  Iterator<SchemaElement> iter = fileMetaData.getSchema().iterator();

  // The first element in the collection is the default `root` element. Skip it so keys stay in the `a` form
  // rather than `root`.`a`, which avoids stripping the prefix again when comparing with the SchemaPath string representation.
  if (iter.hasNext()) {
    iter.next();
  }
  while (iter.hasNext()) {
    addSchemaElementMapping(iter, new StringBuilder(), schemaElements);
  }
  return schemaElements;
}
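A hedged usage sketch (the footer variable is illustrative): per the Javadoc above, the map is keyed by the back-quoted dotted path of each field, so a nested column is looked up as `a`.`b`.`c` without the `root` prefix.

Map<String, SchemaElement> elements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
SchemaElement element = elements.get("`a`.`b`.`c`");   // nested column, keyed without `root`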
 
Example #2
Source File: TestParquetReader.java    From dremio-oss with Apache License 2.0
@Test
public void testArrowSchemaOldInFooter() throws Exception {
  URL badparquet = getClass().getResource("/types.parquet");

  Path filePathBad = Path.of(badparquet.toURI());
  ParquetMetadata parquetMetadataBad =
    SingletonParquetFooterCache.readFooter(localFs, filePathBad, ParquetMetadataConverter.NO_FILTER,
      ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  Map<String, String> metadataBad = parquetMetadataBad.getFileMetaData().getKeyValueMetaData();

  // should have DREMIO_ARROW_SCHEMA field, but no DREMIO_ARROW_SCHEMA_2_1
  assertTrue(metadataBad.containsKey(DREMIO_ARROW_SCHEMA));
  assertFalse(metadataBad.containsKey(DREMIO_ARROW_SCHEMA_2_1));

  try {
    DremioArrowSchema.fromMetaData(metadataBad);
    fail("Should not be able to process arrow schema");
  } catch (Exception e) {
    // ok
  }
}
 
Example #3
Source File: Schemas.java    From parquet-mr with Apache License 2.0
public static Schema fromParquet(Configuration conf, URI location) throws IOException {
  Path path = new Path(location);
  FileSystem fs = path.getFileSystem(conf);

  ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), path);

  String schemaString = footer.getFileMetaData()
      .getKeyValueMetaData().get("parquet.avro.schema");
  if (schemaString == null) {
    // try the older property
    schemaString = footer.getFileMetaData()
        .getKeyValueMetaData().get("avro.schema");
  }

  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter()
        .convert(footer.getFileMetaData().getSchema());
  }
}
 
Example #4
Source File: TestParquetReader.java    From dremio-oss with Apache License 2.0
@Test
public void testArrowSchema210InFooter() throws Exception {
  URL parquet210 = getClass().getResource("/dremio-region-210.parquet");
  Path filePath210 = Path.of(parquet210.toURI());
  ParquetMetadata parquetMetadata210 =
    SingletonParquetFooterCache.readFooter(localFs, filePath210, ParquetMetadataConverter.NO_FILTER,
      ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  Map<String, String> metadata210 = parquetMetadata210.getFileMetaData().getKeyValueMetaData();

  // should not have DREMIO_ARROW_SCHEMA field, but should have DREMIO_ARROW_SCHEMA_2_1
  assertFalse(metadata210.containsKey(DREMIO_ARROW_SCHEMA));
  assertTrue(metadata210.containsKey(DREMIO_ARROW_SCHEMA_2_1));

  Schema schema210 = DremioArrowSchema.fromMetaData(metadata210);

  assertNotNull(schema210);
}
 
Example #5
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
    metadata.getFileMetaData(),
    super.fsPath,
    metadata.getBlocks(),
    metadata.getFileMetaData().getSchema().getColumns());

  PageStatsValidator validator = new PageStatsValidator();

  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
 
Example #6
Source File: ParquetReaderUtility.java    From dremio-oss with Apache License 2.0
/**
 * Get the list of row group numbers for the given file input split. The logic mirrors how Hive's Parquet input
 * format finds the row group numbers for an input split.
 */
public static List<Integer> getRowGroupNumbersFromFileSplit(final long splitStart, final long splitLength,
                                                             final ParquetMetadata footer) throws IOException {
  final List<BlockMetaData> blocks = footer.getBlocks();
  final List<Integer> rowGroupNums = Lists.newArrayList();

  int i = 0;
  for (final BlockMetaData block : blocks) {
    final long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
    if (firstDataPage >= splitStart && firstDataPage < splitStart + splitLength) {
      rowGroupNums.add(i);
    }
    i++;
  }

  return rowGroupNums;
}
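A hedged usage sketch (the fileSplit, localFs, filePath, and maxFooterLen names are illustrative): given a Hadoop FileSplit and the file footer, the helper returns the indexes of the row groups whose first data page falls inside the split's byte range.

ParquetMetadata footer = SingletonParquetFooterCache.readFooter(localFs, filePath,
    ParquetMetadataConverter.NO_FILTER, maxFooterLen);                 // as in Examples #2 and #4
List<Integer> rowGroupNums = ParquetReaderUtility.getRowGroupNumbersFromFileSplit(
    fileSplit.getStart(), fileSplit.getLength(), footer);              // fileSplit is a Hadoop FileSplit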
 
Example #7
Source File: HiveProtoParquetWriterWithOffsetTest.java    From garmadon with Apache License 2.0
@Before
public void setup() throws IOException {
    protoParquetWriterWithOffset = mock(ProtoParquetWriterWithOffset.class);
    hiveClient = mock(HiveClient.class);

    when(protoParquetWriterWithOffset.getEventName()).thenReturn(eventName);
    when(protoParquetWriterWithOffset.getFinalHdfsDir()).thenReturn(finalPath);
    ProtoParquetWriter<Message> writerMock = mock(ProtoParquetWriter.class);
    when(protoParquetWriterWithOffset.getWriter()).thenReturn(writerMock);
    ParquetMetadata parquetMetadata = mock(ParquetMetadata.class);
    when(writerMock.getFooter()).thenReturn(parquetMetadata);

    PrimitiveType appId = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.BINARY, "app_id");
    schema = new MessageType("fs", appId);
    FileMetaData fileMetaData = new FileMetaData(schema, new HashMap<String, String>(), "test");
    when(parquetMetadata.getFileMetaData()).thenReturn(fileMetaData);

    when(protoParquetWriterWithOffset.getDayStartTime()).thenReturn(LocalDateTime.of(2019, 9, 10, 10, 10, 10));
}
 
Example #8
Source File: CompressionConverter.java    From parquet-mr with Apache License 2.0
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta, MessageType schema,
                           String createdBy, CompressionCodecName codecName) throws IOException {
  int blockIndex = 0;
  PageReadStore store = reader.readNextRowGroup();
  while (store != null) {
    writer.startBlock(store.getRowCount());
    BlockMetaData blockMetaData = meta.getBlocks().get(blockIndex);
    List<ColumnChunkMetaData> columnsInOrder = blockMetaData.getColumns();
    Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(
      Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
      ColumnChunkMetaData chunk = columnsInOrder.get(i);
      ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
      ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
      writer.startColumn(columnDescriptor, crstore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
      processChunk(reader, writer, chunk, createdBy, codecName);
      writer.endColumn();
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
  }
}
 
Example #9
Source File: DumpCommand.java    From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
    super.execute(options);

    String[] args = options.getArgs();
    String input = args[0];

    Configuration conf = new Configuration();
    Path inpath = new Path(input);

    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath, NO_FILTER);
    MessageType schema = metaData.getFileMetaData().getSchema();

    boolean showmd = !options.hasOption('m');
    boolean showdt = !options.hasOption('d');
    boolean cropoutput = !options.hasOption('n');

    Set<String> showColumns = null;
    if (options.hasOption('c')) {
        String[] cols = options.getOptionValues('c');
        showColumns = new HashSet<String>(Arrays.asList(cols));
    }

    PrettyPrintWriter out = prettyPrintWriter(cropoutput);
    dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
 
Example #10
Source File: TransCompressionCommand.java    From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  List<String> args = options.getArgList();
  Path inPath = new Path(args.get(0));
  Path outPath = new Path(args.get(1));
  CompressionCodecName codecName = CompressionCodecName.valueOf(args.get(2));

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
 
Example #11
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata.
 * Requires that the schemas be compatible, and the extraMetadata be exactly equal.
 * @param files a list of files to merge metadata from
 * @param conf a configuration
 * @return merged parquet metadata for the files
 * @throws IOException if there is an error while reading the metadata files
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");

  GlobalMetaData globalMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();

  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }

  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(), blocks);
}
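A minimal usage sketch under the same deprecation caveat (the summary-file paths are illustrative): merge the footers of several _metadata files and inspect the combined result.

List<Path> metadataFiles = Arrays.asList(
    new Path("hdfs:///warehouse/table/part-0/_metadata"),    // illustrative paths
    new Path("hdfs:///warehouse/table/part-1/_metadata"));
ParquetMetadata merged = ParquetFileWriter.mergeMetadataFiles(metadataFiles, new Configuration());
MessageType mergedSchema = merged.getFileMetaData().getSchema();
long totalRows = merged.getBlocks().stream().mapToLong(BlockMetaData::getRowCount).sum();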
 
Example #12
Source File: PentahoApacheInputFormat.java    From pentaho-hadoop-shims with Apache License 2.0
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration conf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, conf );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), conf );
    FileStatus fileStatus = fs.getFileStatus( filePath );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.buildInputFields( schema );
    }
  } );
}
 
Example #13
Source File: TestParquetWriterAppendBlocks.java    From parquet-mr with Apache License 2.0
@Test
public void testFailDroppingColumns() throws IOException {
  MessageType droppedColumnSchema = Types.buildMessage()
      .required(BINARY).as(UTF8).named("string")
      .named("AppendTest");

  final ParquetMetadata footer = ParquetFileReader.readFooter(
      CONF, file1, NO_FILTER);
  final FSDataInputStream incoming = file1.getFileSystem(CONF).open(file1);

  Path droppedColumnFile = newTemp();
  final ParquetFileWriter writer = new ParquetFileWriter(
      CONF, droppedColumnSchema, droppedColumnFile);
  writer.start();

  TestUtils.assertThrows("Should complain that id column is dropped",
      IllegalArgumentException.class,
    (Callable<Void>) () -> {
      writer.appendRowGroups(incoming, footer.getBlocks(), false);
      return null;
    });
}
 
Example #14
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0
@Test
public void testPruneMultiColumns() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove columns
  String cargs[] = {inputFile, outputFile, "Name", "Gender"};
  executeCommandLine(cargs);

  // Verify the schema is unchanged for the columns that were not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 2);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Links");
  List<Type> subFields = fields.get(1).asGroupType().getFields();
  assertEquals(subFields.size(), 2);
  assertEquals(subFields.get(0).getName(), "Backward");
  assertEquals(subFields.get(1).getName(), "Forward");

  // Verify the data is unchanged for the columns that were not pruned
  List<String> prunePaths = Arrays.asList("Name", "Gender");
  validateColumns(inputFile, prunePaths);
}
 
Example #15
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testParquetMetadataConverterWithoutDictionary()
  throws IOException {
  ParquetMetadata parquetMetaData =
    createParquetMetaData(null, Encoding.PLAIN);

  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData);

  // Flag should be false
  fmd1.row_groups.forEach(rowGroup -> rowGroup.columns.forEach(column -> {
    assertFalse(column.meta_data.isSetDictionary_page_offset());
  }));

  ByteArrayOutputStream metaDataOutputStream = new ByteArrayOutputStream();
  Util.writeFileMetaData(fmd1, metaDataOutputStream);
  ByteArrayInputStream metaDataInputStream =
    new ByteArrayInputStream(metaDataOutputStream.toByteArray());
  FileMetaData fmd2 = Util.readFileMetaData(metaDataInputStream);
  ParquetMetadata pmd2 = converter.fromParquetMetadata(fmd2);

  long dicOffsetConverted =
    pmd2.getBlocks().get(0).getColumns().get(0).getDictionaryPageOffset();

  Assert.assertEquals(0, dicOffsetConverted);
}
 
Example #16
Source File: ParquetUtils.java    From incubator-pinot with Apache License 2.0
/**
 * Returns the schema for the given Parquet file path.
 */
public static Schema getParquetSchema(Path path)
    throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(getConfiguration(), path, ParquetMetadataConverter.NO_FILTER);
  Map<String, String> metaData = footer.getFileMetaData().getKeyValueMetaData();
  String schemaString = metaData.get("parquet.avro.schema");
  if (schemaString == null) {
    // Try the older property
    schemaString = metaData.get("avro.schema");
  }
  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter().convert(footer.getFileMetaData().getSchema());
  }
}
 
Example #17
Source File: PrintFooter.java    From parquet-mr with Apache License 2.0
private static void add(ParquetMetadata footer) {
  for (BlockMetaData blockMetaData : footer.getBlocks()) {
    ++ blockCount;
    MessageType schema = footer.getFileMetaData().getSchema();
    recordCount += blockMetaData.getRowCount();
    List<ColumnChunkMetaData> columns = blockMetaData.getColumns();
    for (ColumnChunkMetaData columnMetaData : columns) {
      ColumnDescriptor desc = schema.getColumnDescription(columnMetaData.getPath().toArray());
      add(
          desc,
          columnMetaData.getValueCount(),
          columnMetaData.getTotalSize(),
          columnMetaData.getTotalUncompressedSize(),
          columnMetaData.getEncodings(),
          columnMetaData.getStatistics());
    }
  }
}
 
Example #18
Source File: MetadataUtils.java    From parquet-mr with Apache License 2.0
static void showDetails(PrettyPrintWriter out, ParquetMetadata meta, boolean showOriginalTypes) {
  showDetails(out, meta.getFileMetaData(), showOriginalTypes);

  long i = 1;
  for (BlockMetaData bmeta : meta.getBlocks()) {
    out.println();
    showDetails(out, bmeta, i++);
  }
}
 
Example #19
Source File: StreamPerColumnProvider.java    From dremio-oss with Apache License 2.0
@Override
public ParquetMetadata getFooter() throws IOException {
  if(footer == null) {
    SingletonParquetFooterCache footerCache = new SingletonParquetFooterCache();
    footer = footerCache.getFooter(getStream(null), path.toString(), length, fs, maxFooterLen);
  }
  return footer;
}
 
Example #20
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0
@Before
public void setUp() throws Exception {
  reader = ParquetFileReader.open(conf, file);
  ParquetMetadata meta = reader.getFooter();
  ccmd = meta.getBlocks().get(0).getColumns();
  dictionaries = reader.getDictionaryReader(meta.getBlocks().get(0));
}
 
Example #21
Source File: ColumnSizeCommand.java    From parquet-mr with Apache License 2.0
public Map<String, Long> getColumnSizeInBytes(Path inputFile) throws IOException {
  Map<String, Long> colSizes = new HashMap<>();
  ParquetMetadata pmd = ParquetFileReader.readFooter(new Configuration(), inputFile, ParquetMetadataConverter.NO_FILTER);

  for (BlockMetaData block : pmd.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      String colName = column.getPath().toDotString();
      colSizes.put(colName, column.getTotalSize() + colSizes.getOrDefault(colName, 0L));
    }
  }

  return colSizes;
}
 
Example #22
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0
@Test
public void testPruneOneColumn() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove column
  String cargs[] = {inputFile, outputFile, "Gender"};
  executeCommandLine(cargs);

  // Verify the schema is unchanged for the columns that were not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 3);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Name");
  assertEquals(fields.get(2).getName(), "Links");
  List<Type> subFields = fields.get(2).asGroupType().getFields();
  assertEquals(subFields.size(), 2);
  assertEquals(subFields.get(0).getName(), "Backward");
  assertEquals(subFields.get(1).getName(), "Forward");

  // Verify the data is unchanged for the columns that were not pruned
  List<String> prunePaths = Arrays.asList("Gender");
  validateColumns(inputFile, prunePaths);
}
 
Example #23
Source File: ParquetReaderFactory.java    From dremio-oss with Apache License 2.0
RecordReader newReader(OperatorContext context,
                       ParquetScanProjectedColumns projectedColumns,
                       String path,
                       CompressionCodecFactory codecFactory,
                       List<ParquetFilterCondition> conditions,
                       ParquetFilterCreator filterCreator,
                       ParquetDictionaryConvertor dictionaryConvertor,
                       boolean enableDetailedTracing,
                       ParquetMetadata footer,
                       int rowGroupIndex,
                       SimpleIntVector deltas,
                       SchemaDerivationHelper schemaHelper,
                       InputStreamProvider inputStreamProvider);
 
Example #24
Source File: TestConvertAvroToParquet.java    From nifi with Apache License 2.0
@Test
public void test_Meta_Info() throws Exception {

    FileInputStream fileInputStream = new FileInputStream(tmpAvro);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    int readedBytes;
    byte[] buf = new byte[1024];
    while ((readedBytes = fileInputStream.read(buf)) > 0) {
        out.write(buf, 0, readedBytes);
    }
    out.close();

    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test.avro");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToParquet.SUCCESS).get(0);

    // Save the flowfile
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream(tmpParquet);
    fos.write(resultContents);
    fos.flush();
    fos.close();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    ParquetMetadata metaData;
    metaData = ParquetFileReader.readFooter(conf, new Path(tmpParquet.getAbsolutePath()), NO_FILTER);

    // number of records
    long nParquetRecords = 0;
    for(BlockMetaData meta : metaData.getBlocks()){
        nParquetRecords += meta.getRowCount();
    }
    long nAvroRecord = records.size();

    assertEquals(nParquetRecords, nAvroRecord);
}
 
Example #25
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0
private static List<PageReadStore> readBlocksFromFile(Path file) throws IOException {
  List<PageReadStore> rowGroups = new ArrayList<PageReadStore>();

  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration, file, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader fileReader = new ParquetFileReader(configuration, metadata.getFileMetaData(), file, metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());

  PageReadStore group;
  while ((group = fileReader.readNextRowGroup()) != null) {
    rowGroups.add(group);
  }

  return rowGroups;
}
 
Example #26
Source File: TestParquetWriter.java    From dremio-oss with Apache License 2.0
public void runTestAndValidate(String selection, String validationSelection, String inputTable, String outputFile, boolean sort) throws Exception {
  try {
    deleteTableIfExists(outputFile);
    test("use dfs_test");
//    test("ALTER SESSION SET \"planner.add_producer_consumer\" = false");
    String query = select(selection, inputTable, sort);
    System.out.println(outputFile);
    String create = "CREATE TABLE " + outputFile + " AS " + query;
    String validateQuery = select(validationSelection, outputFile, sort);
    test(create);
    test(validateQuery); // TODO: remove
    testBuilder()
        .unOrdered()
        .sqlQuery(validateQuery)
        .sqlBaselineQuery(query)
        .go();

    Configuration hadoopConf = new Configuration();
    Path output = new Path(getDfsTestTmpSchemaLocation(), outputFile);
    FileSystem fs = output.getFileSystem(hadoopConf);
    for (FileStatus file : fs.listStatus(output)) {
      ParquetMetadata footer = ParquetFileReader.readFooter(hadoopConf, file, SKIP_ROW_GROUPS);
      String version = footer.getFileMetaData().getKeyValueMetaData().get(DREMIO_VERSION_PROPERTY);
      assertEquals(DremioVersionInfo.getVersion(), version);
      PageHeaderUtil.validatePageHeaders(file.getPath(), footer);
    }
  } finally {
    deleteTableIfExists(outputFile);
  }
}
 
Example #27
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * Ends the file once all blocks have been written and closes the underlying output stream.
 * @param extraMetaData the extra meta data to write in the footer
 * @throws IOException if there is an error while writing
 */
public void end(Map<String, String> extraMetaData) throws IOException {
  state = state.end();
  serializeColumnIndexes(columnIndexes, blocks, out);
  serializeOffsetIndexes(offsetIndexes, blocks, out);
  serializeBloomFilters(bloomFilters, blocks, out);
  LOG.debug("{}: end", out.getPos());
  this.footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks);
  serializeFooter(footer, out);
  out.close();
}
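To show where end() sits in the writer lifecycle, here is a minimal sketch that reuses calls from Examples #10 and #13 (CONF, inputFile, and outputPath are illustrative): row groups from an existing file are appended verbatim, then end() serializes the footer and closes the output stream.

ParquetMetadata footer = ParquetFileReader.readFooter(CONF, inputFile, ParquetMetadataConverter.NO_FILTER);
MessageType schema = footer.getFileMetaData().getSchema();

ParquetFileWriter writer = new ParquetFileWriter(CONF, schema, outputPath, ParquetFileWriter.Mode.CREATE);
writer.start();
try (FSDataInputStream in = inputFile.getFileSystem(CONF).open(inputFile)) {
  writer.appendRowGroups(in, footer.getBlocks(), false);               // copy row groups as-is
}
writer.end(footer.getFileMetaData().getKeyValueMetaData());            // writes the footer, closes the file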
 
Example #28
Source File: ParquetReaderUtility.java    From Bats with Apache License 2.0
/**
 * Map full column paths to all ColumnDescriptors in file schema
 *
 * @param footer Parquet file metadata
 * @return       column full path to ColumnDescriptor object map
 */
public static Map<String, ColumnDescriptor> getColNameToColumnDescriptorMapping(ParquetMetadata footer) {
  Map<String, ColumnDescriptor> colDescMap = new HashMap<>();
  List<ColumnDescriptor> columns = footer.getFileMetaData().getSchema().getColumns();

  for (ColumnDescriptor column : columns) {
    colDescMap.put(getFullColumnPath(column), column);
  }
  return colDescMap;
}
 
Example #29
Source File: IcebergParquetReader.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
public IcebergParquetReader(
  OperatorContext context,
  ParquetReaderFactory readerFactory,
  BatchSchema tableSchema,
  ParquetScanProjectedColumns projectedColumns,
  Map<String, GlobalDictionaryFieldInfo> globalDictionaryFieldInfoMap,
  List<ParquetFilterCondition> filterConditions,
  ParquetProtobuf.ParquetDatasetSplitScanXAttr readEntry,
  FileSystem fs,
  ParquetMetadata footer,
  GlobalDictionaries dictionaries,
  SchemaDerivationHelper schemaHelper,
  boolean vectorize,
  boolean enableDetailedTracing,
  boolean supportsColocatedReads,
  InputStreamProvider inputStreamProvider) {
  this.context = context;
  this.readerFactory = readerFactory;
  this.tableSchema = tableSchema;
  this.projectedColumns = projectedColumns;
  this.globalDictionaryFieldInfoMap = globalDictionaryFieldInfoMap;
  this.filterConditions = filterConditions;
  this.readEntry = readEntry;
  this.fs = fs;
  this.footer = footer;
  this.dictionaries = dictionaries;
  this.schemaHelper = schemaHelper;
  this.vectorize = vectorize;
  this.enableDetailedTracing = enableDetailedTracing;
  this.supportsColocatedReads = supportsColocatedReads;
  this.inputStreamProvider = inputStreamProvider;
}
 
Example #30
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) {
  GlobalMetaData fileMetaData = null;
  for (Footer footer : footers) {
    ParquetMetadata currentMetadata = footer.getParquetMetadata();
    fileMetaData = mergeInto(currentMetadata.getFileMetaData(), fileMetaData, strict);
  }
  return fileMetaData;
}