Java Code Examples for org.apache.parquet.hadoop.ParquetFileReader

The following examples show how to use org.apache.parquet.hadoop.ParquetFileReader. They are extracted from open source projects; the source project, file, and license are listed above each example.
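
Before the project examples, here is a minimal, self-contained sketch of the core pattern most of them build on: open a reader, inspect the footer, and iterate row groups. This is a sketch rather than code from any of the projects below; it assumes a parquet-mr release that provides ParquetFileReader.open(InputFile, ParquetReadOptions), and the class name and command-line path argument are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;

public class ParquetFileReaderSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder path; point this at any Parquet file.
    Path path = new Path(args[0]);
    Configuration conf = new Configuration();

    try (ParquetFileReader reader = ParquetFileReader.open(
        HadoopInputFile.fromPath(path, conf), ParquetReadOptions.builder().build())) {
      // The footer carries the schema and the key/value metadata.
      MessageType schema = reader.getFooter().getFileMetaData().getSchema();
      System.out.println("schema:\n" + schema);

      // Row groups are loaded lazily, one readNextRowGroup() call at a time.
      PageReadStore rowGroup;
      while ((rowGroup = reader.readNextRowGroup()) != null) {
        System.out.println("row group: " + rowGroup.getRowCount() + " rows");
      }
    }
  }
}

Only the footer is read up front; data pages are fetched lazily by readNextRowGroup(), which is why several examples below can inspect schemas and statistics without touching the data at all.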
Example 1
Source Project: parquet-mr   Source File: TransCompressionCommand.java    License: Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(input != null && output != null,
    "Both input and output parquet file paths are required.");

  Preconditions.checkArgument(codec != null,
    "The codec cannot be null");

  Path inPath = new Path(input);
  Path outPath = new Path(output);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(getConf(), inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(getConf(), schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, getConf()), HadoopReadOptions.builder(getConf()).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
  return 0;
}
 
Example 2
Source Project: garmadon   Source File: HdfsOffsetComputer.java    License: Apache License 2.0
protected Long getMaxOffset(Map<String, FinalEventPartitionFile> dateFinalEventPartitionFile) {
    // Get max offset from all files for a partition
    return dateFinalEventPartitionFile
        .values()
        .stream()
        .flatMap(finalEventPartitionFile -> {
            try (ParquetFileReader pFR = ParquetFileReader.open(fs.getConf(), finalEventPartitionFile.getFilePath())) {
                return pFR.getFooter().getBlocks().stream();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        })
        .map(b -> b.getColumns().stream()
            .filter(column -> Arrays.stream(column.getPath().toArray()).allMatch(path -> path.equals("kafka_offset")))
            .findFirst()
            .map(ColumnChunkMetaData::getStatistics)
            .map(Statistics::genericGetMax)
            .map(Long.class::cast)
            .orElse(NO_OFFSET))
        .mapToLong(Long::longValue)
        .max()
        .orElse(NO_OFFSET);
}
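
A note on the example above: it never touches data pages. The maximum kafka_offset is taken from the per-column-chunk statistics stored in the footer's block metadata, so opening the reader and calling getFooter() is all the I/O required.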
 
Example 3
Source Project: iceberg   Source File: TestMetricsRowGroupFilterTypes.java    License: Apache License 2.0
public void createParquetInputFile(List<Record> records) throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericParquetWriter::buildWriter)
      .build()) {
    appender.addAll(records);
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);
  try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) {
    Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
    rowGroupMetadata = reader.getRowGroups().get(0);
    parquetSchema = reader.getFileMetaData().getSchema();
  }

  PARQUET_FILE.deleteOnExit();
}
 
Example 4
Source Project: pxf   Source File: ParquetResolverTest.java    License: Apache License 2.0
@SuppressWarnings("deprecation")
private List<Group> readParquetFile(String file, long expectedSize, MessageType schema) throws IOException {
    List<Group> result = new ArrayList<>();
    String parquetFile = Objects.requireNonNull(getClass().getClassLoader().getResource("parquet/" + file)).getPath();
    Path path = new Path(parquetFile);

    // try-with-resources ensures the reader is closed even if reading fails
    try (ParquetFileReader fileReader = new ParquetFileReader(new Configuration(), path, ParquetMetadataConverter.NO_FILTER)) {
        PageReadStore rowGroup;
        while ((rowGroup = fileReader.readNextRowGroup()) != null) {
            MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
            RecordReader<Group> recordReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));
            long rowCount = rowGroup.getRowCount();
            for (long i = 0; i < rowCount; i++) {
                result.add(recordReader.read());
            }
        }
    }
    assertEquals(expectedSize, result.size());
    return result;
}
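
The ParquetFileReader(Configuration, Path, MetadataFilter) constructor used above is deprecated (hence the @SuppressWarnings("deprecation") on the method); in current parquet-mr releases the equivalent is ParquetFileReader.open(HadoopInputFile.fromPath(path, conf)), as used in several later examples.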
 
Example 5
Source Project: parquet-mr   Source File: Schemas.java    License: Apache License 2.0
public static Schema fromParquet(Configuration conf, URI location) throws IOException {
  Path path = new Path(location);
  FileSystem fs = path.getFileSystem(conf);

  ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), path);

  String schemaString = footer.getFileMetaData()
      .getKeyValueMetaData().get("parquet.avro.schema");
  if (schemaString == null) {
    // try the older property
    schemaString = footer.getFileMetaData()
        .getKeyValueMetaData().get("avro.schema");
  }

  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter()
        .convert(footer.getFileMetaData().getSchema());
  }
}
 
Example 6
Source Project: incubator-pinot   Source File: ParquetUtils.java    License: Apache License 2.0
/**
 * Returns the schema for the given Parquet file path.
 */
public static Schema getParquetSchema(Path path)
    throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(getConfiguration(), path, ParquetMetadataConverter.NO_FILTER);
  Map<String, String> metaData = footer.getFileMetaData().getKeyValueMetaData();
  String schemaString = metaData.get("parquet.avro.schema");
  if (schemaString == null) {
    // Try the older property
    schemaString = metaData.get("avro.schema");
  }
  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter().convert(footer.getFileMetaData().getSchema());
  }
}
 
Example 7
Source Project: parquet-mr   Source File: SchemaCommand.java    License: Apache License 2.0
private String getParquetSchema(String source) throws IOException {
  Formats.Format format;
  try (SeekableInput in = openSeekable(source)) {
    format = Formats.detectFormat((InputStream) in);
    in.seek(0);

    switch (format) {
      case PARQUET:
        try (ParquetFileReader reader = new ParquetFileReader(
            getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER)) {
          return reader.getFileMetaData().getSchema().toString();
        }
      default:
        throw new IllegalArgumentException(String.format(
            "Could not get a Parquet schema for format %s: %s", format, source));
    }
  }
}
 
Example 8
Source Project: kite   Source File: Schemas.java    License: Apache License 2.0
public static Schema fromParquet(FileSystem fs, Path location) throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(fs.getConf(), location);

  String schemaString = footer.getFileMetaData()
      .getKeyValueMetaData().get("parquet.avro.schema");
  if (schemaString == null) {
    // try the older property
    schemaString = footer.getFileMetaData()
        .getKeyValueMetaData().get("avro.schema");
  }

  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter()
        .convert(footer.getFileMetaData().getSchema());
  }
}
 
Example 9
Source Project: tajo   Source File: ParquetReader.java    License: Apache License 2.0
private ParquetReader(Configuration conf,
                      Path file,
                      ReadSupport<T> readSupport,
                      Filter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = checkNotNull(filter, "filter");
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file, HiddenFileFilter.INSTANCE));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses, false);
  this.footersIterator = footers.iterator();

  for (Footer footer : footers) {
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      totalRowCount += block.getRowCount();
    }
  }
}
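
readAllFootersInParallelUsingSummaryFiles first consults directory-level summary files (_metadata) when they exist and falls back to reading each file's footer in parallel otherwise. Note that this Footer-based API is deprecated in recent parquet-mr releases.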
 
Example 10
Source Project: parquet-mr   Source File: ShowMetaCommand.java    License: Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  boolean showOriginalTypes = options.hasOption('o');

  Configuration conf = new Configuration();
  Path inputPath = new Path(input);
  FileStatus inputFileStatus = inputPath.getFileSystem(conf).getFileStatus(inputPath);
  List<Footer> footers = ParquetFileReader.readFooters(conf, inputFileStatus, false);

  PrettyPrintWriter out = PrettyPrintWriter.stdoutPrettyPrinter()
                                           .withAutoColumn()
                                           .withWhitespaceHandler(WhiteSpaceHandler.COLLAPSE_WHITESPACE)
                                           .withColumnPadding(1)
                                           .build();

  for (Footer f : footers) {
    out.format("file: %s%n", f.getFile());
    MetadataUtils.showDetails(out, f.getParquetMetadata(), showOriginalTypes);
    out.flushColumns();
  }
}
 
Example 11
Source Project: parquet-mr   Source File: TransCompressionCommand.java    License: Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  List<String> args = options.getArgList();
  Path inPath = new Path(args.get(0));
  Path outPath = new Path(args.get(1));
  CompressionCodecName codecName = CompressionCodecName.valueOf(args.get(2));

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
 
Example 12
Source Project: parquet-mr   Source File: TestPruneColumnsCommand.java    License: Apache License 2.0
@Test
public void testPruneMultiColumns() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove columns
  String[] cargs = {inputFile, outputFile, "Name", "Gender"};
  executeCommandLine(cargs);

  // Verify the schema is unchanged for the columns that were not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(2, fields.size());
  assertEquals("DocId", fields.get(0).getName());
  assertEquals("Links", fields.get(1).getName());
  List<Type> subFields = fields.get(1).asGroupType().getFields();
  assertEquals(2, subFields.size());
  assertEquals("Backward", subFields.get(0).getName());
  assertEquals("Forward", subFields.get(1).getName());

  // Verify the data is unchanged for the columns that were not pruned
  List<String> prunePaths = Arrays.asList("Name", "Gender");
  validateColumns(inputFile, prunePaths);
}
 
Example 13
@Override
public List<IParquetInputField> readSchema( String file ) throws Exception {
  return inClassloader( () -> {
    Configuration conf = job.getConfiguration();
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, conf );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), conf );
    FileStatus fileStatus = fs.getFileStatus( filePath );
    List<Footer> footers = ParquetFileReader.readFooters( conf, fileStatus, true );
    if ( footers.isEmpty() ) {
      return new ArrayList<>();
    } else {
      ParquetMetadata meta = footers.get( 0 ).getParquetMetadata();
      MessageType schema = meta.getFileMetaData().getSchema();
      return ParquetConverter.buildInputFields( schema );
    }
  } );
}
 
Example 14
Source Project: parquet-mr   Source File: CompressionConveterTest.java    License: Apache License 2.0
private void convertCompression(Configuration conf, String inputFile, String outputFile, String codec) throws IOException {
  Path inPath = new Path(inputFile);
  Path outPath = new Path(outputFile);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema, metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
 
Example 15
Source Project: parquet-mr   Source File: TestStatistics.java    License: Apache License 2.0
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  try (ParquetFileReader reader = new ParquetFileReader(configuration,
      metadata.getFileMetaData(),
      super.fsPath,
      metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns())) {
    PageStatsValidator validator = new PageStatsValidator();

    PageReadStore pageReadStore;
    while ((pageReadStore = reader.readNextRowGroup()) != null) {
      validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
    }
  }
}
 
Example 16
Source Project: Bats   Source File: FooterGatherer.java    License: Apache License 2.0
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException {
  final List<TimedCallable<Footer>> readers = new ArrayList<>();
  final List<Footer> foundFooters = new ArrayList<>();
  for (FileStatus status : statuses) {
    if (status.isDirectory()) {
      // first we check for summary file.
      FileSystem fs = status.getPath().getFileSystem(conf);

      final Path summaryPath = new Path(status.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
      if (fs.exists(summaryPath)) {
        FileStatus summaryStatus = fs.getFileStatus(summaryPath);
        foundFooters.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus));
        continue;
      }

      // otherwise, handle the directory's contents as normal files.
      for (FileStatus inStatus : DrillFileSystemUtil.listFiles(fs, status.getPath(), false)) {
        readers.add(new FooterReader(conf, inStatus));
      }
    } else {
      readers.add(new FooterReader(conf, status));
    }

  }
  if (!readers.isEmpty()) {
    foundFooters.addAll(TimedCallable.run("Fetch Parquet Footers", logger, readers, parallelism));
  }

  return foundFooters;
}
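
ParquetFileWriter.PARQUET_METADATA_FILE is the _metadata summary file that some writers place alongside the data files; when it is present, a single readSummaryFile call yields the footers for every file in the directory without opening the files individually.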
 
Example 17
Source Project: flink   Source File: ParquetRecordReader.java    License: Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration) {
	this.reader = reader;
	FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
	// real schema of parquet file
	this.fileSchema = parquetFileMetadata.getSchema();
	Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
	ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
		configuration, toSetMultiMap(fileMetadata), readSchema));

	this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
	this.recordMaterializer = readSupport.prepareForRead(
		configuration, fileMetadata, readSchema, readContext);
	this.numTotalRecords = reader.getRecordCount();
}
 
Example 18
Source Project: flink   Source File: ParquetInputFormat.java    License: Apache License 2.0
@Override
public void open(FileInputSplit split) throws IOException {
	// reset the flag when open a new split
	this.skipThisSplit = false;
	org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration();
	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(split.getPath().toUri()), configuration);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);
	MessageType fileSchema = fileReader.getFileMetaData().getSchema();
	MessageType readSchema = getReadSchema(fileSchema, split.getPath());
	if (skipThisSplit) {
		LOG.warn(String.format(
			"Skipping the file split [%s] because the file schema does not match the expected read schema",
			split.getPath().toString()));
	} else {
		this.parquetRecordReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema,
			filterPredicate == null ? FilterCompat.NOOP : FilterCompat.get(filterPredicate));
		this.parquetRecordReader.initialize(fileReader, configuration);
		this.parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord);

		if (this.recordConsumed == null) {
			this.recordConsumed = getRuntimeContext().getMetricGroup().counter("parquet-records-consumed");
		}

		LOG.debug(String.format("Open ParquetInputFormat with FileInputSplit [%s]", split.getPath().toString()));
	}
}
 
Example 19
Source Project: flink   Source File: ParquetRecordReaderTest.java    License: Apache License 2.0
@Test
public void testReadSimpleGroup() throws IOException {
	Long[] array = {1L};
	GenericData.Record record = new GenericRecordBuilder(SIMPLE_SCHEMA)
		.set("bar", "test")
		.set("foo", 32L)
		.set("arr", array).build();

	Path path = createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(3, row.getArity());
	assertEquals(32L, row.getField(0));
	assertEquals("test", row.getField(1));
	assertArrayEquals(array, (Long[]) row.getField(2));
	assertTrue(rowReader.reachEnd());
}
 
Example 20
Source Project: flink   Source File: ParquetRecordReaderTest.java    License: Apache License 2.0
@Test
public void testReadMultipleSimpleGroup() throws IOException {
	Long[] array = {1L};

	List<IndexedRecord> records = new ArrayList<>();
	for (int i = 0; i < 100; i++) {
		GenericData.Record record = new GenericRecordBuilder(SIMPLE_SCHEMA)
			.set("bar", "test")
			.set("foo", i)
			.set("arr", array).build();
		records.add(record);
	}

	Path path = createTempParquetFile(tempRoot.getRoot(), SIMPLE_SCHEMA, records);
	MessageType readSchema = (new AvroSchemaConverter()).convert(SIMPLE_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	for (long i = 0; i < 100; i++) {
		assertFalse(rowReader.reachEnd());
		Row row = rowReader.nextRecord();
		assertEquals(3, row.getArity());
		assertEquals(i, row.getField(0));
		assertEquals("test", row.getField(1));
		assertArrayEquals(array, (Long[]) row.getField(2));
	}

	assertTrue(rowReader.reachEnd());
}
 
Example 21
Source Project: flink   Source File: ParquetRecordReaderTest.java    License: Apache License 2.0
@Test
public void testReadNestedGroup() throws IOException {
	Schema schema = unWrapSchema(NESTED_SCHEMA.getField("bar").schema());
	GenericData.Record barRecord = new GenericRecordBuilder(schema)
		.set("spam", 31L).build();

	GenericData.Record record = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("foo", 32L)
		.set("bar", barRecord)
		.build();

	Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(7, row.getArity());
	assertEquals(32L, row.getField(0));
	assertEquals(31L, ((Row) row.getField(2)).getField(0));
	assertTrue(rowReader.reachEnd());
}
 
Example 22
Source Project: flink   Source File: ParquetRecordReaderTest.java    License: Apache License 2.0
@Test
public void testMapGroup() throws IOException {
	Preconditions.checkState(unWrapSchema(NESTED_SCHEMA.getField("spamMap").schema())
		.getType().equals(Schema.Type.MAP));
	ImmutableMap.Builder<String, String> map = ImmutableMap.builder();
	map.put("testKey", "testValue");

	GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("foo", 32L)
		.set("spamMap", map.build())
		.build();

	Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(7, row.getArity());

	assertEquals(32L, row.getField(0));
	Map<?, ?> result = (Map<?, ?>) row.getField(1);
	assertEquals(result.get("testKey").toString(), "testValue");
	assertTrue(rowReader.reachEnd());
}
 
Example 23
Source Project: garmadon   Source File: ProtoParquetWriterWithOffset.java    License: Apache License 2.0
protected void mergeToFinalPath(Path lastAvailableFinalPath, Path finalPath) throws IOException {
    try (ParquetFileReader reader = ParquetFileReader.open(fs.getConf(), lastAvailableFinalPath)) {
        MessageType schema = reader.getFileMetaData().getSchema();
        if (!checkSchemaEquality(schema)) {
            LOGGER.warn("Schema between last available final file ({}) and temp file ({}) are not identical. We can't merge them",
                lastAvailableFinalPath, temporaryHdfsPath);
            moveToFinalPath(temporaryHdfsPath, finalPath);
        } else {
            Path mergedTempFile = new Path(temporaryHdfsPath.toString() + ".merged");

            if (fs.isFile(mergedTempFile)) fs.delete(mergedTempFile, false);

            Map<String, String> existingMetadata = reader.getFileMetaData().getKeyValueMetaData();
            Map<String, String> newMetadata = new HashMap<>(existingMetadata);
            newMetadata.put(LATEST_TIMESTAMP_META_KEY, String.valueOf(latestTimestamp));

            ParquetFileWriter writerPF = new ParquetFileWriter(fs.getConf(), schema, mergedTempFile);
            writerPF.start();
            try (
                ParquetFileReader dest = ParquetFileReader.open(fs.getConf(), lastAvailableFinalPath);
                ParquetFileReader temp = ParquetFileReader.open(fs.getConf(), temporaryHdfsPath)
            ) {
                dest.appendTo(writerPF);
                temp.appendTo(writerPF);
                writerPF.end(newMetadata);
            }

            moveToFinalPath(mergedTempFile, lastAvailableFinalPath);
            try {
                // This file is in a temp folder that is deleted at exit, so we deliberately
                // swallow a failure to delete it here rather than throw.
                fs.delete(temporaryHdfsPath, false);
            } catch (IOException ignored) {
            }
        }
    }
}
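
Note that the merge above never decodes records: ParquetFileReader.appendTo hands each reader's row groups to the ParquetFileWriter, which copies the already-encoded pages verbatim, so the cost of the merge is essentially a byte copy plus writing a new footer.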
 
Example 24
Source Project: garmadon   Source File: ProtoParquetWriterWithOffset.java    License: Apache License 2.0
private boolean checkSchemaEquality(MessageType schema) throws IOException {
    try (ParquetFileReader pfr = ParquetFileReader.open(fs.getConf(), temporaryHdfsPath)) {
        MessageType schema2 = pfr.getFileMetaData().getSchema();

        return schema.equals(schema2);
    }
}
 
Example 25
Source Project: garmadon   Source File: ProtoParquetWriterWithOffset.java    License: Apache License 2.0
private double getLatestCommittedTimestamp() {
    // There are cases in which we won't find a value for the latest committed timestamp:
    // - the first time this code runs, no file has the correct metadata yet
    // - for a new event type, there is no history either, so no value
    // By using the default value 'now' rather than 0, we avoid firing unnecessary alerts.
    // However, if there is an actual problem and the reader never commits, an alert will
    // eventually fire.
    long defaultValue = System.currentTimeMillis();
    try {
        Optional<Path> latestFileCommitted = getLastestExistingFinalPath();
        if (latestFileCommitted.isPresent()) {
            try (ParquetFileReader pfr = ParquetFileReader.open(fs.getConf(), latestFileCommitted.get())) {
                String timestamp = pfr
                        .getFooter()
                        .getFileMetaData()
                        .getKeyValueMetaData()
                        .getOrDefault(LATEST_TIMESTAMP_META_KEY, String.valueOf(defaultValue));
                return Double.parseDouble(timestamp);
            }
        } else {
            return defaultValue;
        }
    } catch (IOException e) {
        LOGGER.warn("Could not get latest existing final path; defaulting latest committed timestamp to now");
        return defaultValue;
    }
}
 
Example 26
Source Project: garmadon   Source File: ProtoParquetWriterWithOffsetTest.java    License: Apache License 2.0
private void checkFileLatestCommittedTimestamp(Path p, long timestamp) throws IOException {
    try (ParquetFileReader reader = new ParquetFileReader(
        HadoopInputFile.fromPath(p, new Configuration()),
        ParquetReadOptions.builder().build()
    )) {
        String actualTimestamp = reader.getFooter().getFileMetaData()
            .getKeyValueMetaData().get(ProtoParquetWriterWithOffset.LATEST_TIMESTAMP_META_KEY);
        assertThat(actualTimestamp, is(String.valueOf(timestamp)));
    }
}
 
Example 27
Source Project: iceberg   Source File: ReadConf.java    License: Apache License 2.0
ParquetFileReader reader() {
  if (reader != null) {
    reader.setRequestedSchema(projection);
    return reader;
  }

  ParquetFileReader newReader = newReader(file, options);
  newReader.setRequestedSchema(projection);
  return newReader;
}
 
Example 28
Source Project: iceberg   Source File: ReadConf.java    License: Apache License 2.0
private static ParquetFileReader newReader(InputFile file, ParquetReadOptions options) {
  try {
    return ParquetFileReader.open(ParquetIO.file(file), options);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to open Parquet file: %s", file.location());
  }
}
 
Example 29
Source Project: iceberg   Source File: ParquetUtil.java    License: Apache License 2.0
public static Metrics fileMetrics(InputFile file, MetricsConfig metricsConfig) {
  try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(file))) {
    return footerMetrics(reader.getFooter(), metricsConfig);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to read footer of file: %s", file);
  }
}
 
Example 30
Source Project: iceberg   Source File: TestParquet.java    License: Apache License 2.0
@Test
public void testRowGroupSizeConfigurable() throws IOException {
  // Without an explicit writer function
  File parquetFile = generateFileWithTwoRowGroups(null);

  try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(localInput(parquetFile)))) {
    Assert.assertEquals(2, reader.getRowGroups().size());
  }
}