Java Code Examples for org.apache.parquet.hadoop.ParquetFileReader#open()

The following examples show how to use org.apache.parquet.hadoop.ParquetFileReader#open(). Each example is taken from an open source project; the project, source file, and license are noted in the header above it.
Example 1
Source File: ProtoParquetWriterWithOffset.java    From garmadon with Apache License 2.0
protected void mergeToFinalPath(Path lastAvailableFinalPath, Path finalPath) throws IOException {
    try (ParquetFileReader reader = ParquetFileReader.open(fs.getConf(), lastAvailableFinalPath)) {
        MessageType schema = reader.getFileMetaData().getSchema();
        if (!checkSchemaEquality(schema)) {
            LOGGER.warn("Schema between last available final file ({}) and temp file ({}) are not identical. We can't merge them",
                lastAvailableFinalPath, temporaryHdfsPath);
            moveToFinalPath(temporaryHdfsPath, finalPath);
        } else {
            Path mergedTempFile = new Path(temporaryHdfsPath.toString() + ".merged");

            if (fs.isFile(mergedTempFile)) fs.delete(mergedTempFile, false);

            Map<String, String> existingMetadata = reader.getFileMetaData().getKeyValueMetaData();
            Map<String, String> newMetadata = new HashMap<>(existingMetadata);
            newMetadata.put(LATEST_TIMESTAMP_META_KEY, String.valueOf(latestTimestamp));

            ParquetFileWriter writerPF = new ParquetFileWriter(fs.getConf(), schema, mergedTempFile);
            writerPF.start();
            try (
                ParquetFileReader dest = ParquetFileReader.open(fs.getConf(), lastAvailableFinalPath);
                ParquetFileReader temp = ParquetFileReader.open(fs.getConf(), temporaryHdfsPath)
            ) {
                dest.appendTo(writerPF);
                temp.appendTo(writerPF);
                writerPF.end(newMetadata);
            }

            moveToFinalPath(mergedTempFile, lastAvailableFinalPath);
            try {
                fs.delete(temporaryHdfsPath, false);
                // This file is in a temp folder that should be deleted at exit so we should not throw exception here
            } catch (IOException ignored) {
            }
        }
    }
}
 
Example 2
Source File: TestDictionaryRowGroupFilter.java    From iceberg with Apache License 2.0
@BeforeClass
public static void createInputFile() throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .build()) {
    GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table"));
    // create 20 copies of each record to ensure dictionary-encoding
    for (int copy = 0; copy < 20; copy += 1) {
      // create 50 records
      for (int i = 0; i < 50; i += 1) {
        builder.set("_id", 30 + i); // min=30, max=79, num-nulls=0
        builder.set("_no_stats", TOO_LONG_FOR_STATS); // value longer than 4k will produce no stats
        builder.set("_required", "req"); // required, always non-null
        builder.set("_all_nulls", null); // never non-null
        builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
        builder.set("_no_nulls", ""); // optional, but always non-null
        builder.set("_non_dict", UUID.randomUUID().toString()); // not dictionary-encoded
        appender.add(builder.build());
      }
    }
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);

  ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(inFile));
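// the reader is deliberately not closed here: the dictionary store fetched below appears to read dictionary pages lazily from the open file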

  Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
  ROW_GROUP_METADATA = reader.getRowGroups().get(0);
  PARQUET_SCHEMA = reader.getFileMetaData().getSchema();
  DICTIONARY_STORE = reader.getNextDictionaryReader();

  PARQUET_FILE.deleteOnExit();
}
 
Example 3
Source File: TestMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
@BeforeClass
public static void createInputFile() throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .build()) {
    GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table"));
    // create 50 records
    for (int i = 0; i < 50; i += 1) {
      builder.set("_id", 30 + i); // min=30, max=79, num-nulls=0
      builder.set("_no_stats", TOO_LONG_FOR_STATS); // value longer than 4k will produce no stats
      builder.set("_required", "req"); // required, always non-null
      builder.set("_all_nulls", null); // never non-null
      builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
      builder.set("_no_nulls", ""); // optional, but always non-null
      appender.add(builder.build());
    }
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);
  try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(inFile))) {
    Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
    ROW_GROUP_METADATA = reader.getRowGroups().get(0);
    PARQUET_SCHEMA = reader.getFileMetaData().getSchema();
  }

  PARQUET_FILE.deleteOnExit();
}
 
Example 4
Source File: ParquetReader.java    From iceberg with Apache License 2.0
private static ParquetFileReader newReader(InputFile file, ParquetReadOptions options) {
  try {
    return ParquetFileReader.open(ParquetIO.file(file), options);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to open Parquet file: %s", file.location());
  }
}
 
Example 5
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load Decision Tree model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadDecisionTreeModel(String pathToMdl, LearningEnvironment learningEnvironment) {
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, NodeData> nodes = new TreeMap<>();

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                NodeData nodeData = extractNodeDataFromParquetRow(g);
                nodes.put(nodeData.id, nodeData);
            }
        }
        return buildDecisionTreeModel(nodes);
    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return null;
}
 
Example 6
Source File: ParquetFileLineFetcher.java    From hugegraph-loader with Apache License 2.0
@Override
public void openReader(Readable readable) {
    Path path = new Path(this.source().path());
    try {
        HadoopInputFile file = HadoopInputFile.fromPath(path, this.conf);
        this.reader = ParquetFileReader.open(file);
        this.schema = this.reader.getFooter().getFileMetaData().getSchema();
        this.columnIO = new ColumnIOFactory().getColumnIO(this.schema);
    } catch (IOException e) {
        throw new LoadException("Failed to open parquet reader for '%s'",
                                e, readable);
    }
    this.resetOffset();
}
 
Example 7
Source File: DataLoad.java    From arvo2parquet with MIT License
private static void extractMetaDataFooter(final Path parquetFilePath) throws IOException {
  try (final ParquetFileReader rdr = ParquetFileReader.open(nioPathToInputFile(parquetFilePath))) {
    final ParquetMetadata footer = rdr.getFooter();
    final Path metaDataOutPath = Paths.get(ParquetFileWriter.PARQUET_METADATA_FILE + "_dup.parquet");
    Files.deleteIfExists(metaDataOutPath);
    try (final PositionOutputStream out = nioPathToOutputFile(metaDataOutPath).createOrOverwrite(0)) {
      serializeFooter(footer, out);
    }
  }
}
 
Example 8
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0
@Before
public void setUp() throws Exception {
  reader = ParquetFileReader.open(conf, file);
  ParquetMetadata meta = reader.getFooter();
  ccmd = meta.getBlocks().get(0).getColumns();
  dictionaries = reader.getDictionaryReader(meta.getBlocks().get(0));
}
 
Example 9
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load logistic regression model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLogRegModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readInterceptor(g);
                coefficients = readCoefficients(g);
            }
        }

    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new LogisticRegressionModel(coefficients, interceptor);
}
 
Example 10
Source File: TestParquet.java    From iceberg with Apache License 2.0
@Test
public void testRowGroupSizeConfigurable() throws IOException {
  // Without an explicit writer function
  File parquetFile = generateFileWithTwoRowGroups(null);

  try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(localInput(parquetFile)))) {
    Assert.assertEquals(2, reader.getRowGroups().size());
  }
}
 
Example 11
Source File: ParquetUtil.java    From iceberg with Apache License 2.0
public static Metrics fileMetrics(InputFile file, MetricsConfig metricsConfig) {
  try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(file))) {
    return footerMetrics(reader.getFooter(), metricsConfig);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to read footer of file: %s", file);
  }
}
 
Example 12
Source File: ReadConf.java    From iceberg with Apache License 2.0
private static ParquetFileReader newReader(InputFile file, ParquetReadOptions options) {
  try {
    return ParquetFileReader.open(ParquetIO.file(file), options);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to open Parquet file: %s", file.location());
  }
}
 
Example 13
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load linear regression model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLinRegModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readLinRegInterceptor(g);
                coefficients = readLinRegCoefficients(g);
            }
        }

    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new LinearRegressionModel(coefficients, interceptor);
}
 
Example 14
Source File: ProtoParquetWriterWithOffset.java    From garmadon with Apache License 2.0
private double getLatestCommittedTimestamp() {
    //there are cases for which we won't find a value for the latest committed timestamp
    // - the first time this code goes in, no file has the correct metadata
    // - for a new event type, we have no history too, so no value
    //By using the default value 'now' rather than 0, we prevent firing unnecessary alerts
    //However, if there is an actual problem and the reader never commits, it will eventually fire
    //an alert.
    long defaultValue = System.currentTimeMillis();
    try {
        Optional<Path> latestFileCommitted = getLastestExistingFinalPath();
        if (latestFileCommitted.isPresent()) {
            try (ParquetFileReader pfr = ParquetFileReader.open(fs.getConf(), latestFileCommitted.get())) {
                String timestamp = pfr
                        .getFooter()
                        .getFileMetaData()
                        .getKeyValueMetaData()
                        .getOrDefault(LATEST_TIMESTAMP_META_KEY, String.valueOf(defaultValue));
                return Double.valueOf(timestamp);
            }
        } else {
            return defaultValue;
        }
    } catch (IOException e) {
        LOGGER.warn("Could not get latest existing final path. Defaulting latest committed timestamp to now");
        return defaultValue;
    }
}
 
Example 15
Source File: ProtoParquetWriterWithOffset.java    From garmadon with Apache License 2.0
private boolean checkSchemaEquality(MessageType schema) throws IOException {
    try (ParquetFileReader pfr = ParquetFileReader.open(fs.getConf(), temporaryHdfsPath)) {
        MessageType schema2 = pfr.getFileMetaData().getSchema();

        return schema.equals(schema2);
    }
}
 
Example 16
Source File: TestDictionaryRowGroupFilter.java    From iceberg with Apache License 2.0
@BeforeClass
public static void createInputFile() throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  // build struct field schema
  org.apache.avro.Schema structSchema = AvroSchemaUtil.convert(_structFieldType);

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .build()) {
    GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table"));
    // create 20 copies of each record to ensure dictionary-encoding
    for (int copy = 0; copy < 20; copy += 1) {
      // create 50 records
      for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) {
        builder.set("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0
        builder.set("_no_stats", TOO_LONG_FOR_STATS); // value longer than 4k will produce no stats
        builder.set("_required", "req"); // required, always non-null
        builder.set("_all_nulls", null); // never non-null
        builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
        builder.set("_no_nulls", ""); // optional, but always non-null
        builder.set("_non_dict", UUID.randomUUID().toString()); // not dictionary-encoded

        Record structNotNull = new Record(structSchema);
        structNotNull.put("_int_field", INT_MIN_VALUE + i);
        builder.set("_struct_not_null", structNotNull); // struct with int

        appender.add(builder.build());
      }
    }
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);

  ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(inFile));

  Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
  rowGroupMetadata = reader.getRowGroups().get(0);
  parquetSchema = reader.getFileMetaData().getSchema();
  dictionaryStore = reader.getNextDictionaryReader();

  PARQUET_FILE.deleteOnExit();
}
 
Example 17
Source File: TestParquetMetrics.java    From iceberg with Apache License 2.0
@Override
public int splitCount(InputFile inputFile) throws IOException {
  try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(inputFile))) {
    return reader.getRowGroups().size();
  }
}
 
Example 18
Source File: TestMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
private void createParquetInputFile() throws IOException {
  if (parquetFile.exists()) {
    Assert.assertTrue(parquetFile.delete());
  }

  // build struct field schema
  org.apache.avro.Schema structSchema = AvroSchemaUtil.convert(_structFieldType);

  OutputFile outFile = Files.localOutput(parquetFile);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .build()) {
    GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table"));
    // create 50 records
    for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) {
      builder.set("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0
      builder.set("_no_stats_parquet", TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats
                                                                    // in Parquet
      builder.set("_required", "req"); // required, always non-null
      builder.set("_all_nulls", null); // never non-null
      builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
      builder.set("_no_nulls", ""); // optional, but always non-null
      builder.set("_str", i + "str" + i);

      Record structNotNull = new Record(structSchema);
      structNotNull.put("_int_field", INT_MIN_VALUE + i);
      builder.set("_struct_not_null", structNotNull); // struct with int

      appender.add(builder.build());
    }
  }

  InputFile inFile = Files.localInput(parquetFile);
  try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) {
    Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
    rowGroupMetadata = reader.getRowGroups().get(0);
    parquetSchema = reader.getFileMetaData().getSchema();
  }

  parquetFile.deleteOnExit();
}
 
Example 19
Source File: AbstractParquetScanBatchCreator.java    From Bats with Apache License 2.0
private ParquetMetadata readFooter(Configuration conf, Path path, ParquetReaderConfig readerConfig) throws IOException {
  try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path,
    readerConfig.addCountersToConf(conf)), readerConfig.toReadOptions())) {
    return reader.getFooter();
  }
}
 
Example 20
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load K-Means model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadKMeansModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector[] centers = null;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final int rows = (int)pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            centers = new DenseVector[rows];

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                // final int clusterIdx = g.getInteger(0, 0);

                Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0);

                final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0);

                centers[i] = new DenseVector(amountOfCoefficients);

                for (int j = 0; j < amountOfCoefficients; j++) {
                    double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0);
                    centers[i].set(j, coefficient);
                }
            }
        }

    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new KMeansModel(centers, new EuclideanDistance());
}