Java Code Examples for org.apache.parquet.hadoop.ParquetFileReader#open()

The following examples show how to use org.apache.parquet.hadoop.ParquetFileReader#open(). Each example is taken from an open source project; the project, source file, and license are noted in the header above it.
Example 1
Source File: ProtoParquetWriterWithOffset.java    From garmadon with Apache License 2.0
protected void mergeToFinalPath(Path lastAvailableFinalPath, Path finalPath) throws IOException {
    try (ParquetFileReader reader = ParquetFileReader.open(fs.getConf(), lastAvailableFinalPath)) {
        MessageType schema = reader.getFileMetaData().getSchema();
        if (!checkSchemaEquality(schema)) {
            LOGGER.warn("Schema between last available final file ({}) and temp file ({}) are not identical. We can't merge them",
                lastAvailableFinalPath, temporaryHdfsPath);
            moveToFinalPath(temporaryHdfsPath, finalPath);
        } else {
            Path mergedTempFile = new Path(temporaryHdfsPath.toString() + ".merged");

            if (fs.isFile(mergedTempFile)) fs.delete(mergedTempFile, false);

            Map<String, String> existingMetadata = reader.getFileMetaData().getKeyValueMetaData();
            Map<String, String> newMetadata = new HashMap<>(existingMetadata);
            newMetadata.put(LATEST_TIMESTAMP_META_KEY, String.valueOf(latestTimestamp));

            ParquetFileWriter writerPF = new ParquetFileWriter(fs.getConf(), schema, mergedTempFile);
            writerPF.start();
            try (
                ParquetFileReader dest = ParquetFileReader.open(fs.getConf(), lastAvailableFinalPath);
                ParquetFileReader temp = ParquetFileReader.open(fs.getConf(), temporaryHdfsPath)
            ) {
                dest.appendTo(writerPF);
                temp.appendTo(writerPF);
                writerPF.end(newMetadata);
            }

            moveToFinalPath(mergedTempFile, lastAvailableFinalPath);
            try {
                fs.delete(temporaryHdfsPath, false);
                // This file is in a temp folder that should be deleted at exit so we should not throw exception here
            } catch (IOException ignored) {
            }
        }
    }
}
 
Example 2
Source File: TestDictionaryRowGroupFilter.java    From iceberg with Apache License 2.0
@BeforeClass
public static void createInputFile() throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .build()) {
    GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table"));
    // create 20 copies of each record to ensure dictionary-encoding
    for (int copy = 0; copy < 20; copy += 1) {
      // create 50 records
      for (int i = 0; i < 50; i += 1) {
        builder.set("_id", 30 + i); // min=30, max=79, num-nulls=0
        builder.set("_no_stats", TOO_LONG_FOR_STATS); // value longer than 4k will produce no stats
        builder.set("_required", "req"); // required, always non-null
        builder.set("_all_nulls", null); // never non-null
        builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
        builder.set("_no_nulls", ""); // optional, but always non-null
        builder.set("_non_dict", UUID.randomUUID().toString()); // not dictionary-encoded
        appender.add(builder.build());
      }
    }
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);

  ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(inFile));
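// the reader is deliberately not closed here: the dictionary store fetched below appears to read dictionary pages lazily from the open file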

  Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
  ROW_GROUP_METADATA = reader.getRowGroups().get(0);
  PARQUET_SCHEMA = reader.getFileMetaData().getSchema();
  DICTIONARY_STORE = reader.getNextDictionaryReader();

  PARQUET_FILE.deleteOnExit();
}
 
Example 3
Source File: TestMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
@BeforeClass
public static void createInputFile() throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .build()) {
    GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table"));
    // create 50 records
    for (int i = 0; i < 50; i += 1) {
      builder.set("_id", 30 + i); // min=30, max=79, num-nulls=0
      builder.set("_no_stats", TOO_LONG_FOR_STATS); // value longer than 4k will produce no stats
      builder.set("_required", "req"); // required, always non-null
      builder.set("_all_nulls", null); // never non-null
      builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
      builder.set("_no_nulls", ""); // optional, but always non-null
      appender.add(builder.build());
    }
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);
  try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(inFile))) {
    Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
    ROW_GROUP_METADATA = reader.getRowGroups().get(0);
    PARQUET_SCHEMA = reader.getFileMetaData().getSchema();
  }

  PARQUET_FILE.deleteOnExit();
}
 
Example 4
Source File: ParquetReader.java    From iceberg with Apache License 2.0
private static ParquetFileReader newReader(InputFile file, ParquetReadOptions options) {
  try {
    return ParquetFileReader.open(ParquetIO.file(file), options);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to open Parquet file: %s", file.location());
  }
}
 
Example 5
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load Decision Tree model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadDecisionTreeModel(String pathToMdl, LearningEnvironment learningEnvironment) {
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, NodeData> nodes = new TreeMap<>();

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                NodeData nodeData = extractNodeDataFromParquetRow(g);
                nodes.put(nodeData.id, nodeData);
            }
        }
        return buildDecisionTreeModel(nodes);
    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return null;
}
 
Example 6
Source File: ParquetFileLineFetcher.java    From hugegraph-loader with Apache License 2.0
@Override
public void openReader(Readable readable) {
    Path path = new Path(this.source().path());
    try {
        HadoopInputFile file = HadoopInputFile.fromPath(path, this.conf);
        this.reader = ParquetFileReader.open(file);
        this.schema = this.reader.getFooter().getFileMetaData().getSchema();
        this.columnIO = new ColumnIOFactory().getColumnIO(this.schema);
    } catch (IOException e) {
        throw new LoadException("Failed to open parquet reader for '%s'",
                                e, readable);
    }
    this.resetOffset();
}
 
Example 7
Source File: DataLoad.java    From arvo2parquet with MIT License
private static void extractMetaDataFooter(final Path parquetFilePath) throws IOException {
  try (final ParquetFileReader rdr = ParquetFileReader.open(nioPathToInputFile(parquetFilePath))) {
    final ParquetMetadata footer = rdr.getFooter();
    final Path metaDataOutPath = Paths.get(ParquetFileWriter.PARQUET_METADATA_FILE + "_dup.parquet");
    Files.deleteIfExists(metaDataOutPath);
    try (final PositionOutputStream out = nioPathToOutputFile(metaDataOutPath).createOrOverwrite(0)) {
      serializeFooter(footer, out);
    }
  }
}
 
Example 8
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0
@Before
public void setUp() throws Exception {
  reader = ParquetFileReader.open(conf, file);
  ParquetMetadata meta = reader.getFooter();
  ccmd = meta.getBlocks().get(0).getColumns();
  dictionaries = reader.getDictionaryReader(meta.getBlocks().get(0));
}
 
Example 9
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load logistic regression model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLogRegModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readInterceptor(g);
                coefficients = readCoefficients(g);
            }
        }

    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new LogisticRegressionModel(coefficients, interceptor);
}
 
Example 10
Source File: TestParquet.java    From iceberg with Apache License 2.0
@Test
public void testRowGroupSizeConfigurable() throws IOException {
  // Without an explicit writer function
  File parquetFile = generateFileWithTwoRowGroups(null);

  try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(localInput(parquetFile)))) {
    Assert.assertEquals(2, reader.getRowGroups().size());
  }
}
 
Example 11
Source File: ParquetUtil.java    From iceberg with Apache License 2.0
public static Metrics fileMetrics(InputFile file, MetricsConfig metricsConfig) {
  try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(file))) {
    return footerMetrics(reader.getFooter(), metricsConfig);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to read footer of file: %s", file);
  }
}
 
Example 12
Source File: ReadConf.java    From iceberg with Apache License 2.0
private static ParquetFileReader newReader(InputFile file, ParquetReadOptions options) {
  try {
    return ParquetFileReader.open(ParquetIO.file(file), options);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to open Parquet file: %s", file.location());
  }
}
 
Example 13
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load linear regression model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLinRegModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readLinRegInterceptor(g);
                coefficients = readLinRegCoefficients(g);
            }
        }

    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new LinearRegressionModel(coefficients, interceptor);
}
 
Example 14
Source File: ProtoParquetWriterWithOffset.java    From garmadon with Apache License 2.0
private double getLatestCommittedTimestamp() {
    //there are cases for which we won't find a value for the latest committed timestamp
    // - the first time this code goes in, no file has the correct metadata
    // - for a new event type, we have no history too, so no value
    //By using the default value 'now' rather than 0, we prevent firing unnecessary alerts
    //However, if there is an actual problem and the reader never commits, it will eventually fire
    //an alert.
    long defaultValue = System.currentTimeMillis();
    try {
        Optional<Path> latestFileCommitted = getLastestExistingFinalPath();
        if (latestFileCommitted.isPresent()) {
            try (ParquetFileReader pfr = ParquetFileReader.open(fs.getConf(), latestFileCommitted.get())) {
                String timestamp = pfr
                        .getFooter()
                        .getFileMetaData()
                        .getKeyValueMetaData()
                        .getOrDefault(LATEST_TIMESTAMP_META_KEY, String.valueOf(defaultValue));
                return Double.valueOf(timestamp);
            }
        } else {
            return defaultValue;
        }
    } catch (IOException e) {
        LOGGER.warn("Could not get latest existing final path. Defaulting latest committed timestamp to now");
        return defaultValue;
    }
}
 
Example 15
Source File: ProtoParquetWriterWithOffset.java    From garmadon with Apache License 2.0
private boolean checkSchemaEquality(MessageType schema) throws IOException {
    try (ParquetFileReader pfr = ParquetFileReader.open(fs.getConf(), temporaryHdfsPath)) {
        MessageType schema2 = pfr.getFileMetaData().getSchema();

        return schema.equals(schema2);
    }
}
 
Example 16
Source File: TestDictionaryRowGroupFilter.java    From iceberg with Apache License 2.0
@BeforeClass
public static void createInputFile() throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  // build struct field schema
  org.apache.avro.Schema structSchema = AvroSchemaUtil.convert(_structFieldType);

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .build()) {
    GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table"));
    // create 20 copies of each record to ensure dictionary-encoding
    for (int copy = 0; copy < 20; copy += 1) {
      // create 50 records
      for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) {
        builder.set("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0
        builder.set("_no_stats", TOO_LONG_FOR_STATS); // value longer than 4k will produce no stats
        builder.set("_required", "req"); // required, always non-null
        builder.set("_all_nulls", null); // never non-null
        builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
        builder.set("_no_nulls", ""); // optional, but always non-null
        builder.set("_non_dict", UUID.randomUUID().toString()); // not dictionary-encoded

        Record structNotNull = new Record(structSchema);
        structNotNull.put("_int_field", INT_MIN_VALUE + i);
        builder.set("_struct_not_null", structNotNull); // struct with int

        appender.add(builder.build());
      }
    }
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);

  ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(inFile));

  Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
  rowGroupMetadata = reader.getRowGroups().get(0);
  parquetSchema = reader.getFileMetaData().getSchema();
  dictionaryStore = reader.getNextDictionaryReader();

  PARQUET_FILE.deleteOnExit();
}
 
Example 17
Source File: TestParquetMetrics.java    From iceberg with Apache License 2.0
@Override
public int splitCount(InputFile inputFile) throws IOException {
  try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(inputFile))) {
    return reader.getRowGroups().size();
  }
}
 
Example 18
Source File: TestMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
private void createParquetInputFile() throws IOException {
  if (parquetFile.exists()) {
    Assert.assertTrue(parquetFile.delete());
  }

  // build struct field schema
  org.apache.avro.Schema structSchema = AvroSchemaUtil.convert(_structFieldType);

  OutputFile outFile = Files.localOutput(parquetFile);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .build()) {
    GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table"));
    // create 50 records
    for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) {
      builder.set("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0
      builder.set("_no_stats_parquet", TOO_LONG_FOR_STATS_PARQUET); // value longer than 4k will produce no stats
                                                                    // in Parquet
      builder.set("_required", "req"); // required, always non-null
      builder.set("_all_nulls", null); // never non-null
      builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
      builder.set("_no_nulls", ""); // optional, but always non-null
      builder.set("_str", i + "str" + i);

      Record structNotNull = new Record(structSchema);
      structNotNull.put("_int_field", INT_MIN_VALUE + i);
      builder.set("_struct_not_null", structNotNull); // struct with int

      appender.add(builder.build());
    }
  }

  InputFile inFile = Files.localInput(parquetFile);
  try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) {
    Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
    rowGroupMetadata = reader.getRowGroups().get(0);
    parquetSchema = reader.getFileMetaData().getSchema();
  }

  parquetFile.deleteOnExit();
}
 
Example 19
Source File: AbstractParquetScanBatchCreator.java    From Bats with Apache License 2.0
private ParquetMetadata readFooter(Configuration conf, Path path, ParquetReaderConfig readerConfig) throws IOException {
  try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path,
    readerConfig.addCountersToConf(conf)), readerConfig.toReadOptions())) {
    return reader.getFooter();
  }
}
 
Example 20
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load K-Means model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadKMeansModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector[] centers = null;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;
        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final int rows = (int)pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            centers = new DenseVector[rows];

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                // final int clusterIdx = g.getInteger(0, 0);

                Group clusterCenterCoeff = g.getGroup(1, 0).getGroup(3, 0);

                final int amountOfCoefficients = clusterCenterCoeff.getFieldRepetitionCount(0);

                centers[i] = new DenseVector(amountOfCoefficients);

                for (int j = 0; j < amountOfCoefficients; j++) {
                    double coefficient = clusterCenterCoeff.getGroup(0, j).getDouble(0, 0);
                    centers[i].set(j, coefficient);
                }
            }
        }

    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new KMeansModel(centers, new EuclideanDistance());
}