org.apache.parquet.column.page.PageReadStore Java Examples

The following examples show how to use org.apache.parquet.column.page.PageReadStore. Each snippet is taken from an open source project; the source file, project, and license are noted above each example.
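Most of the examples below share the same basic pattern: a ParquetFileReader returns one PageReadStore per row group, and a RecordReader built from the file's MessageType materializes the rows in that group. The following sketch shows the pattern in one place; it is assembled for illustration only (the Group-based materialization and the readAllRecords method name are assumptions, not code from any of the listed projects).

private static void readAllRecords(Path file, Configuration conf) throws IOException {
  try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(file, conf))) {
    MessageType schema = reader.getFooter().getFileMetaData().getSchema();
    MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
    PageReadStore pages;  // one PageReadStore per row group
    while ((pages = reader.readNextRowGroup()) != null) {
      RecordReader<Group> recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema));
      long rows = pages.getRowCount();
      for (long i = 0; i < rows; i++) {
        Group record = recordReader.read();  // materialize one record
        // process the record ...
      }
    }
  }
}

Several of the examples instead work below the record layer and pull pages for a single column directly via PageReadStore.getPageReader(ColumnDescriptor).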
Example #1
Source File: TupleConsumerPerfTest.java    From parquet-mr with Apache License 2.0
private static void read(PageReadStore columns, String pigSchemaString, String message) throws ParserException {
    System.out.println(message);
    MessageColumnIO columnIO = newColumnFactory(pigSchemaString);
    TupleReadSupport tupleReadSupport = new TupleReadSupport();
    Map<String, String> pigMetaData = pigMetaData(pigSchemaString);
    MessageType schema = new PigSchemaConverter().convert(Utils.getSchemaFromString(pigSchemaString));
    ReadContext init = tupleReadSupport.init(null, pigMetaData, schema);
    RecordMaterializer<Tuple> recordConsumer = tupleReadSupport.prepareForRead(null, pigMetaData, schema, init);
    RecordReader<Tuple> recordReader = columnIO.getRecordReader(columns, recordConsumer);
    // TODO: put this back
//  if (DEBUG) {
//    recordConsumer = new RecordConsumerLoggingWrapper(recordConsumer);
//  }
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 10000, pigSchemaString);
    read(recordReader, 100000, pigSchemaString);
    read(recordReader, 1000000, pigSchemaString);
    System.out.println();
  }
 
Example #2
Source File: ParquetColumnarRowSplitReader.java    From flink with Apache License 2.0
private void readNextRowGroup() throws IOException {
	PageReadStore pages = reader.readNextRowGroup();
	if (pages == null) {
		throw new IOException("expecting more rows but reached last block. Read "
				+ rowsReturned + " out of " + totalRowCount);
	}
	List<ColumnDescriptor> columns = requestedSchema.getColumns();
	columnReaders = new AbstractColumnReader[columns.size()];
	for (int i = 0; i < columns.size(); ++i) {
		columnReaders[i] = createColumnReader(
				utcTimestamp,
				selectedTypes[i],
				columns.get(i),
				pages.getPageReader(columns.get(i)));
	}
	totalCountLoadedSoFar += pages.getRowCount();
}
 
Example #3
Source File: ParquetReader.java    From iceberg with Apache License 2.0
private void advance() {
  while (shouldSkip[nextRowGroup]) {
    nextRowGroup += 1;
    reader.skipNextRowGroup();
  }

  PageReadStore pages;
  try {
    pages = reader.readNextRowGroup();
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }

  nextRowGroupStart += pages.getRowCount();
  nextRowGroup += 1;

  model.setPageSource(pages);
}
 
Example #4
Source File: ParquetResolverTest.java    From pxf with Apache License 2.0
@SuppressWarnings("deprecation")
private List<Group> readParquetFile(String file, long expectedSize, MessageType schema) throws IOException {
    List<Group> result = new ArrayList<>();
    String parquetFile = Objects.requireNonNull(getClass().getClassLoader().getResource("parquet/" + file)).getPath();
    Path path = new Path(parquetFile);

    ParquetFileReader fileReader = new ParquetFileReader(new Configuration(), path, ParquetMetadataConverter.NO_FILTER);
    PageReadStore rowGroup;
    while ((rowGroup = fileReader.readNextRowGroup()) != null) {
        MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
        RecordReader<Group> recordReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));
        long rowCount = rowGroup.getRowCount();
        for (long i = 0; i < rowCount; i++) {
            result.add(recordReader.read());
        }
    }
    fileReader.close();
    assertEquals(expectedSize, result.size());
    return result;
}
 
Example #5
Source File: ParquetReader.java    From iceberg with Apache License 2.0
private void advance() {
  while (shouldSkip[nextRowGroup]) {
    nextRowGroup += 1;
    reader.skipNextRowGroup();
  }

  PageReadStore pages;
  try {
    pages = reader.readNextRowGroup();
  } catch (IOException e) {
    throw new RuntimeIOException(e);
  }

  nextRowGroupStart += pages.getRowCount();

  model.setPageSource(pages);
}
 
Example #6
Source File: ColumnReadStoreImpl.java    From parquet-mr with Apache License 2.0
/**
 * @param pageReadStore underlying page storage
 * @param recordConverter the user provided converter to materialize records
 * @param schema the schema we are reading
 * @param createdBy writer version string from the Parquet file being read
 */
public ColumnReadStoreImpl(PageReadStore pageReadStore,
                           GroupConverter recordConverter,
                           MessageType schema, String createdBy) {
  super();
  this.pageReadStore = pageReadStore;
  this.recordConverter = recordConverter;
  this.schema = schema;

  ParsedVersion version;
  try {
    version = VersionParser.parse(createdBy);
  } catch (RuntimeException | VersionParseException e) {
    version = null;
  }
  this.writerVersion = version;
}
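For reference, here is a minimal sketch of how a ColumnReadStoreImpl built on a PageReadStore is typically consumed. Only the getColumnReader()/getTotalValueCount() calls appear in Example #7 below; the rest of the loop is an assumption, written for a flat, non-repeated column.

ColumnReadStore crStore = new ColumnReadStoreImpl(pageReadStore, recordConverter, schema, createdBy);
for (ColumnDescriptor desc : schema.getColumns()) {
  ColumnReader columnReader = crStore.getColumnReader(desc);
  long valueCount = columnReader.getTotalValueCount();
  for (long i = 0; i < valueCount; i++) {
    if (columnReader.getCurrentDefinitionLevel() == desc.getMaxDefinitionLevel()) {
      // value is present: read it with the accessor matching the column type,
      // e.g. columnReader.getInteger(), getLong(), getBinary(), ...
    }
    columnReader.consume();  // advance to the next value
  }
}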
 
Example #7
Source File: CompressionConverter.java    From parquet-mr with Apache License 2.0
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta, MessageType schema,
                           String createdBy, CompressionCodecName codecName) throws IOException {
  int blockIndex = 0;
  PageReadStore store = reader.readNextRowGroup();
  while (store != null) {
    writer.startBlock(store.getRowCount());
    BlockMetaData blockMetaData = meta.getBlocks().get(blockIndex);
    List<ColumnChunkMetaData> columnsInOrder = blockMetaData.getColumns();
    Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(
      Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
      ColumnChunkMetaData chunk = columnsInOrder.get(i);
      ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
      ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
      writer.startColumn(columnDescriptor, crstore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
      processChunk(reader, writer, chunk, createdBy, codecName);
      writer.endColumn();
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
  }
}
 
Example #8
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0
/** Test that we do not write out checksums if the feature is turned off */
@Test
public void testWriteOffVerifyOff() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);

  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  try (ParquetFileReader reader = getParquetFileReader(path, conf,
    Arrays.asList(colADesc, colBDesc))) {
    PageReadStore pageReadStore = reader.readNextRowGroup();

    assertCrcNotSet(readNextPage(colADesc, pageReadStore));
    assertCrcNotSet(readNextPage(colADesc, pageReadStore));
    assertCrcNotSet(readNextPage(colBDesc, pageReadStore));
    assertCrcNotSet(readNextPage(colBDesc, pageReadStore));
  }
}
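These checksum tests, and the TestDataPageV1Checksums examples further down, use small helpers (readNextPage, readDictPage) that are defined elsewhere in the test class and are not part of the excerpts. A plausible reconstruction, shown only as an assumption to make the excerpts readable, is:

private static DataPageV1 readNextPage(ColumnDescriptor colDesc, PageReadStore pageReadStore) {
  return (DataPageV1) pageReadStore.getPageReader(colDesc).readPage();
}

private static DictionaryPage readDictPage(ColumnDescriptor colDesc, PageReadStore pageReadStore) {
  return pageReadStore.getPageReader(colDesc).readDictionaryPage();
}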
 
Example #9
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0
/**
 * Do not write out page level crc checksums, but enable verification on the read path. Tests
 * that the read still succeeds and does not throw an exception.
 */
@Test
public void testWriteOffVerifyOn() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);

  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  try (ParquetFileReader reader = getParquetFileReader(path, conf,
    Arrays.asList(colADesc, colBDesc))) {
    PageReadStore pageReadStore = reader.readNextRowGroup();

    assertCorrectContent(readNextPage(colADesc, pageReadStore).getBytes().toByteArray(),
      colAPage1Bytes);
    assertCorrectContent(readNextPage(colADesc, pageReadStore).getBytes().toByteArray(),
      colAPage2Bytes);
    assertCorrectContent(readNextPage(colBDesc, pageReadStore).getBytes().toByteArray(),
      colBPage1Bytes);
    assertCorrectContent(readNextPage(colBDesc, pageReadStore).getBytes().toByteArray(),
      colBPage2Bytes);
  }
}
 
Example #10
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0
public static void validatePages(Path file, List<?> expectedValues) throws IOException {
  List<PageReadStore> blockReaders = readBlocksFromFile(file);
  MessageType fileSchema = readSchemaFromFile(file);
  int rowGroupID = 0;
  int rowsRead = 0;
  for (PageReadStore pageReadStore : blockReaders) {
    for (ColumnDescriptor columnsDesc : fileSchema.getColumns()) {
      List<DataPage> pageGroup = getPageGroupForColumn(pageReadStore, columnsDesc);
      DictionaryPage dictPage = reusableCopy(getDictionaryPageForColumn(pageReadStore, columnsDesc));

      List<?> expectedRowGroupValues = expectedValues.subList(rowsRead, (int)(rowsRead + pageReadStore.getRowCount()));
      validateFirstToLast(rowGroupID, dictPage, pageGroup, columnsDesc, expectedRowGroupValues);
      validateLastToFirst(rowGroupID, dictPage, pageGroup, columnsDesc, expectedRowGroupValues);
    }

    rowsRead += pageReadStore.getRowCount();
    rowGroupID++;
  }
}
 
Example #11
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
    metadata.getFileMetaData(),
    super.fsPath,
    metadata.getBlocks(),
    metadata.getFileMetaData().getSchema().getColumns());

  PageStatsValidator validator = new PageStatsValidator();

  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
 
Example #12
Source File: ParquetRecordReader.java    From flink with Apache License 2.0
/**
 * Moves the reading position to the given block and seeks to and reads the given record.
 *
 * @param block The block to seek to.
 * @param recordInBlock The number of the record in the block to return next.
 */
public void seek(long block, long recordInBlock) throws IOException {

	List<BlockMetaData> blockMetaData = reader.getRowGroups();

	if (block == -1L && recordInBlock == -1L) {
		// the split was fully consumed
		currentBlock = blockMetaData.size() - 1;
		numReadRecords = numTotalRecords;
		numRecordsUpToCurrentBlock = numTotalRecords;
		return;
	}

	// init all counters for the start of the first block
	currentBlock = 0;
	numRecordsUpToPreviousBlock = 0;
	numRecordsUpToCurrentBlock = blockMetaData.get(0).getRowCount();
	numReadRecords = 0;

	// seek to the given block
	while (currentBlock < block) {
		currentBlock++;
		reader.skipNextRowGroup();
		numRecordsUpToPreviousBlock = numRecordsUpToCurrentBlock;
		numRecordsUpToCurrentBlock += blockMetaData.get(currentBlock).getRowCount();
		numReadRecords = numRecordsUpToPreviousBlock;
	}

	// seek to and read the given record
	PageReadStore pages = reader.readNextRowGroup();
	recordReader = createRecordReader(pages);
	for (int i = 0; i <= recordInBlock; i++) {
		readNextRecord();
	}
}
 
Example #13
Source File: ParquetRecordReaderTest.java    From dremio-oss with Apache License 2.0
private void validateContains(MessageType schema, PageReadStore pages, String[] path, int values, BytesInput bytes)
    throws IOException {
  PageReader pageReader = pages.getPageReader(schema.getColumnDescription(path));
  DataPageV1 page = (DataPageV1) pageReader.readPage();
  assertEquals(values, page.getValueCount());
  assertArrayEquals(bytes.toByteArray(), page.getBytes().toByteArray());
}
 
Example #14
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0
private static List<PageReadStore> readBlocksFromFile(Path file) throws IOException {
  List<PageReadStore> rowGroups = new ArrayList<PageReadStore>();

  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration, file, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader fileReader = new ParquetFileReader(configuration, metadata.getFileMetaData(), file, metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());

  PageReadStore group;
  while ((group = fileReader.readNextRowGroup()) != null) {
    rowGroups.add(group);
  }

  return rowGroups;
}
 
Example #15
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0
private static List<DataPage> getPageGroupForColumn(PageReadStore pageReadStore, ColumnDescriptor columnDescriptor) {
  PageReader pageReader = pageReadStore.getPageReader(columnDescriptor);
  List<DataPage> pageGroup = new ArrayList<DataPage>();

  DataPage page;
  while ((page = pageReader.readPage()) != null) {
    pageGroup.add(reusableCopy(page));
  }

  return pageGroup;
}
 
Example #16
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0
public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor desc : schema.getColumns()) {
    PageReader reader = store.getPageReader(desc);
    DictionaryPage dict = reader.readDictionaryPage();
    DataPage page;
    while ((page = reader.readPage()) != null) {
      validateStatsForPage(page, dict, desc);
    }
  }
}
 
Example #17
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load Decision Tree model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadDecisionTreeModel(String pathToMdl, LearningEnvironment learningEnvironment) {
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, NodeData> nodes = new TreeMap<>();

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                NodeData nodeData = extractNodeDataFromParquetRow(g);
                nodes.put(nodeData.id, nodeData);
            }
        }
        return buildDecisionTreeModel(nodes);
    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return null;
}
 
Example #18
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load linear regression model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLinRegModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readLinRegInterceptor(g);
                coefficients = readLinRegCoefficients(g);
            }
        }

    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new LinearRegressionModel(coefficients, interceptor);
}
 
Example #19
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load logistic regression model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLogRegModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readInterceptor(g);
                coefficients = readCoefficients(g);
            }
        }

    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new LogisticRegressionModel(coefficients, interceptor);
}
 
Example #20
Source File: InternalParquetRecordReader.java    From tajo with Apache License 2.0
private void checkRead() throws IOException {
  if (current == totalCountLoadedSoFar) {
    if (current != 0) {
      totalTimeSpentProcessingRecords += (System.currentTimeMillis() - startedAssemblingCurrentBlockAt);
      if (Log.DEBUG) {
        LOG.debug("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount + " columns in " + totalTimeSpentProcessingRecords + " ms: "+((float)totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, " + ((float)totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
        final long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
        if (totalTime != 0) {
          final long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
          final long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
          LOG.debug("time spent so far " + percentReading + "% reading ("+totalTimeSpentReadingBytes+" ms) and " + percentProcessing + "% processing ("+totalTimeSpentProcessingRecords+" ms)");
        }
      }
    }

    if (Log.DEBUG) LOG.debug("at row " + current + ". reading next block");
    long t0 = System.currentTimeMillis();
    PageReadStore pages = reader.readNextRowGroup();
    if (pages == null) {
      throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
    }
    long timeSpentReading = System.currentTimeMillis() - t0;
    totalTimeSpentReadingBytes += timeSpentReading;
    BenchmarkCounter.incrementTime(timeSpentReading);
    if (Log.INFO) LOG.info("block read in memory in " + timeSpentReading + " ms. row count = " + pages.getRowCount());
    if (Log.DEBUG) LOG.debug("initializing Record assembly with requested schema " + requestedSchema);
    MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
    recordReader = columnIO.getRecordReader(pages, recordConverter, filter);
    startedAssemblingCurrentBlockAt = System.currentTimeMillis();
    totalCountLoadedSoFar += pages.getRowCount();
    ++ currentBlock;
  }
}
 
Example #21
Source File: ParquetRecordReader.java    From flink with Apache License 2.0
private RecordReader<T> createRecordReader(PageReadStore pages) throws IOException {
	if (pages == null) {
		throw new IOException(
			"Expecting more rows but reached last block. Read " + numReadRecords + " out of " + numTotalRecords);
	}
	MessageColumnIO columnIO = columnIOFactory.getColumnIO(readSchema, fileSchema, true);
	return columnIO.getRecordReader(pages, recordMaterializer, filter);
}
 
Example #22
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0
@Test
public void testDictionaryEncoding() throws IOException {
  Configuration conf = new Configuration();

  // Write out dictionary encoded sample file via the non-checksum code path, extract the raw
  // bytes to calculate the  reference crc with
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
  Path refPath = writeNestedWithNullsSampleParquetFile(conf, true, CompressionCodecName.SNAPPY);

  try (ParquetFileReader refReader =
    getParquetFileReader(refPath, conf, Collections.singletonList(colDValDesc))) {
    PageReadStore refPageReadStore = refReader.readNextRowGroup();
    // Read (decompressed) dictionary page
    byte[] dictPageBytes = readDictPage(colDValDesc, refPageReadStore).getBytes().toByteArray();
    byte[] colDValPageBytes = readNextPage(colDValDesc, refPageReadStore).getBytes().toByteArray();

    // Write out sample file with checksums
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
    Path path = writeNestedWithNullsSampleParquetFile(conf, true, CompressionCodecName.SNAPPY);

    try (ParquetFileReader reader =
      getParquetFileReader(path, conf, Collections.singletonList(colDValDesc))) {
      PageReadStore pageReadStore = reader.readNextRowGroup();

      DictionaryPage dictPage = readDictPage(colDValDesc, pageReadStore);
      assertCrcSetAndCorrect(dictPage, snappy(dictPageBytes));
      assertCorrectContent(dictPage.getBytes().toByteArray(), dictPageBytes);

      DataPageV1 colDValPage = readNextPage(colDValDesc, pageReadStore);
      assertCrcSetAndCorrect(colDValPage, snappy(colDValPageBytes));
      assertCorrectContent(colDValPage.getBytes().toByteArray(), colDValPageBytes);
    }
  }
}
 
Example #23
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0
/**
 * Tests that we adhere to the checksum calculation specification, namely that the crc is
 * calculated using the compressed concatenation of the repetition levels, definition levels and
 * the actual data. This is done by generating sample data with a nested schema containing nulls
 * (generating non trivial repetition and definition levels).
 */
@Test
public void testNestedWithNulls() throws IOException {
  Configuration conf = new Configuration();

  // Write out sample file via the non-checksum code path, extract the raw bytes to calculate the
  // reference crc with
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, false);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
  Path refPath = writeNestedWithNullsSampleParquetFile(conf, false, CompressionCodecName.SNAPPY);

  try (ParquetFileReader refReader = getParquetFileReader(refPath, conf,
    Arrays.asList(colCIdDesc, colDValDesc))) {
    PageReadStore refPageReadStore = refReader.readNextRowGroup();
    byte[] colCIdPageBytes = readNextPage(colCIdDesc, refPageReadStore).getBytes().toByteArray();
    byte[] colDValPageBytes = readNextPage(colDValDesc, refPageReadStore).getBytes().toByteArray();

    // Write out sample file with checksums
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
    conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
    Path path = writeNestedWithNullsSampleParquetFile(conf, false, CompressionCodecName.SNAPPY);

    try (ParquetFileReader reader = getParquetFileReader(path, conf,
      Arrays.asList(colCIdDesc, colDValDesc))) {
      PageReadStore pageReadStore = reader.readNextRowGroup();

      DataPageV1 colCIdPage = readNextPage(colCIdDesc, pageReadStore);
      assertCrcSetAndCorrect(colCIdPage, snappy(colCIdPageBytes));
      assertCorrectContent(colCIdPage.getBytes().toByteArray(), colCIdPageBytes);

      DataPageV1 colDValPage = readNextPage(colDValDesc, pageReadStore);
      assertCrcSetAndCorrect(colDValPage, snappy(colDValPageBytes));
      assertCorrectContent(colDValPage.getBytes().toByteArray(), colDValPageBytes);
    }
  }
}
 
Example #24
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0
/**
 * Tests that the checksum is calculated using the compressed version of the data and that
 * checksum verification succeeds
 */
@Test
public void testCompression() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);

  Path path = writeSimpleParquetFile(conf, CompressionCodecName.SNAPPY);

  try (ParquetFileReader reader = getParquetFileReader(path, conf,
    Arrays.asList(colADesc, colBDesc))) {
    PageReadStore pageReadStore = reader.readNextRowGroup();

    DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
    assertCrcSetAndCorrect(colAPage1, snappy(colAPage1Bytes));
    assertCorrectContent(colAPage1.getBytes().toByteArray(), colAPage1Bytes);

    DataPageV1 colAPage2 = readNextPage(colADesc, pageReadStore);
    assertCrcSetAndCorrect(colAPage2, snappy(colAPage2Bytes));
    assertCorrectContent(colAPage2.getBytes().toByteArray(), colAPage2Bytes);

    DataPageV1 colBPage1 = readNextPage(colBDesc, pageReadStore);
    assertCrcSetAndCorrect(colBPage1, snappy(colBPage1Bytes));
    assertCorrectContent(colBPage1.getBytes().toByteArray(), colBPage1Bytes);

    DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
    assertCrcSetAndCorrect(colBPage2, snappy(colBPage2Bytes));
    assertCorrectContent(colBPage2.getBytes().toByteArray(), colBPage2Bytes);
  }
}
 
Example #25
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0
/**
 * Write out checksums and verify them on the read path. Tests that crc is set and that we can
 * read back what we wrote if checksums are enabled on both the write and read path.
 */
@Test
public void testWriteOnVerifyOn() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);

  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  try (ParquetFileReader reader = getParquetFileReader(path, conf,
    Arrays.asList(colADesc, colBDesc))) {
    PageReadStore pageReadStore = reader.readNextRowGroup();

    DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
    assertCrcSetAndCorrect(colAPage1, colAPage1Bytes);
    assertCorrectContent(colAPage1.getBytes().toByteArray(), colAPage1Bytes);

    DataPageV1 colAPage2 = readNextPage(colADesc, pageReadStore);
    assertCrcSetAndCorrect(colAPage2, colAPage2Bytes);
    assertCorrectContent(colAPage2.getBytes().toByteArray(), colAPage2Bytes);

    DataPageV1 colBPage1 = readNextPage(colBDesc, pageReadStore);
    assertCrcSetAndCorrect(colBPage1, colBPage1Bytes);
    assertCorrectContent(colBPage1.getBytes().toByteArray(), colBPage1Bytes);

    DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
    assertCrcSetAndCorrect(colBPage2, colBPage2Bytes);
    assertCorrectContent(colBPage2.getBytes().toByteArray(), colBPage2Bytes);
  }
}
 
Example #26
Source File: InternalParquetRecordReader.java    From parquet-mr with Apache License 2.0
private void checkRead() throws IOException {
  if (current == totalCountLoadedSoFar) {
    if (current != 0) {
      totalTimeSpentProcessingRecords += (System.currentTimeMillis() - startedAssemblingCurrentBlockAt);
      if (LOG.isInfoEnabled()) {
          LOG.info("Assembled and processed " + totalCountLoadedSoFar + " records from " + columnCount + " columns in " + totalTimeSpentProcessingRecords + " ms: "+((float)totalCountLoadedSoFar / totalTimeSpentProcessingRecords) + " rec/ms, " + ((float)totalCountLoadedSoFar * columnCount / totalTimeSpentProcessingRecords) + " cell/ms");
          final long totalTime = totalTimeSpentProcessingRecords + totalTimeSpentReadingBytes;
          if (totalTime != 0) {
              final long percentReading = 100 * totalTimeSpentReadingBytes / totalTime;
              final long percentProcessing = 100 * totalTimeSpentProcessingRecords / totalTime;
              LOG.info("time spent so far " + percentReading + "% reading ("+totalTimeSpentReadingBytes+" ms) and " + percentProcessing + "% processing ("+totalTimeSpentProcessingRecords+" ms)");
          }
      }
    }

    LOG.info("at row " + current + ". reading next block");
    long t0 = System.currentTimeMillis();
    PageReadStore pages = reader.readNextFilteredRowGroup();
    if (pages == null) {
      throw new IOException("expecting more rows but reached last block. Read " + current + " out of " + total);
    }
    long timeSpentReading = System.currentTimeMillis() - t0;
    totalTimeSpentReadingBytes += timeSpentReading;
    BenchmarkCounter.incrementTime(timeSpentReading);
    if (LOG.isInfoEnabled()) LOG.info("block read in memory in {} ms. row count = {}", timeSpentReading, pages.getRowCount());
    LOG.debug("initializing Record assembly with requested schema {}", requestedSchema);
    MessageColumnIO columnIO = columnIOFactory.getColumnIO(requestedSchema, fileSchema, strictTypeChecking);
    recordReader = columnIO.getRecordReader(pages, recordConverter,
        filterRecords ? filter : FilterCompat.NOOP);
    startedAssemblingCurrentBlockAt = System.currentTimeMillis();
    totalCountLoadedSoFar += pages.getRowCount();
    ++ currentBlock;
  }
}
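Note that, unlike the Tajo variant in Example #20, this parquet-mr version calls readNextFilteredRowGroup() rather than readNextRowGroup(): in recent parquet-mr releases this returns a PageReadStore from which pages that cannot contain matching rows have already been dropped using column indexes (when a row filter is present), so pages.getRowCount() reflects the possibly reduced row count for that row group.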
 
Example #27
Source File: TestDataPageV1Checksums.java    From parquet-mr with Apache License 2.0
/**
 * Enable writing out page level crc checksum, disable verification in read path but check that
 * the crc checksums are correct. Tests whether we successfully write out correct crc checksums
 * without potentially failing on the read path verification.
 */
@Test
public void testWriteOnVerifyOff() throws IOException {
  Configuration conf = new Configuration();
  conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
  conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);

  Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);

  try (ParquetFileReader reader = getParquetFileReader(path, conf,
    Arrays.asList(colADesc, colBDesc))) {
    PageReadStore pageReadStore = reader.readNextRowGroup();

    DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
    assertCrcSetAndCorrect(colAPage1, colAPage1Bytes);
    assertCorrectContent(colAPage1.getBytes().toByteArray(), colAPage1Bytes);

    DataPageV1 colAPage2 = readNextPage(colADesc, pageReadStore);
    assertCrcSetAndCorrect(colAPage2, colAPage2Bytes);
    assertCorrectContent(colAPage2.getBytes().toByteArray(), colAPage2Bytes);

    DataPageV1 colBPage1 = readNextPage(colBDesc, pageReadStore);
    assertCrcSetAndCorrect(colBPage1, colBPage1Bytes);
    assertCorrectContent(colBPage1.getBytes().toByteArray(), colBPage1Bytes);

    DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
    assertCrcSetAndCorrect(colBPage2, colBPage2Bytes);
    assertCorrectContent(colBPage2.getBytes().toByteArray(), colBPage2Bytes);
  }
}
 
Example #28
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load SVM model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadLinearSVMModel(String pathToMdl,
    LearningEnvironment learningEnvironment) {
    Vector coefficients = null;
    double interceptor = 0;

    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));
            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                interceptor = readSVMInterceptor(g);
                coefficients = readSVMCoefficients(g);
            }
        }
    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }

    return new SVMLinearClassificationModel(coefficients, interceptor);
}