Java Code Examples for org.apache.parquet.schema.MessageType#getColumns()

The following examples show how to use org.apache.parquet.schema.MessageType#getColumns(), which returns the list of ColumnDescriptor objects for the leaf (primitive) columns of a Parquet schema. The examples are taken from open source projects; the project and source file are noted above each snippet so you can follow them back to the original code.
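Before the project examples, a minimal sketch of the call itself may help. The schema below is hypothetical (not taken from any project on this page); getColumns() flattens it into one ColumnDescriptor per leaf field:

import java.util.List;

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class GetColumnsDemo {
  public static void main(String[] args) {
    // A made-up schema with a required field, a nested optional field and a repeated field.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message demo { "
        + "required int64 id; "
        + "optional group address { optional binary city (UTF8); } "
        + "repeated binary tags (UTF8); }");

    // One ColumnDescriptor per leaf column; groups such as 'address' do not
    // appear themselves, only their primitive children do.
    List<ColumnDescriptor> columns = schema.getColumns();
    for (ColumnDescriptor col : columns) {
      System.out.println(String.join(".", col.getPath())
          + " maxRep=" + col.getMaxRepetitionLevel()
          + " maxDef=" + col.getMaxDefinitionLevel());
    }
    // Prints:
    //   id           maxRep=0 maxDef=0
    //   address.city maxRep=0 maxDef=2
    //   tags         maxRep=1 maxDef=1
  }
}

The descriptors carry each column's dotted path and its max repetition/definition levels, which is why most of the examples below iterate schema.getColumns() to build a per-column map of writers or readers.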
Example 1
Source File: ColumnWriteStoreBase.java    From parquet-mr with Apache License 2.0
ColumnWriteStoreBase(
    MessageType schema,
    PageWriteStore pageWriteStore,
    ParquetProperties props) {
  this.props = props;
  this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO);
  Map<ColumnDescriptor, ColumnWriterBase> mcolumns = new TreeMap<>();
  for (ColumnDescriptor path : schema.getColumns()) {
    PageWriter pageWriter = pageWriteStore.getPageWriter(path);
    mcolumns.put(path, createColumnWriter(path, pageWriter, null, props));
  }
  this.columns = unmodifiableMap(mcolumns);

  this.rowCountForNextSizeCheck = min(props.getMinRowCountForPageSizeCheck(), props.getPageRowCountLimit());

  columnWriterProvider = new ColumnWriterProvider() {
    @Override
    public ColumnWriter getColumnWriter(ColumnDescriptor path) {
      return columns.get(path);
    }
  };
}
 
Example 2
Source File: ParquetColumnChunkPageWriteStore.java    From Bats with Apache License 2.0
public ParquetColumnChunkPageWriteStore(BytesCompressor compressor,
                                        MessageType schema,
                                        int initialSlabSize,
                                        int maxCapacityHint,
                                        ByteBufferAllocator allocator) {
  this.schema = schema;
  for (ColumnDescriptor path : schema.getColumns()) {
    writers.put(path, new ColumnChunkPageWriter(path, compressor, initialSlabSize, maxCapacityHint, allocator));
  }
}
 
Example 3
Source File: ParquetFilePOJOReaderTest.java    From attic-apex-malhar with Apache License 2.0
public POJOWriteSupport(MessageType schema, Class<?> klass)
{
  this.schema = schema;
  this.cols = schema.getColumns();
  this.klass = klass;
  init();
}
 
Example 4
Source File: ColumnWriteStoreBase.java    From parquet-mr with Apache License 2.0
ColumnWriteStoreBase(
  MessageType schema,
  PageWriteStore pageWriteStore,
  BloomFilterWriteStore bloomFilterWriteStore,
  ParquetProperties props) {
  this.props = props;
  this.thresholdTolerance = (long) (props.getPageSizeThreshold() * THRESHOLD_TOLERANCE_RATIO);
  Map<ColumnDescriptor, ColumnWriterBase> mcolumns = new TreeMap<>();
  for (ColumnDescriptor path : schema.getColumns()) {
    PageWriter pageWriter = pageWriteStore.getPageWriter(path);
    if (props.isBloomFilterEnabled(path)) {
      BloomFilterWriter bloomFilterWriter = bloomFilterWriteStore.getBloomFilterWriter(path);
      mcolumns.put(path, createColumnWriter(path, pageWriter, bloomFilterWriter, props));
    } else {
      mcolumns.put(path, createColumnWriter(path, pageWriter, null, props));
    }
  }
  this.columns = unmodifiableMap(mcolumns);

  this.rowCountForNextSizeCheck = props.getMinRowCountForPageSizeCheck();

  columnWriterProvider = new ColumnWriterProvider() {
    @Override
    public ColumnWriter getColumnWriter(ColumnDescriptor path) {
      return columns.get(path);
    }
  };
}
 
Example 5
Source File: SchemaCompatibilityValidator.java    From parquet-mr with Apache License 2.0
private SchemaCompatibilityValidator(MessageType schema) {
  for (ColumnDescriptor cd : schema.getColumns()) {
    ColumnPath columnPath = ColumnPath.get(cd.getPath());
    columnsAccordingToSchema.put(columnPath, cd);
  }
}
 
Example 6
Source File: ColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0
public ColumnChunkPageWriteStore(BytesCompressor compressor, MessageType schema, ByteBufferAllocator allocator,
    int columnIndexTruncateLength, boolean pageWriteChecksumEnabled) {
  this.schema = schema;
  for (ColumnDescriptor path : schema.getColumns()) {
    writers.put(path, new ColumnChunkPageWriter(path, compressor, allocator, columnIndexTruncateLength, pageWriteChecksumEnabled));
  }
}
 
Example 7
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0
public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor desc : schema.getColumns()) {
    PageReader reader = store.getPageReader(desc);
    DictionaryPage dict = reader.readDictionaryPage();
    DataPage page;
    while ((page = reader.readPage()) != null) {
      validateStatsForPage(page, dict, desc);
    }
  }
}
 
Example 8
Source File: ParquetReaderUtility.java    From Bats with Apache License 2.0
/**
 * Checks whether any of the columns in the given list is either nested or repeated.
 *
 * @param footer  Parquet file metadata, from which the file schema is read
 * @param columns list of query SchemaPath objects
 * @return true if at least one column is nested or repeated, false otherwise
 */
public static boolean containsComplexColumn(ParquetMetadata footer, List<SchemaPath> columns) {

  MessageType schema = footer.getFileMetaData().getSchema();

  if (Utilities.isStarQuery(columns)) {
    for (Type type : schema.getFields()) {
      if (!type.isPrimitive()) {
        return true;
      }
    }
    for (ColumnDescriptor col : schema.getColumns()) {
      if (col.getMaxRepetitionLevel() > 0) {
        return true;
      }
    }
    return false;
  } else {
    Map<String, ColumnDescriptor> colDescMap = ParquetReaderUtility.getColNameToColumnDescriptorMapping(footer);
    Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);

    for (SchemaPath schemaPath : columns) {
      // A non-leaf schema path is a complex column
      if (!schemaPath.isLeaf()) {
        logger.trace("rowGroupScan contains complex column: {}", schemaPath.getUnIndexed().toString());
        return true;
      }

      // A failed column descriptor lookup can mean one of two things, depending on the subsequent SchemaElement lookup:
      // 1. SchemaElement found: the queried column is complex, i.e. a GroupType
      // 2. SchemaElement not found: the queried column is not in the schema at all, and thus non-complex
      ColumnDescriptor column = colDescMap.get(schemaPath.getUnIndexed().toString().toLowerCase());

      if (column == null) {
        SchemaElement schemaElement = schemaElements.get(schemaPath.getUnIndexed().toString().toLowerCase());
        if (schemaElement != null) {
          return true;
        }
      } else {
        if (column.getMaxRepetitionLevel() > 0) {
          logger.trace("rowGroupScan contains repetitive column: {}", schemaPath.getUnIndexed().toString());
          return true;
        }
      }
    }
  }
  return false;
}
 
Example 9
Source File: ColumnIndexValidator.java    From parquet-mr with Apache License 2.0
public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
  List<ContractViolation> violations = new ArrayList<>();
  try (ParquetFileReader reader = ParquetFileReader.open(file)) {
    FileMetaData meta = reader.getFooter().getFileMetaData();
    MessageType schema = meta.getSchema();
    List<ColumnDescriptor> columns = schema.getColumns();

    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    int rowGroupNumber = 0;
    PageReadStore rowGroup = reader.readNextRowGroup();
    while (rowGroup != null) {
      ColumnReadStore columnReadStore = new ColumnReadStoreImpl(rowGroup,
          new DummyRecordConverter(schema).getRootConverter(), schema, null);
      List<ColumnChunkMetaData> columnChunks = blocks.get(rowGroupNumber).getColumns();
      assert (columnChunks.size() == columns.size());
      for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
        ColumnDescriptor column = columns.get(columnNumber);
        ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
        ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
        if (columnIndex == null) {
          continue;
        }
        ColumnPath columnPath = columnChunk.getPath();
        OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
        List<ByteBuffer> minValues = columnIndex.getMinValues();
        List<ByteBuffer> maxValues = columnIndex.getMaxValues();
        BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
        List<Long> nullCounts = columnIndex.getNullCounts();
        List<Boolean> nullPages = columnIndex.getNullPages();
        long rowNumber = 0;
        ColumnReader columnReader = columnReadStore.getColumnReader(column);
        ByteBuffer prevMinValue = null;
        ByteBuffer prevMaxValue = null;
        for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
          boolean isNullPage = nullPages.get(pageNumber);
          ByteBuffer minValue = minValues.get(pageNumber);
          ByteBuffer maxValue = maxValues.get(pageNumber);
          PageValidator pageValidator = new PageValidator(
              column.getPrimitiveType(),
              rowGroupNumber, columnNumber, columnPath, pageNumber,
              violations, columnReader,
              minValue,
              maxValue,
              prevMinValue,
              prevMaxValue,
              boundaryOrder,
              nullCounts.get(pageNumber),
              isNullPage);
          if (!isNullPage) {
            prevMinValue = minValue;
            prevMaxValue = maxValue;
          }
          long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
          while (rowNumber <= lastRowNumberInPage) {
            pageValidator.validateValuesBelongingToRow();
            ++rowNumber;
          }
          pageValidator.finishPage();
        }
      }
      rowGroup = reader.readNextRowGroup();
      rowGroupNumber++;
    }
  }
  return violations;
}
 
Example 10
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
public void setRequestedSchema(MessageType projection) {
  paths.clear();
  for (ColumnDescriptor col : projection.getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
}
 
Example 11
Source File: TestColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0
@Test
public void test() throws Exception {
  Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
  Path root = file.getParent();
  FileSystem fs = file.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  Encoding dataEncoding = PLAIN;
  int valueCount = 10;
  int d = 1;
  int r = 2;
  int v = 3;
  BytesInput definitionLevels = BytesInput.fromInt(d);
  BytesInput repetitionLevels = BytesInput.fromInt(r);
  Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary"))
      .build();
  BytesInput data = BytesInput.fromInt(v);
  int rowCount = 5;
  int nullCount = 1;
  statistics.incrementNumNulls(nullCount);
  statistics.setMinMaxFromBytes(new byte[] {0, 1, 2}, new byte[] {0, 1, 2, 3});
  long pageOffset;
  long pageSize;

  {
    OutputFileForTesting outputFile = new OutputFileForTesting(file, conf);
    ParquetFileWriter writer = new ParquetFileWriter(outputFile, schema, Mode.CREATE,
        ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.MAX_PADDING_SIZE_DEFAULT);
    writer.start();
    writer.startBlock(rowCount);
    pageOffset = outputFile.out().getPos();
    {
      ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema,
          new HeapByteBufferAllocator(), Integer.MAX_VALUE);
      PageWriter pageWriter = store.getPageWriter(col);
      pageWriter.writePageV2(
          rowCount, nullCount, valueCount,
          repetitionLevels, definitionLevels,
          dataEncoding, data,
          statistics);
      store.flushToFileWriter(writer);
      pageSize = outputFile.out().getPos() - pageOffset;
    }
    writer.endBlock();
    writer.end(new HashMap<String, String>());
  }

  {
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    ParquetFileReader reader = new ParquetFileReader(
        conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns());
    PageReadStore rowGroup = reader.readNextRowGroup();
    PageReader pageReader = rowGroup.getPageReader(col);
    DataPageV2 page = (DataPageV2)pageReader.readPage();
    assertEquals(rowCount, page.getRowCount());
    assertEquals(nullCount, page.getNullCount());
    assertEquals(valueCount, page.getValueCount());
    assertEquals(d, intValue(page.getDefinitionLevels()));
    assertEquals(r, intValue(page.getRepetitionLevels()));
    assertEquals(dataEncoding, page.getDataEncoding());
    assertEquals(v, intValue(page.getData()));

    // Checking column/offset indexes for the one page
    ColumnChunkMetaData column = footer.getBlocks().get(0).getColumns().get(0);
    ColumnIndex columnIndex = reader.readColumnIndex(column);
    assertArrayEquals(statistics.getMinBytes(), columnIndex.getMinValues().get(0).array());
    assertArrayEquals(statistics.getMaxBytes(), columnIndex.getMaxValues().get(0).array());
    assertEquals(statistics.getNumNulls(), columnIndex.getNullCounts().get(0).longValue());
    assertFalse(columnIndex.getNullPages().get(0));
    OffsetIndex offsetIndex = reader.readOffsetIndex(column);
    assertEquals(1, offsetIndex.getPageCount());
    assertEquals(pageSize, offsetIndex.getCompressedPageSize(0));
    assertEquals(0, offsetIndex.getFirstRowIndex(0));
    assertEquals(pageOffset, offsetIndex.getOffset(0));

    reader.close();
  }
}