org.apache.parquet.hadoop.metadata.ColumnPath Java Examples

The following examples show how to use org.apache.parquet.hadoop.metadata.ColumnPath. They are drawn from open source projects; the source file and originating project are noted above each example.
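Before the examples, a minimal self-contained sketch of the core ColumnPath API may help: a path can be created from individual segments or from a dot-separated string, and converted back either way. ColumnPath.get, fromDotString, toDotString, and toArray are real parquet-mr methods; the wrapping class below is illustrative only.

import java.util.Arrays;
import org.apache.parquet.hadoop.metadata.ColumnPath;

public class ColumnPathDemo {
  public static void main(String[] args) {
    ColumnPath path = ColumnPath.get("a", "b", "c");          // from segments
    ColumnPath samePath = ColumnPath.fromDotString("a.b.c");  // from a dot string

    System.out.println(path.toDotString());               // a.b.c
    System.out.println(path.equals(samePath));            // true
    System.out.println(Arrays.toString(path.toArray()));  // [a, b, c]
  }
}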
Example #1
Source File: TestFilterApiMethods.java    From parquet-mr with Apache License 2.0
@Test
public void testFilterPredicateCreation() {
  FilterPredicate outerAnd = predicate;

  assertTrue(outerAnd instanceof And);

  FilterPredicate not = ((And) outerAnd).getLeft();
  FilterPredicate gt = ((And) outerAnd).getRight();
  assertTrue(not instanceof Not);

  FilterPredicate or = ((Not) not).getPredicate();
  assertTrue(or instanceof Or);

  FilterPredicate leftEq = ((Or) or).getLeft();
  FilterPredicate rightNotEq = ((Or) or).getRight();
  assertTrue(leftEq instanceof Eq);
  assertTrue(rightNotEq instanceof NotEq);
  assertEquals(7, ((Eq) leftEq).getValue());
  assertEquals(17, ((NotEq) rightNotEq).getValue());
  assertEquals(ColumnPath.get("a", "b", "c"), ((Eq) leftEq).getColumn().getColumnPath());
  assertEquals(ColumnPath.get("a", "b", "c"), ((NotEq) rightNotEq).getColumn().getColumnPath());

  assertTrue(gt instanceof Gt);
  assertEquals(100.0, ((Gt) gt).getValue());
  assertEquals(ColumnPath.get("x", "y", "z"), ((Gt) gt).getColumn().getColumnPath());
}
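The predicate field asserted on here is defined elsewhere in TestFilterApiMethods. The sketch below shows how a predicate with the shape the test walks could be assembled with FilterApi; the column names and constants are inferred from the assertions above, not copied from the test source.

// assumes: import static org.apache.parquet.filter2.predicate.FilterApi.*;
// plus imports of FilterPredicate and Operators.{IntColumn, DoubleColumn}
private static FilterPredicate buildPredicate() {
  IntColumn abc = intColumn("a.b.c");
  DoubleColumn xyz = doubleColumn("x.y.z");
  // and(not(or(eq, notEq)), gt) is the structure the assertions unwrap
  return and(
      not(or(eq(abc, 7), notEq(abc, 17))),
      gt(xyz, 100.0));
}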
 
Example #2
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * start a column inside a block
 * @param descriptor the column descriptor
 * @param valueCount the value count in this column
 * @param compressionCodecName a compression codec name
 * @throws IOException if there is an error while writing
 */
public void startColumn(ColumnDescriptor descriptor,
                        long valueCount,
                        CompressionCodecName compressionCodecName) throws IOException {
  state = state.startColumn();
  encodingStatsBuilder.clear();
  currentEncodings = new HashSet<Encoding>();
  currentChunkPath = ColumnPath.get(descriptor.getPath());
  currentChunkType = descriptor.getPrimitiveType();
  currentChunkCodec = compressionCodecName;
  currentChunkValueCount = valueCount;
  currentChunkFirstDataPage = out.getPos();
  compressedLength = 0;
  uncompressedLength = 0;
  // The statistics will be copied from the first one added at writeDataPage(s), so we have the correctly typed one
  currentStatistics = null;

  columnIndexBuilder = ColumnIndexBuilder.getBuilder(currentChunkType, columnIndexTruncateLength);
  offsetIndexBuilder = OffsetIndexBuilder.getBuilder();
  firstPageOffset = -1;
}
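startColumn is one step in the writer's strict lifecycle, which the internal state machine enforces (it rejects out-of-order calls). A sketch of the surrounding call sequence, with the page-writing calls elided:

writer.start();                                         // file header (magic bytes)
writer.startBlock(rowCount);                            // begin a row group
writer.startColumn(descriptor, valueCount, codecName);  // begin a column chunk
// ... writeDictionaryPage / writeDataPage calls for this chunk ...
writer.endColumn();                                     // finalize the column chunk
writer.endBlock();                                      // finalize the row group
writer.end(extraMetaData);                              // footer with all block metadata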
 
Example #3
Source File: CompressionConverter.java    From parquet-mr with Apache License 2.0
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta, MessageType schema,
                           String createdBy, CompressionCodecName codecName) throws IOException {
  int blockIndex = 0;
  PageReadStore store = reader.readNextRowGroup();
  while (store != null) {
    writer.startBlock(store.getRowCount());
    BlockMetaData blockMetaData = meta.getBlocks().get(blockIndex);
    List<ColumnChunkMetaData> columnsInOrder = blockMetaData.getColumns();
    Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(
      Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
      ColumnChunkMetaData chunk = columnsInOrder.get(i);
      ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
      ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
      writer.startColumn(columnDescriptor, crstore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
      processChunk(reader, writer, chunk, createdBy, codecName);
      writer.endColumn();
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
  }
}
 
Example #4
Source File: PruneColumnsCommand.java    From parquet-mr with Apache License 2.0
private Type pruneColumnsInField(Type field, List<String> currentPath, Set<ColumnPath> prunePaths) {
  String fieldName = field.getName();
  currentPath.add(fieldName);
  ColumnPath path = ColumnPath.get(currentPath.toArray(new String[0]));
  Type prunedField = null;
  if (!prunePaths.contains(path)) {
    if (field.isPrimitive()) {
      prunedField = field;
    } else {
      List<Type> childFields = ((GroupType) field).getFields();
      List<Type> prunedFields = pruneColumnsInFields(childFields, currentPath, prunePaths);
      if (prunedFields.size() > 0) {
        prunedField = ((GroupType) field).withNewFields(prunedFields);
      }
    } 
  }

  currentPath.remove(fieldName);
  return prunedField;
}
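The companion method pruneColumnsInFields is not shown; a hedged sketch consistent with the recursion above (it applies pruneColumnsInField to each child and keeps the non-null results):

private List<Type> pruneColumnsInFields(List<Type> fields, List<String> currentPath, Set<ColumnPath> prunePaths) {
  List<Type> prunedFields = new ArrayList<>();
  for (Type field : fields) {
    Type prunedField = pruneColumnsInField(field, currentPath, prunePaths);
    if (prunedField != null) {
      prunedFields.add(prunedField);  // drop fields whose whole subtree was pruned
    }
  }
  return prunedFields;
}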
 
Example #5
Source File: FilteringRecordMaterializer.java    From parquet-mr with Apache License 2.0
public FilteringRecordMaterializer(
    RecordMaterializer<T> delegate,
    List<PrimitiveColumnIO> columnIOs,
    Map<ColumnPath, List<ValueInspector>> valueInspectorsByColumn,
    IncrementallyUpdatedFilterPredicate filterPredicate) {

  Objects.requireNonNull(columnIOs, "columnIOs cannot be null");
  Objects.requireNonNull(valueInspectorsByColumn, "valueInspectorsByColumn cannot be null");
  this.filterPredicate = Objects.requireNonNull(filterPredicate, "filterPredicate cannot be null");
  this.delegate = Objects.requireNonNull(delegate, "delegate cannot be null");

  // keep track of which path of indices leads to which primitive column
  Map<List<Integer>, PrimitiveColumnIO> columnIOsByIndexFieldPath = new HashMap<>();

  for (PrimitiveColumnIO c : columnIOs) {
    List<Integer> indexFieldPath = Arrays.stream(c.getIndexFieldPath())
        .boxed().collect(Collectors.toList());
    columnIOsByIndexFieldPath.put(indexFieldPath, c);
  }

  // create a proxy for the delegate's root converter
  this.rootConverter = new FilteringGroupConverter(
      delegate.getRootConverter(), Collections.emptyList(),
      valueInspectorsByColumn, columnIOsByIndexFieldPath);
}
 
Example #6
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * @param configuration the Hadoop conf
 * @param fileMetaData fileMetaData for parquet file
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file cannot be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(
    Configuration configuration, FileMetaData fileMetaData,
    Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.converter = new ParquetMetadataConverter(configuration);
  this.file = HadoopInputFile.fromPath(filePath, configuration);
  this.fileMetaData = fileMetaData;
  this.f = file.newStream();
  this.options = HadoopReadOptions.builder(configuration).build();
  this.blocks = filterRowGroups(blocks);
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
 
Example #7
Source File: FilteringGroupConverter.java    From parquet-mr with Apache License 2.0
@Override
public Converter getConverter(int fieldIndex) {

  // get the real converter from the delegate
  Converter delegateConverter = Objects.requireNonNull(delegate.getConverter(fieldIndex), "delegate converter cannot be null");

  // determine the indexFieldPath for the converter proxy we're about to make, which is
  // this converter's path + the requested fieldIndex
  List<Integer> newIndexFieldPath = new ArrayList<>(indexFieldPath.size() + 1);
  newIndexFieldPath.addAll(indexFieldPath);
  newIndexFieldPath.add(fieldIndex);

  if (delegateConverter.isPrimitive()) {
    PrimitiveColumnIO columnIO = getColumnIO(newIndexFieldPath);
    ColumnPath columnPath = ColumnPath.get(columnIO.getColumnDescriptor().getPath());
    ValueInspector[] valueInspectors = getValueInspectors(columnPath);
    return new FilteringPrimitiveConverter(delegateConverter.asPrimitiveConverter(), valueInspectors);
  } else {
    return new FilteringGroupConverter(delegateConverter.asGroupConverter(), newIndexFieldPath, valueInspectorsByColumn, columnIOsByIndexFieldPath);
  }

}
 
Example #8
Source File: TestColumnIndexFilter.java    From parquet-mr with Apache License 2.0
@Test
public void testFilteringWithAllNullPages() {
  Set<ColumnPath> paths = paths("column1", "column5");

  assertAllRows(calculateRowRanges(FilterCompat.get(
      notEq(longColumn("column5"), 1234567L)),
      STORE, paths, TOTAL_ROW_COUNT),
      TOTAL_ROW_COUNT);
  assertAllRows(calculateRowRanges(FilterCompat.get(
      or(gtEq(intColumn("column1"), 10),
          notEq(longColumn("column5"), 1234567L))),
      STORE, paths, TOTAL_ROW_COUNT),
      TOTAL_ROW_COUNT);
  assertRows(calculateRowRanges(FilterCompat.get(
      eq(longColumn("column5"), 1234567L)),
      STORE, paths, TOTAL_ROW_COUNT));
  assertRows(calculateRowRanges(FilterCompat.get(
      and(lt(intColumn("column1"), 20),
          gtEq(longColumn("column5"), 1234567L))),
      STORE, paths, TOTAL_ROW_COUNT));
}
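The paths(...) helper is defined elsewhere in TestColumnIndexFilter; a plausible sketch, assuming it simply converts dot strings to a Set<ColumnPath>:

private static Set<ColumnPath> paths(String... columns) {
  Set<ColumnPath> paths = new HashSet<>();
  for (String column : columns) {
    paths.add(ColumnPath.fromDotString(column));
  }
  return paths;
}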
 
Example #9
Source File: ReadConf.java    From iceberg with Apache License 2.0
private List<Map<ColumnPath, ColumnChunkMetaData>> getColumnChunkMetadataForRowGroups() {
  Set<ColumnPath> projectedColumns = projection.getColumns().stream()
      .map(columnDescriptor -> ColumnPath.get(columnDescriptor.getPath())).collect(Collectors.toSet());
  ImmutableList.Builder<Map<ColumnPath, ColumnChunkMetaData>> listBuilder = ImmutableList.builder();
  for (int i = 0; i < rowGroups.size(); i++) {
    if (!shouldSkip[i]) {
      BlockMetaData blockMetaData = rowGroups.get(i);
      ImmutableMap.Builder<ColumnPath, ColumnChunkMetaData> mapBuilder = ImmutableMap.builder();
      blockMetaData.getColumns().stream()
          .filter(columnChunkMetaData -> projectedColumns.contains(columnChunkMetaData.getPath()))
          .forEach(columnChunkMetaData -> mapBuilder.put(columnChunkMetaData.getPath(), columnChunkMetaData));
      listBuilder.add(mapBuilder.build());
    } else {
      listBuilder.add(ImmutableMap.of());
    }
  }
  return listBuilder.build();
}
 
Example #10
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
public ParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException {
  this.converter = new ParquetMetadataConverter(options);
  this.file = file;
  this.f = file.newStream();
  this.options = options;
  try {
    this.footer = readFooter(file, options, f, converter);
  } catch (Exception e) {
    // If reading the footer throws an exception in the constructor, the new stream
    // must be closed here; otherwise there is no way to close it from outside.
    f.close();
    throw e;
  }
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
 
Example #11
Source File: ColumnIndexFilter.java    From parquet-mr with Apache License 2.0
/**
 * Calculates the row ranges containing the indexes of the rows that might match the specified filter.
 *
 * @param filter
 *          to be used for filtering the rows
 * @param columnIndexStore
 *          the store for providing column/offset indexes
 * @param paths
 *          the paths of the columns used in the actual projection; a column that is not part of the projection is
 *          treated as containing only {@code null} values, even if the column has values written in the file
 * @param rowCount
 *          the total number of rows in the row-group
 * @return the ranges of the possibly matching row indexes; the returned ranges will contain all the rows if any
 *         required offset index is missing
 */
public static RowRanges calculateRowRanges(FilterCompat.Filter filter, ColumnIndexStore columnIndexStore,
    Set<ColumnPath> paths, long rowCount) {
  return filter.accept(new FilterCompat.Visitor<RowRanges>() {
    @Override
    public RowRanges visit(FilterPredicateCompat filterPredicateCompat) {
      try {
        return filterPredicateCompat.getFilterPredicate()
            .accept(new ColumnIndexFilter(columnIndexStore, paths, rowCount));
      } catch (MissingOffsetIndexException e) {
        LOGGER.info(e.getMessage());
        return RowRanges.createSingle(rowCount);
      }
    }

    @Override
    public RowRanges visit(UnboundRecordFilterCompat unboundRecordFilterCompat) {
      return RowRanges.createSingle(rowCount);
    }

    @Override
    public RowRanges visit(NoOpFilter noOpFilter) {
      return RowRanges.createSingle(rowCount);
    }
  });
}
 
Example #12
Source File: TestColumnIndexFilter.java    From parquet-mr with Apache License 2.0
@Override
public OffsetIndex getOffsetIndex(ColumnPath column) {
  switch (column.toDotString()) {
    case "column1":
      return COLUMN1_OI;
    case "column2":
      return COLUMN2_OI;
    case "column3":
      return COLUMN3_OI;
    case "column4":
      return COLUMN4_OI;
    case "column5":
      return COLUMN5_OI;
    default:
      throw new MissingOffsetIndexException(column);
  }
}
 
Example #13
Source File: TestColumnIndexFilter.java    From parquet-mr with Apache License 2.0
@Override
public ColumnIndex getColumnIndex(ColumnPath column) {
  switch (column.toDotString()) {
    case "column1":
      return COLUMN1_CI;
    case "column2":
      return COLUMN2_CI;
    case "column3":
      return COLUMN3_CI;
    case "column4":
      return COLUMN4_CI;
    case "column5":
      return COLUMN5_CI;
    default:
      return null;
  }
}
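Both stubs dispatch on ColumnPath.toDotString(), which joins the path segments with dots; for a top-level column the dot string is just the column name:

ColumnPath flat = ColumnPath.get("column1");
ColumnPath nested = ColumnPath.get("outer", "inner", "leaf");
System.out.println(flat.toDotString());    // column1
System.out.println(nested.toDotString());  // outer.inner.leaf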
 
Example #14
Source File: TestStatisticsFilter.java    From parquet-mr with Apache License 2.0
private static ColumnChunkMetaData getIntColumnMeta(org.apache.parquet.column.statistics.Statistics<?> stats,
    long valueCount) {
  return ColumnChunkMetaData.get(ColumnPath.get("int", "column"),
      PrimitiveTypeName.INT32,
      CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      stats,
      0L, 0L, valueCount, 0L, 0L);
}
 
Example #15
Source File: TestInputFormat.java    From parquet-mr with Apache License 2.0
public static BlockMetaData makeBlockFromStats(IntStatistics stats, long valueCount) {
  BlockMetaData blockMetaData = new BlockMetaData();

  ColumnChunkMetaData column = ColumnChunkMetaData.get(ColumnPath.get("foo"),
      PrimitiveTypeName.INT32,
      CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      stats,
      100L, 100L, valueCount, 100L, 100L);
  blockMetaData.addColumn(column);
  blockMetaData.setTotalByteSize(200L);
  blockMetaData.setRowCount(valueCount);
  return blockMetaData;
}
 
Example #16
Source File: ParquetUtil.java    From iceberg with Apache License 2.0
private static boolean shouldStoreBounds(ColumnPath columnPath, Schema schema) {
  Iterator<String> pathIterator = columnPath.iterator();
  Type currentType = schema.asStruct();

  while (pathIterator.hasNext()) {
    if (currentType == null || !currentType.isStructType()) {
      return false;
    }
    String fieldName = pathIterator.next();
    currentType = currentType.asStructType().fieldType(fieldName);
  }

  return currentType != null && currentType.isPrimitiveType();
}
 
Example #17
Source File: IncrementallyUpdatedFilterPredicateBuilderBase.java    From parquet-mr with Apache License 2.0
public IncrementallyUpdatedFilterPredicateBuilderBase(List<PrimitiveColumnIO> leaves) {
  for (PrimitiveColumnIO leaf : leaves) {
    ColumnDescriptor descriptor = leaf.getColumnDescriptor();
    ColumnPath path = ColumnPath.get(descriptor.getPath());
    PrimitiveComparator<?> comparator = descriptor.getPrimitiveType().comparator();
    comparatorsByColumn.put(path, comparator);
  }
}
 
Example #18
Source File: VectorizedArrowReader.java    From iceberg with Apache License 2.0
@Override
public void setRowGroupInfo(PageReadStore source, Map<ColumnPath, ColumnChunkMetaData> metadata) {
  ColumnChunkMetaData chunkMetaData = metadata.get(ColumnPath.get(columnDescriptor.getPath()));
  this.dictionary = vectorizedColumnIterator.setRowGroupInfo(
      source.getPageReader(columnDescriptor),
      !ParquetUtil.hasNonDictionaryPages(chunkMetaData));
}
 
Example #19
Source File: ColumnarBatchReader.java    From iceberg with Apache License 2.0
@Override
public final void setRowGroupInfo(PageReadStore pageStore, Map<ColumnPath, ColumnChunkMetaData> metaData) {
  for (VectorizedArrowReader reader : readers) {
    if (reader != null) {
      reader.setRowGroupInfo(pageStore, metaData);
    }
  }
}
 
Example #20
Source File: TestInputFormat.java    From parquet-mr with Apache License 2.0
private BlockMetaData newBlock(long start, long compressedBlockSize) {
  BlockMetaData blockMetaData = new BlockMetaData();
  long uncompressedSize = compressedBlockSize * 2; // assuming the compression ratio is 2
  ColumnChunkMetaData column = ColumnChunkMetaData.get(ColumnPath.get("foo"),
                                                       PrimitiveTypeName.BINARY,
                                                       CompressionCodecName.GZIP,
                                                       new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
                                                       new BinaryStatistics(),
                                                       start, 0L, 0L, compressedBlockSize, uncompressedSize);
  blockMetaData.addColumn(column);
  blockMetaData.setTotalByteSize(uncompressedSize);
  return blockMetaData;
}
 
Example #21
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private static ParquetMetadata createParquetMetaData(Encoding dicEncoding,
  Encoding dataEncoding) {
  MessageType schema =
    parseMessageType("message schema { optional int32 col (INT_32); }");
  org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData =
    new org.apache.parquet.hadoop.metadata.FileMetaData(schema,
      new HashMap<String, String>(), null);
  List<BlockMetaData> blockMetaDataList = new ArrayList<BlockMetaData>();
  BlockMetaData blockMetaData = new BlockMetaData();
  EncodingStats.Builder builder = new EncodingStats.Builder();
  if (dicEncoding != null) {
    builder.addDictEncoding(dicEncoding);
  }
  builder.addDataEncoding(dataEncoding);
  EncodingStats es = builder.build();
  Set<org.apache.parquet.column.Encoding> e =
    new HashSet<org.apache.parquet.column.Encoding>();
  PrimitiveTypeName t = PrimitiveTypeName.INT32;
  ColumnPath p = ColumnPath.get("col");
  CompressionCodecName c = CompressionCodecName.UNCOMPRESSED;
  BinaryStatistics s = new BinaryStatistics();
  ColumnChunkMetaData md =
    ColumnChunkMetaData.get(p, t, c, es, e, s, 20, 30, 0, 0, 0);
  blockMetaData.addColumn(md);
  blockMetaDataList.add(blockMetaData);
  return new ParquetMetadata(fileMetaData, blockMetaDataList);
}
 
Example #22
Source File: IncrementallyUpdatedFilterPredicateBuilderBase.java    From parquet-mr with Apache License 2.0
protected final void addValueInspector(ColumnPath columnPath, ValueInspector valueInspector) {
  List<ValueInspector> valueInspectors = valueInspectorsByColumn.get(columnPath);
  if (valueInspectors == null) {
    valueInspectors = new ArrayList<>();
    valueInspectorsByColumn.put(columnPath, valueInspectors);
  }
  valueInspectors.add(valueInspector);
}
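The get-or-create pattern above predates Java 8; on newer runtimes the same behavior could be written with Map.computeIfAbsent (a sketch, not the library's actual code):

protected final void addValueInspector(ColumnPath columnPath, ValueInspector valueInspector) {
  valueInspectorsByColumn
      .computeIfAbsent(columnPath, k -> new ArrayList<>())  // create the list on first use
      .add(valueInspector);
}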
 
Example #23
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private ColumnChunkMetaData createColumnChunkMetaData() {
  Set<org.apache.parquet.column.Encoding> e = new HashSet<org.apache.parquet.column.Encoding>();
  PrimitiveTypeName t = PrimitiveTypeName.BINARY;
  ColumnPath p = ColumnPath.get("foo");
  CompressionCodecName c = CompressionCodecName.GZIP;
  BinaryStatistics s = new BinaryStatistics();
  ColumnChunkMetaData md = ColumnChunkMetaData.get(p, t, c, e, s,
          0, 0, 0, 0, 0);
  return md;
}
 
Example #24
Source File: ColumnIndexStoreImpl.java    From parquet-mr with Apache License 2.0
static ColumnIndexStore create(ParquetFileReader reader, BlockMetaData block, Set<ColumnPath> paths) {
  try {
    return new ColumnIndexStoreImpl(reader, block, paths);
  } catch (MissingOffsetIndexException e) {
    return EMPTY;
  }
}
 
Example #25
Source File: FilteringGroupConverter.java    From parquet-mr with Apache License 2.0
public FilteringGroupConverter(
    GroupConverter delegate,
    List<Integer> indexFieldPath,
    Map<ColumnPath, List<ValueInspector>> valueInspectorsByColumn,
    Map<List<Integer>, PrimitiveColumnIO> columnIOsByIndexFieldPath) {

  this.delegate = Objects.requireNonNull(delegate, "delegate cannot be null");
  this.indexFieldPath = Objects.requireNonNull(indexFieldPath, "indexFieldPath cannot be null");
  this.columnIOsByIndexFieldPath = Objects.requireNonNull(columnIOsByIndexFieldPath, "columnIOsByIndexFieldPath cannot be null");
  this.valueInspectorsByColumn = Objects.requireNonNull(valueInspectorsByColumn, "valueInspectorsByColumn cannot be null");
}
 
Example #26
Source File: SchemaCompatibilityValidator.java    From parquet-mr with Apache License 2.0
private SchemaCompatibilityValidator(MessageType schema) {
  for (ColumnDescriptor cd : schema.getColumns()) {
    ColumnPath columnPath = ColumnPath.get(cd.getPath());
    columnsAccordingToSchema.put(columnPath, cd);
  }
}
 
Example #27
Source File: SchemaCompatibilityValidator.java    From parquet-mr with Apache License 2.0
private <T extends Comparable<T>> void validateColumn(Column<T> column) {
  ColumnPath path = column.getColumnPath();

  Class<?> alreadySeen = columnTypesEncountered.get(path);
  if (alreadySeen != null && !alreadySeen.equals(column.getColumnType())) {
    throw new IllegalArgumentException("Column: "
        + path.toDotString()
        + " was provided with different types in the same predicate."
        + " Found both: (" + alreadySeen + ", " + column.getColumnType() + ")");
  }

  if (alreadySeen == null) {
    columnTypesEncountered.put(path, column.getColumnType());
  }

  ColumnDescriptor descriptor = getColumnDescriptor(path);
  if (descriptor == null) {
    // the column is missing from the schema; evaluation calls updateNull()
    // when a value is missing, so this case is handled correctly.
    return;
  }

  if (descriptor.getMaxRepetitionLevel() > 0) {
    throw new IllegalArgumentException("FilterPredicates do not currently support repeated columns. "
        + "Column " + path.toDotString() + " is repeated.");
  }

  ValidTypeMap.assertTypeValid(column, descriptor.getType());
}
 
Example #28
Source File: TestColumnIndexFilter.java    From parquet-mr with Apache License 2.0
@Test
public void testFilteringOnMissingColumns() {
  Set<ColumnPath> paths = paths("column1", "column2", "column3", "column4");

  // Missing column filter is always true
  assertAllRows(calculateRowRanges(FilterCompat.get(
      notEq(intColumn("missing_column"), 0)),
      STORE, paths, TOTAL_ROW_COUNT),
      TOTAL_ROW_COUNT);
  assertRows(calculateRowRanges(FilterCompat.get(
      and(
          and(
              gtEq(intColumn("column1"), 7),
              lt(intColumn("column1"), 11)),
          eq(binaryColumn("missing_column"), null))),
      STORE, paths, TOTAL_ROW_COUNT),
      7, 8, 9, 10, 11, 12, 13);

  // Missing column filter is always false
  assertRows(calculateRowRanges(FilterCompat.get(
      or(
          and(
              gtEq(intColumn("column1"), 7),
              lt(intColumn("column1"), 11)),
          notEq(binaryColumn("missing_column"), null))),
      STORE, paths, TOTAL_ROW_COUNT),
      7, 8, 9, 10, 11, 12, 13);
  assertRows(calculateRowRanges(FilterCompat.get(
      gt(intColumn("missing_column"), 0)),
      STORE, paths, TOTAL_ROW_COUNT));
}
 
Example #29
Source File: TestColumnIndexFilter.java    From parquet-mr with Apache License 2.0
@Test
public void testFilteringWithMissingOffsetIndex() {
  Set<ColumnPath> paths = paths("column1", "column2", "column3", "column4", "column_wo_oi");

  assertAllRows(calculateRowRanges(FilterCompat.get(
      and(
          and(
              gtEq(intColumn("column1"), 7),
              lt(intColumn("column1"), 11)),
          and(
              gt(binaryColumn("column2"), fromString("Romeo")),
              ltEq(binaryColumn("column_wo_oi"), fromString("Tango"))))),
      STORE, paths, TOTAL_ROW_COUNT),
      TOTAL_ROW_COUNT);
}
 
Example #30
Source File: ColumnIndexValidator.java    From parquet-mr with Apache License 2.0
public ContractViolation(Contract violatedContract, String referenceValue, String offendingValue,
    int rowGroupNumber, int columnNumber, ColumnPath columnPath, int pageNumber) {
  this.violatedContract = violatedContract;
  this.referenceValue = referenceValue;
  this.offendingValue = offendingValue;
  this.rowGroupNumber = rowGroupNumber;
  this.columnNumber = columnNumber;
  this.columnPath = columnPath;
  this.pageNumber = pageNumber;
}