org.apache.parquet.format.SchemaElement Java Examples

The following examples show how to use org.apache.parquet.format.SchemaElement. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ParquetReaderUtility.java    From Bats with Apache License 2.0 6 votes vote down vote up
/**
 * Registers the next schema element supplied by the iterator under its back-quoted, lower-cased full path
 * and then recurses into its children, so the whole subtree below that element ends up in the map.
 *
 * @param iter file schema values iterator; its next element is the one that gets registered
 * @param path path prefix inherited from the parent element (empty for a top-level element)
 * @param schemaElements full path to schema element map that receives the new entries
 */
private static void addSchemaElementMapping(Iterator<SchemaElement> iter, StringBuilder path,
    Map<String, SchemaElement> schemaElements) {
  SchemaElement current = iter.next();
  path.append('`').append(current.getName().toLowerCase()).append('`');
  schemaElements.put(path.toString(), current);

  // Children are pulled from the same iterator right after their parent. Count down the declared
  // number of children so this recursion level ends once they are all consumed, or earlier if the
  // iterator is exhausted first.
  for (int pending = current.getNum_children(); pending > 0 && iter.hasNext(); pending--) {
    // every child gets its own copy of the prefix, so siblings do not see each other's names
    addSchemaElementMapping(iter, new StringBuilder(path).append('.'), schemaElements);
  }
}
 
Example #2
Source File: ParquetReaderUtility.java    From Bats with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a lookup from full schema path, written as `a`.`b`.`c`, to the matching SchemaElement.
 *
 * @param footer Parquet file metadata
 * @return       map of full schema path to SchemaElement
 */
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  Map<String, SchemaElement> pathToElement = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
  Iterator<SchemaElement> elements = fileMetaData.getSchema().iterator();

  if (!elements.hasNext()) {
    return pathToElement;
  }

  // The first element of the collection is the default `root` element. It is dropped so that keys look like
  // `a` rather than `root`.`a` and need no trimming when compared with a SchemaPath string representation.
  elements.next();

  // each call consumes one top-level element together with all of its descendants
  while (elements.hasNext()) {
    addSchemaElementMapping(elements, new StringBuilder(), pathToElement);
  }
  return pathToElement;
}
 
Example #3
Source File: ColumnReader.java    From dremio-oss with Apache License 2.0 6 votes vote down vote up
protected ColumnReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
                       ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v, SchemaElement schemaElement) throws ExecutionSetupException {
  this.parentReader = parentReader;
  this.columnDescriptor = descriptor;
  this.columnChunkMetaData = columnChunkMetaData;
  this.isFixedLength = fixedLength;
  this.schemaElement = schemaElement;
  this.valueVec =  v;
  this.pageReader = (parentReader.getSingleStream() != null)?
    new DeprecatedSingleStreamPageReader(this, parentReader.getSingleStream(), parentReader.getFsPath(), columnChunkMetaData) :
    new PageReader(this, parentReader.getFileSystem(), parentReader.getFsPath(), columnChunkMetaData);

  if (columnDescriptor.getType() != PrimitiveType.PrimitiveTypeName.BINARY) {
    if (columnDescriptor.getType() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
      dataTypeLengthInBits = columnDescriptor.getTypeLength() * 8;
    } else if (columnDescriptor.getType() == PrimitiveTypeName.INT96
      && valueVec instanceof TimeStampMilliVector) {
      // if int 96 column is being read as a Timestamp, this truncates the time format used by Impala
      // dataTypeLengthInBits is only ever used when computing offsets into the destination vector, so it
      // needs to be set to the bit width of the resulting Arrow type, usually this matches the input length
      dataTypeLengthInBits = 64;
    } else {
      dataTypeLengthInBits = DeprecatedParquetVectorizedReader.getTypeLengthInBits(columnDescriptor.getType());
    }
  }
}
 
Example #4
Source File: VarLengthValuesColumn.java    From Bats with Apache License 2.0 6 votes vote down vote up
VarLengthValuesColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                      ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v,
                      SchemaElement schemaElement) throws ExecutionSetupException {

  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  variableWidthVector = (VariableWidthVector) valueVec;

  if (columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY)) {
    usingDictionary = true;
    // We didn't implement the fixed length optimization when a Parquet Dictionary is used; as there are
    // no data point about this use-case. Will also enable bulk processing by default since early data
    // profiling (for detecting the best processing strategy to use) is disabled when the column precision
    // is already set.
    bulkReaderState.columnPrecInfo.columnPrecisionType = ColumnPrecisionType.DT_PRECISION_IS_VARIABLE;
    bulkReaderState.columnPrecInfo.bulkProcess         = true;
  }
  else {
    usingDictionary = false;
  }
}
 
Example #5
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Builds file metadata with an empty schema and one row group per entry of {@code sizes}. Each row
 * group holds a single INT32, UNCOMPRESSED column chunk that starts at the running sum of the
 * preceding sizes.
 *
 * @param sizes size of each row group, in order
 * @return file metadata containing {@code sizes.length} row groups
 */
private FileMetaData metadata(long... sizes) {
  List<RowGroup> rowGroups = new ArrayList<>();
  long offset = 0;
  for (int i = 0; i < sizes.length; i++) {
    final long size = sizes[i];
    ColumnMetaData columnMetaData = new ColumnMetaData(
        INT32,
        Collections.<org.apache.parquet.format.Encoding>emptyList(),
        Collections.<String>emptyList(),
        UNCOMPRESSED, 10L, size * 2, size, offset);
    ColumnChunk columnChunk = new ColumnChunk(offset);
    columnChunk.setMeta_data(columnMetaData);
    rowGroups.add(new RowGroup(Arrays.asList(columnChunk), size, 1));
    // the next chunk begins where this one ends
    offset += size;
  }
  return new FileMetaData(1, Collections.<SchemaElement>emptyList(), sizes.length, rowGroups);
}
 
Example #6
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that when a schema element's converted type disagrees with its logical type, the
 * converted type is the one that shows up after the schema is read back.
 */
@Test
public void testIncompatibleLogicalAndConvertedTypes() {
  ParquetMetadataConverter converter = new ParquetMetadataConverter();

  // schema as written: one required BINARY column annotated as DECIMAL(9, 2)
  MessageType written = Types.buildMessage()
    .required(PrimitiveTypeName.BINARY)
    .as(OriginalType.DECIMAL).precision(9).scale(2)
    .named("aBinary")
    .named("Message");
  List<SchemaElement> elements = converter.toParquetSchema(written);

  // Force the converted type of the element at index 1 to JSON so that it no longer matches the
  // logical type; on mismatch the converted type is expected to override the logical type.
  elements.get(1).setConverted_type(ConvertedType.JSON);
  MessageType actual = converter.fromParquetSchema(elements, null);

  MessageType expected = Types.buildMessage()
    .required(PrimitiveTypeName.BINARY)
    .as(LogicalTypeAnnotation.jsonType())
    .named("aBinary")
    .named("Message");
  assertEquals(expected, actual);
}
 
Example #7
Source File: ParquetToMinorTypeConverter.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Resolves the minor type for a Parquet primitive column and wraps it, together with the data mode,
 * in a major type. Precision and scale are copied from the schema element for decimal types only.
 *
 * @return the assembled major type
 */
public static TypeProtos.MajorType toMajorType(PrimitiveType.PrimitiveTypeName primitiveTypeName, int length,
                                        TypeProtos.DataMode mode, SchemaElement schemaElement,
                                        OptionManager options, Field arrowField, final boolean readInt96AsTimeStamp) {
  final MinorType resolved = getMinorType(primitiveTypeName, length, schemaElement, options, arrowField, readInt96AsTimeStamp);

  final TypeProtos.MajorType.Builder majorType = TypeProtos.MajorType.newBuilder();
  majorType.setMinorType(resolved);
  majorType.setMode(mode);

  if (CoreDecimalUtility.isDecimalType(resolved)) {
    majorType.setPrecision(schemaElement.getPrecision());
    majorType.setScale(schemaElement.getScale());
  }
  return majorType.build();
}
 
Example #8
Source File: VarLengthColumnReaders.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to the superclass constructor, then keeps the typed nullable decimal vector and its mutator.
 */
NullableVarDecimalColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                        ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, NullableVarDecimalVector v,
                        SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  this.mutator = v.getMutator();
  this.nullableVarDecimalVector = v;
}
 
Example #9
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that a schema whose elements carry only a converted type (no logical type) is read back
 * to the same message type it was produced from.
 */
@Test
public void testLogicalTypesBackwardCompatibleWithConvertedTypes() {
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  MessageType original = Types.buildMessage()
    .required(PrimitiveTypeName.BINARY)
    .as(OriginalType.DECIMAL).precision(9).scale(2)
    .named("aBinaryDecimal")
    .named("Message");

  List<SchemaElement> elements = converter.toParquetSchema(original);
  // Clear the logical type of the element at index 1 to imitate files written by the older API,
  // which stored converted_types in the metadata but no logicalType.
  elements.get(1).setLogicalType(null);

  MessageType roundTripped = converter.fromParquetSchema(elements, null);
  assertEquals(original, roundTripped);
}
 
Example #10
Source File: VarLengthColumnReaders.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to the superclass constructor, then keeps the typed VarChar vector and its mutator.
 */
VarCharColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
              ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, VarCharVector v,
              SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);

  this.mutator = v.getMutator();
  this.varCharVector = v;
}
 
Example #11
Source File: VarLengthColumn.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to the superclass constructor and records whether the column chunk lists
 * PLAIN_DICTIONARY among its encodings.
 */
VarLengthColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v,
                SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  usingDictionary = columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY);
}
 
Example #12
Source File: VarLengthColumnReaders.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to the superclass constructor, then keeps the typed decimal vector and its mutator.
 */
VarDecimalColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, VarDecimalVector v,
                SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);

  this.mutator = v.getMutator();
  this.varDecimalVector = v;
}
 
Example #13
Source File: NullableFixedByteAlignedReaders.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the reader by forwarding every argument to the superclass constructor and keeping a
 * typed reference to the destination date vector.
 */
CorruptionDetectingNullableDateReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize,
    ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData,
    boolean fixedLength, DateMilliVector v, SchemaElement schemaElement)
        throws ExecutionSetupException {
  super(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  // same object that was handed to the superclass, kept here with its concrete DateMilliVector type
  dateVector = v;
}
 
Example #14
Source File: VarLengthColumnReaders.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to the superclass constructor, then keeps the typed nullable VarBinary vector and its mutator.
 */
NullableVarBinaryColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                        ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, NullableVarBinaryVector v,
                        SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);

  this.mutator = v.getMutator();
  this.nullableVarBinaryVector = v;
}
 
Example #15
Source File: VarLengthColumnReaders.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to the superclass constructor, then keeps the typed VarBinary vector and its mutator.
 */
VarBinaryColumn(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, VarBinaryVector v,
                SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  this.mutator = v.getMutator();
  this.varBinaryVector = v;
}
 
Example #16
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Checks the schema elements produced for a message with a required BINARY decimal column and an
 * optional FIXED_LEN_BYTE_ARRAY (length 4) decimal column, both declared with precision 9 and scale 2.
 */
@Test
public void testSchemaConverterDecimal() {
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  MessageType message = Types.buildMessage()
      .required(PrimitiveTypeName.BINARY)
          .as(OriginalType.DECIMAL).precision(9).scale(2)
          .named("aBinaryDecimal")
      .optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(4)
          .as(OriginalType.DECIMAL).precision(9).scale(2)
          .named("aFixedDecimal")
      .named("Message");
  List<SchemaElement> actual = converter.toParquetSchema(message);

  // the message itself comes first and announces its two columns
  SchemaElement root = new SchemaElement("Message").setNum_children(2);

  // NOTE(review): DecimalType is built with (2, 9) while precision is 9 and scale is 2, so its
  // arguments are presumably (scale, precision) - confirm against the generated class.
  SchemaElement binaryDecimal = new SchemaElement("aBinaryDecimal")
      .setRepetition_type(FieldRepetitionType.REQUIRED)
      .setType(Type.BYTE_ARRAY)
      .setConverted_type(ConvertedType.DECIMAL)
      .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9)))
      .setPrecision(9).setScale(2);
  SchemaElement fixedDecimal = new SchemaElement("aFixedDecimal")
      .setRepetition_type(FieldRepetitionType.OPTIONAL)
      .setType(Type.FIXED_LEN_BYTE_ARRAY)
      .setType_length(4)
      .setConverted_type(ConvertedType.DECIMAL)
      .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9)))
      .setPrecision(9).setScale(2);

  List<SchemaElement> expected = Lists.newArrayList(root, binaryDecimal, fixedDecimal);
  Assert.assertEquals(expected, actual);
}
 
Example #17
Source File: ParquetColumnMetadata.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Looks up this column's schema element by full column path and derives the major type, the
 * materialized field and the data type length from it. The statements run in this order because
 * each later one uses what an earlier one has set.
 *
 * @param schemaElements full column path to schema element map
 * @param options option manager passed through to the type conversion
 */
public void resolveDrillType(Map<String, SchemaElement> schemaElements, OptionManager options) {
  final String fullPath = ParquetReaderUtility.getFullColumnPath(column);
  se = schemaElements.get(fullPath);

  type = ParquetToDrillTypeConverter.toMajorType(
      column.getType(), column.getTypeLength(), getDataMode(column), se, options);

  // the field is named after the last segment of the column path
  final String fieldName = toFieldName(column.getPath()).getLastSegment().getNameSegment().getPath();
  field = MaterializedField.create(fieldName, type);

  length = getDataTypeLength();
}
 
Example #18
Source File: ColumnReaderFactory.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Picks the variable-length column reader for a column. Columns with a maximum definition level of 0
 * get the plain readers, all others the nullable ones. Within each group the converted type selects
 * VarChar (UTF8), decimal (DECIMAL) or, for a missing or any other converted type, VarBinary.
 *
 * @return the reader matching the column's definition level and converted type
 */
static VarLengthValuesColumn<?> getReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
                                          ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, ValueVector v,
                                          SchemaElement schemaElement
) throws ExecutionSetupException {
  final ConvertedType convertedType = schemaElement.getConverted_type();
  final boolean required = descriptor.getMaxDefinitionLevel() == 0;

  if (required) {
    if (convertedType == ConvertedType.UTF8) {
      return new VarLengthColumnReaders.VarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
    }
    if (convertedType == ConvertedType.DECIMAL) {
      return new VarLengthColumnReaders.Decimal28Column(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
    }
    // no converted type, or one without a dedicated reader
    return new VarLengthColumnReaders.VarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
  }

  if (convertedType == ConvertedType.UTF8) {
    return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
  }
  if (convertedType == ConvertedType.DECIMAL) {
    return new NullableDecimalColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
  }
  // no converted type, or one without a dedicated reader
  return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
}
 
Example #19
Source File: ColumnReaderFactory.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Picks the variable-length column reader for a column. Columns with a maximum definition level of 0
 * get the plain readers, all others the nullable ones. UTF8 and ENUM map to the VarChar readers.
 * DECIMAL maps to the var-decimal readers only when the supplied vector has the matching decimal
 * vector type. Everything else, including a missing converted type, maps to the VarBinary readers.
 *
 * @return the reader matching the column's definition level, converted type and vector type
 */
static VarLengthValuesColumn<?> getReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                                        ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, ValueVector v,
                                        SchemaElement schemaElement
) throws ExecutionSetupException {
  final ConvertedType convertedType = schemaElement.getConverted_type();
  final boolean required = descriptor.getMaxDefinitionLevel() == 0;
  final boolean textual = convertedType == ConvertedType.UTF8 || convertedType == ConvertedType.ENUM;
  final boolean decimal = convertedType == ConvertedType.DECIMAL;

  if (required) {
    if (textual) {
      return new VarLengthColumnReaders.VarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
    }
    if (decimal && v instanceof VarDecimalVector) {
      return new VarLengthColumnReaders.VarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarDecimalVector) v, schemaElement);
    }
    // also reached for DECIMAL columns whose vector is not a VarDecimalVector
    return new VarLengthColumnReaders.VarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
  }

  if (textual) {
    return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarCharVector) v, schemaElement);
  }
  if (decimal && v instanceof NullableVarDecimalVector) {
    return new VarLengthColumnReaders.NullableVarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) v, schemaElement);
  }
  // also reached for DECIMAL columns whose vector is not a NullableVarDecimalVector
  return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
}
 
Example #20
Source File: NullableColumnReader.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to the superclass constructor, then keeps two casted views of the value vector: the
 * vector itself as a BaseDataValueVector and its mutator as a NullableVectorDefinitionSetter.
 */
NullableColumnReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData,
             boolean fixedLength, V v, SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);

  // both casts happen once here, at construction time
  this.castedBaseVector = (BaseDataValueVector) v;
  this.castedVectorMutator = (NullableVectorDefinitionSetter) v.getMutator();
}
 
Example #21
Source File: ParquetToDrillTypeConverter.java    From Bats with Apache License 2.0 5 votes vote down vote up
/**
 * Resolves the minor type for a Parquet primitive column from its converted type and wraps it,
 * together with the data mode, in a major type. Precision and scale are copied from the schema
 * element for decimal types only.
 *
 * @return the assembled major type
 */
public static TypeProtos.MajorType toMajorType(PrimitiveType.PrimitiveTypeName primitiveTypeName, int length,
                                        TypeProtos.DataMode mode, SchemaElement schemaElement,
                                        OptionManager options) {
  final MinorType resolved = getMinorType(primitiveTypeName, length, schemaElement.getConverted_type(), options);

  final TypeProtos.MajorType.Builder majorType = TypeProtos.MajorType.newBuilder();
  majorType.setMinorType(resolved);
  majorType.setMode(mode);

  if (Types.isDecimalType(resolved)) {
    majorType.setPrecision(schemaElement.getPrecision());
    majorType.setScale(schemaElement.getScale());
  }
  return majorType.build();
}
 
Example #22
Source File: MetadataReader.java    From presto with Apache License 2.0 5 votes vote down vote up
/**
 * Rebuilds a message type from a list of schema elements. The first element is taken as the root:
 * it supplies the message name and the number of children to read from the rest of the list.
 *
 * @param schema schema elements; the first one is consumed as the root
 * @return message type named after the root element
 */
private static MessageType readParquetSchema(List<SchemaElement> schema)
{
    Iterator<SchemaElement> elements = schema.iterator();
    SchemaElement root = elements.next();

    Types.MessageTypeBuilder messageBuilder = Types.buildMessage();
    int childCount = root.getNum_children();
    // consumes the remaining elements from the shared iterator
    readTypeSchema(messageBuilder, elements, childCount);

    return messageBuilder.named(root.name);
}
 
Example #23
Source File: ParquetReaderUtility.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Indexes every schema element of the file by its plain name. Elements are inserted in schema
 * order, so if two elements share a name the later one replaces the earlier one.
 *
 * @param footer Parquet file metadata
 * @return map of element name to schema element
 */
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  final Map<String, SchemaElement> byName = new HashMap<>();
  final FileMetaData converted =
      new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);

  for (SchemaElement element : converted.getSchema()) {
    byName.put(element.getName(), element);
  }
  return byName;
}
 
Example #24
Source File: VarLengthColumn.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Delegates to the superclass constructor and records whether the column chunk lists
 * PLAIN_DICTIONARY among its encodings.
 */
VarLengthColumn(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
                ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v,
                SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  usingDictionary = columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY);
}
 
Example #25
Source File: ParquetReaderUtility.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Detect corrupt date values by looking at the min/max values in the metadata.
 *
 * This should only be used when a file does not have enough metadata to determine if
 * the data was written with an older version of Drill, or an external tool. Drill
 * versions 1.3 and beyond should have enough metadata to confirm that the data was written
 * by Drill.
 *
 * This method only checks the first Row Group, because Drill has only ever written
 * a single Row Group per file.
 *
 * @param footer Parquet file metadata; its schema, its column-name to schema-element mapping and the
 *               column chunk statistics of its first row group are inspected
 * @param columns columns to check. A Parquet column is examined when the first segment of its path
 *                matches the root segment of one of these paths, or unconditionally when
 *                {@code columns} is a star query
 * @param autoCorrectCorruptDates user setting to allow enabling/disabling of auto-correction
 *                                of corrupt dates. There are some rare cases (storing dates thousands
 *                                of years into the future, with tools other than Drill writing files)
 *                                that would result in the date values being "corrected" into bad values.
 * @return {@code META_SHOWS_CORRUPTION} as soon as an examined DATE column has statistics with a
 *         non-null value and a max above {@code DATE_CORRUPTION_THRESHOLD};
 *         {@code META_UNCLEAR_TEST_VALUES} as soon as an examined DATE column has statistics without a
 *         non-null value; {@code META_SHOWS_NO_CORRUPTION} when auto-correction is disabled or when
 *         neither of the above was hit
 */
public static DateCorruptionStatus checkForCorruptDateValuesInStatistics(ParquetMetadata footer,
                                                            List<SchemaPath> columns,
                                                            boolean autoCorrectCorruptDates) {
  // Users can turn-off date correction in cases where we are detecting corruption based on the date values
  // that are unlikely to appear in common datasets. In this case report that no correction needs to happen
  // during the file read
  if (! autoCorrectCorruptDates) {
    return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
  }
  // Drill produced files have only ever had a single row group, if this changes in the future it won't matter
  // as we will know from the Drill version written in the files that the dates are correct
  int rowGroupIndex = 0;
  Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
  // NOTE(review): the label below is never the target of a break/continue inside this method
  findDateColWithStatsLoop : for (SchemaPath schemaPath : columns) {
    List<ColumnDescriptor> parquetColumns = footer.getFileMetaData().getSchema().getColumns();
    for (int i = 0; i < parquetColumns.size(); ++i) {
      ColumnDescriptor column = parquetColumns.get(i);
      // this reader only supports flat data, this is restricted in the ParquetScanBatchCreator
      // creating a NameSegment makes sure we are using the standard code for comparing names,
      // currently it is all case-insensitive
      if (ColumnUtils.isStarQuery(columns) || new PathSegment.NameSegment(column.getPath()[0]).equals(schemaPath.getRootSegment())) {
        // stays -1 unless this is a DATE column with a matching column chunk in the row group
        int colIndex = -1;
        // NOTE(review): the map lookup result is dereferenced without a null check; this assumes
        // every first path segment is present as a key in schemaElements - confirm
        ConvertedType convertedType = schemaElements.get(column.getPath()[0]).getConverted_type();
        if (convertedType != null && convertedType.equals(ConvertedType.DATE)) {
          List<ColumnChunkMetaData> colChunkList = footer.getBlocks().get(rowGroupIndex).getColumns();
          // find the position of this column's chunk inside the row group by comparing full paths
          for (int j = 0; j < colChunkList.size(); j++) {
            if (colChunkList.get(j).getPath().equals(ColumnPath.get(column.getPath()))) {
              colIndex = j;
              break;
            }
          }
        }
        if (colIndex == -1) {
          // not a DATE column, or the column does not appear in this file: skip it
          continue;
        }
        Statistics statistics = footer.getBlocks().get(rowGroupIndex).getColumns().get(colIndex).getStatistics();
        // NOTE(review): the cast assumes the statistics of a DATE column report their max as an Integer - confirm
        Integer max = (Integer) statistics.genericGetMax();
        if (statistics.hasNonNullValue()) {
          if (max > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
            return DateCorruptionStatus.META_SHOWS_CORRUPTION;
          }
          // max is within the threshold: keep scanning the remaining columns
        } else {
          // no statistics, go check the first page
          return DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
        }
      }
    }
  }
  return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
}
 
Example #26
Source File: VarLengthColumnReaders.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the reader by forwarding every argument to the superclass constructor and keeping a
 * typed reference to the destination decimal vector.
 */
Decimal28Column(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
                ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, DecimalVector v,
                SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  // same object that was handed to the superclass, kept here with its concrete DecimalVector type
  this.decimalVector = v;
}
 
Example #27
Source File: BitReader.java    From Bats with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the reader by forwarding every argument unchanged to the superclass constructor; this
 * constructor sets no state of its own.
 */
BitReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData,
          boolean fixedLength, BitVector v, SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
}
 
Example #28
Source File: NullableVarLengthValuesColumn.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the column reader by forwarding every argument unchanged to the superclass constructor;
 * this constructor sets no state of its own.
 */
NullableVarLengthValuesColumn(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor,
                              ColumnChunkMetaData columnChunkMetaData, boolean fixedLength, V v,
                              SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
}
 
Example #29
Source File: FixedByteAlignedReader.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the reader by forwarding every argument to the superclass constructor and keeping a
 * typed reference to the destination date vector.
 */
CorruptDateReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize, ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData,
                  boolean fixedLength, DateMilliVector v, SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
  // same object that was handed to the superclass, kept here with its concrete DateMilliVector type
  vector = v;
}
 
Example #30
Source File: FixedByteAlignedReader.java    From Bats with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the reader by forwarding every argument unchanged to the superclass constructor; this
 * constructor sets no state of its own.
 */
IntervalReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData,
               boolean fixedLength, IntervalVector v, SchemaElement schemaElement) throws ExecutionSetupException {
  super(parentReader, descriptor, columnChunkMetaData, fixedLength, v, schemaElement);
}