org.apache.parquet.io.ParquetDecodingException Java Examples

The following examples show how to use org.apache.parquet.io.ParquetDecodingException. They are drawn from open-source projects, including parquet-mr, Apache Iceberg, Apache Flink, and Presto; each example notes its source file, originating project, and license.
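
ParquetDecodingException is an unchecked exception that Parquet readers throw when pages, dictionaries, or repetition/definition levels cannot be decoded. Before the examples, here is a minimal, hypothetical sketch of where it typically surfaces when reading records with parquet-mr's ParquetReader; the file path is a placeholder and the builder API may differ slightly between versions.

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.io.ParquetDecodingException;

public class ReadSketch {
  public static void main(String[] args) throws Exception {
    Path file = new Path("/tmp/data.parquet"); // placeholder path
    try (ParquetReader<Group> reader =
        ParquetReader.builder(new GroupReadSupport(), file).build()) {
      Group record;
      while ((record = reader.read()) != null) {
        System.out.println(record);
      }
    } catch (ParquetDecodingException e) {
      // Raised while materializing records from corrupt or unreadable pages.
      System.err.println("Could not decode Parquet data: " + e.getMessage());
    }
  }
}
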
Example #1
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
/**
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException if there is an error while reading
 * @deprecated split planning using file footers will be removed
 */
@Deprecated
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers) throws IOException {
  boolean strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
  final long minSplitSize = Math.max(getFormatMinSplitSize(), configuration.getLong("mapred.min.split.size", 0L));
  if (maxSplitSize < 0 || minSplitSize < 0) {
    throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = " + maxSplitSize + "; minSplitSize = " + minSplitSize);
  }
  GlobalMetaData globalMetaData = ParquetFileWriter.getGlobalMetaData(footers, strictTypeChecking);
  ReadContext readContext = getReadSupport(configuration).init(new InitContext(
      configuration,
      globalMetaData.getKeyValueMetaData(),
      globalMetaData.getSchema()));

  return new ClientSideMetadataSplitStrategy().getSplits(
      configuration, footers, maxSplitSize, minSplitSize, readContext);
}
 
Example #2
Source File: TupleReadSupport.java    From parquet-mr with Apache License 2.0
/**
 * @param fileSchema the parquet schema from the file
 * @param keyValueMetaData the extra meta data from the files
 * @return the pig schema according to the file
 */
static Schema getPigSchemaFromMultipleFiles(MessageType fileSchema, Map<String, Set<String>> keyValueMetaData) {
  Set<String> pigSchemas = PigMetaData.getPigSchemas(keyValueMetaData);
  if (pigSchemas == null) {
    return pigSchemaConverter.convert(fileSchema);
  }
  Schema mergedPigSchema = null;
  for (String pigSchemaString : pigSchemas) {
    try {
      mergedPigSchema = union(mergedPigSchema, parsePigSchema(pigSchemaString));
    } catch (FrontendException e) {
      throw new ParquetDecodingException("can not merge " + pigSchemaString + " into " + mergedPigSchema, e);
    }
  }
  return mergedPigSchema;
}
 
Example #3
Source File: UnmaterializableRecordCounter.java    From parquet-mr with Apache License 2.0
public void incErrors(RecordMaterializationException cause) throws ParquetDecodingException {
  numErrors++;

  LOG.warn(String.format("Error while reading an input record (%s out of %s): ",
      numErrors, totalNumRecords), cause);

  if (numErrors > 0 && errorThreshold <= 0) { // no errors are tolerated
    throw new ParquetDecodingException("Error while decoding records", cause);
  }

  double errRate = numErrors/(double)totalNumRecords;

  if (errRate > errorThreshold) {
    String message = String.format("Decoding error rate of at least %s/%s crosses configured threshold of %s",
        numErrors, totalNumRecords, errorThreshold);
    LOG.error(message);
    throw new ParquetDecodingException(message, cause);
  }
}
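
The errorThreshold checked above is the fraction of input records that may fail to materialize before the whole read is aborted. A brief sketch of raising that tolerance, assuming the parquet.read.bad.record.threshold key read by UnmaterializableRecordCounter (verify the key against your parquet-mr version):

import org.apache.hadoop.conf.Configuration;

public class BadRecordThresholdSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Tolerate up to 1% of records failing to materialize before aborting;
    // the property key is an assumption, check your parquet-mr version.
    conf.setFloat("parquet.read.bad.record.threshold", 0.01f);
  }
}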
 
Example #4
Source File: ColumnReaderBase.java    From parquet-mr with Apache License 2.0
private void readPageV1(DataPageV1 page) {
  ValuesReader rlReader = page.getRlEncoding().getValuesReader(path, REPETITION_LEVEL);
  ValuesReader dlReader = page.getDlEncoding().getValuesReader(path, DEFINITION_LEVEL);
  this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader);
  this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader);
  int valueCount = page.getValueCount();
  try {
    BytesInput bytes = page.getBytes();
    LOG.debug("page size {} bytes and {} values", bytes.size(), valueCount);
    LOG.debug("reading repetition levels at 0");
    ByteBufferInputStream in = bytes.toInputStream();
    rlReader.initFromPage(valueCount, in);
    LOG.debug("reading definition levels at {}", in.position());
    dlReader.initFromPage(valueCount, in);
    LOG.debug("reading data at {}", in.position());
    initDataReader(page.getValueEncoding(), in, valueCount);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + path, e);
  }
  newPageInitialized(page);
}
 
Example #5
Source File: ColumnReaderBase.java    From parquet-mr with Apache License 2.0
/**
 * creates a reader for triplets
 * @param path the descriptor for the corresponding column
 * @param pageReader the underlying store to read from
 * @param converter a converter that materializes the values in this column in the current record
 * @param writerVersion writer version string from the Parquet file being read
 */
ColumnReaderBase(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter, ParsedVersion writerVersion) {
  this.path = Objects.requireNonNull(path, "path cannot be null");
  this.pageReader = Objects.requireNonNull(pageReader, "pageReader cannot be null");
  this.converter = Objects.requireNonNull(converter, "converter cannot be null");
  this.writerVersion = writerVersion;
  this.maxDefinitionLevel = path.getMaxDefinitionLevel();
  DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
  if (dictionaryPage != null) {
    try {
      this.dictionary = dictionaryPage.getEncoding().initDictionary(path, dictionaryPage);
      if (converter.hasDictionarySupport()) {
        converter.setDictionary(dictionary);
      }
    } catch (IOException e) {
      throw new ParquetDecodingException("could not decode the dictionary for " + path, e);
    }
  } else {
    this.dictionary = null;
  }
  this.totalValueCount = pageReader.getTotalValueCount();
  if (totalValueCount <= 0) {
    throw new ParquetDecodingException("totalValueCount '" + totalValueCount + "' <= 0");
  }
}
 
Example #6
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
/**
 * @param rowGroupMetadata metadata of the row group to check
 * @return true if the midpoint of the row group falls in a new HDFS block, in which case the
 *         currentHDFSBlock pointer is also advanced to the index of the block containing the row group;
 *         false if the midpoint is in the same HDFS block
 */
private boolean checkBelongingToANewHDFSBlock(BlockMetaData rowGroupMetadata) {
  boolean isNewHdfsBlock = false;
  long rowGroupMidPoint = rowGroupMetadata.getStartingPos() + (rowGroupMetadata.getCompressedSize() / 2);

  // if the midpoint is no longer in the current HDFS block, return true
  while (rowGroupMidPoint > getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex)) {
    isNewHdfsBlock = true;
    currentMidPointHDFSBlockIndex++;
    if (currentMidPointHDFSBlockIndex >= hdfsBlocks.length)
      throw new ParquetDecodingException("the row group is not in hdfs blocks in the file: midpoint of row groups is "
              + rowGroupMidPoint
              + ", the end of the hdfs block is "
              + getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex - 1));
  }

  while (rowGroupMetadata.getStartingPos() > getHDFSBlockEndingPosition(currentStartHdfsBlockIndex)) {
    currentStartHdfsBlockIndex++;
    if (currentStartHdfsBlockIndex >= hdfsBlocks.length)
      throw new ParquetDecodingException("The row group does not start in this file: row group offset is "
              + rowGroupMetadata.getStartingPos()
              + " but the end of hdfs blocks of file is "
              + getHDFSBlockEndingPosition(currentStartHdfsBlockIndex));
  }
  return isNewHdfsBlock;
}
 
Example #7
Source File: PageIterator.java    From iceberg with Apache License 2.0
RuntimeException handleRuntimeException(RuntimeException exception) {
  if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, valueEncoding) &&
      exception instanceof ArrayIndexOutOfBoundsException) {
    // this is probably PARQUET-246, which may happen if reading data with
    // MR because this can't be detected without reading all footers
    throw new ParquetDecodingException("Read failure possibly due to " +
        "PARQUET-246: try setting parquet.split.files to false",
        new ParquetDecodingException(
            String.format("Can't read value in column %s at value %d out of %d in current page. " +
                          "repetition level: %d, definition level: %d",
                desc, triplesRead, triplesCount, currentRL, currentDL),
            exception));
  }
  throw new ParquetDecodingException(
      String.format("Can't read value in column %s at value %d out of %d in current page. " +
                    "repetition level: %d, definition level: %d",
          desc, triplesRead, triplesCount, currentRL, currentDL),
      exception);
}
 
Example #8
Source File: BooleanColumnReader.java    From flink with Apache License 2.0
private boolean readBoolean() {
	if (bitOffset == 0) {
		try {
			currentByte = (byte) dataInputStream.read();
		} catch (IOException e) {
			throw new ParquetDecodingException("Failed to read a byte", e);
		}
	}

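	// PLAIN booleans are bit-packed LSB-first: bit 0 of each byte is the first value read.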
	boolean v = (currentByte & (1 << bitOffset)) != 0;
	bitOffset += 1;
	if (bitOffset == 8) {
		bitOffset = 0;
	}
	return v;
}
 
Example #9
Source File: ThriftRecordConverter.java    From parquet-mr with Apache License 2.0
public ElementConverter(String listName, List<TProtocol> listEvents,
                        GroupType repeatedType, ThriftField thriftElement) {
  this.listEvents = listEvents;
  this.elementEvents = new ArrayList<TProtocol>();
  Type elementType = repeatedType.getType(0);
  if (elementType.isRepetition(Type.Repetition.OPTIONAL)) {
    if (ignoreNullElements) {
      LOG.warn("List " + listName +
          " has optional elements: null elements are ignored.");
    } else {
      throw new ParquetDecodingException("Cannot read list " + listName +
          " with optional elements: set " + IGNORE_NULL_LIST_ELEMENTS +
          " to ignore nulls.");
    }
  }
  elementConverter = newConverter(elementEvents, elementType, thriftElement);
}
 
Example #10
Source File: ParquetTypeUtils.java    From presto with Apache License 2.0
@SuppressWarnings("deprecation")
public static ParquetEncoding getParquetEncoding(Encoding encoding)
{
    switch (encoding) {
        case PLAIN:
            return ParquetEncoding.PLAIN;
        case RLE:
            return ParquetEncoding.RLE;
        case BIT_PACKED:
            return ParquetEncoding.BIT_PACKED;
        case PLAIN_DICTIONARY:
            return ParquetEncoding.PLAIN_DICTIONARY;
        case DELTA_BINARY_PACKED:
            return ParquetEncoding.DELTA_BINARY_PACKED;
        case DELTA_LENGTH_BYTE_ARRAY:
            return ParquetEncoding.DELTA_LENGTH_BYTE_ARRAY;
        case DELTA_BYTE_ARRAY:
            return ParquetEncoding.DELTA_BYTE_ARRAY;
        case RLE_DICTIONARY:
            return ParquetEncoding.RLE_DICTIONARY;
        default:
            throw new ParquetDecodingException("Unsupported Parquet encoding: " + encoding);
    }
}
 
Example #11
Source File: RunLengthBitPackingHybridDecoder.java    From parquet-mr with Apache License 2.0
public int readInt() throws IOException {
  if (currentCount == 0) {
    readNext();
  }
  --currentCount;
  int result;
  switch (mode) {
  case RLE:
    result = currentValue;
    break;
  case PACKED:
    result = currentBuffer[currentBuffer.length - 1 - currentCount];
    break;
  default:
    throw new ParquetDecodingException("not a valid mode " + mode);
  }
  return result;
}
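
To see readInt() in action, the following self-contained sketch hand-encodes a single RLE run per the Parquet hybrid format (a varint header of runLength << 1, then the repeated value padded to whole bytes) and decodes it; the bit width and values are illustrative, and the decoder's constructor signature is assumed from parquet-column.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder;

public class RleDecodeSketch {
  public static void main(String[] args) throws IOException {
    // One RLE run: ten copies of the value 5 at bit width 3.
    byte[] encoded = { (byte) (10 << 1), 5 };
    RunLengthBitPackingHybridDecoder decoder =
        new RunLengthBitPackingHybridDecoder(3, new ByteArrayInputStream(encoded));
    for (int i = 0; i < 10; i++) {
      System.out.print(decoder.readInt() + " "); // prints "5 " ten times
    }
  }
}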
 
Example #12
Source File: ParquetRecordReader.java    From parquet-mr with Apache License 2.0
private void checkDeltaByteArrayProblem(FileMetaData meta, Configuration conf, BlockMetaData block) {
  // splitting files?
  if (conf.getBoolean(ParquetInputFormat.SPLIT_FILES, true)) {
    // this is okay if not using DELTA_BYTE_ARRAY with the bug
    Set<Encoding> encodings = new HashSet<Encoding>();
    for (ColumnChunkMetaData column : block.getColumns()) {
      encodings.addAll(column.getEncodings());
    }
    for (Encoding encoding : encodings) {
      if (CorruptDeltaByteArrays.requiresSequentialReads(meta.getCreatedBy(), encoding)) {
        throw new ParquetDecodingException("Cannot read data due to " +
            "PARQUET-246: to read safely, set " + SPLIT_FILES + " to false");
      }
    }
  }
}
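
The workaround the exception message suggests can be applied in the job configuration before the read starts; SPLIT_FILES is ParquetInputFormat's "parquet.split.files" flag. A minimal sketch:

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class Parquet246WorkaroundSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Keep each file in a single split so DELTA_BYTE_ARRAY pages written by
    // affected writers (PARQUET-246) are always read sequentially.
    conf.setBoolean(ParquetInputFormat.SPLIT_FILES, false);
  }
}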
 
Example #13
Source File: PrimitiveColumnReader.java    From presto with Apache License 2.0
public void setPageReader(PageReader pageReader)
{
    this.pageReader = requireNonNull(pageReader, "pageReader");
    DictionaryPage dictionaryPage = pageReader.readDictionaryPage();

    if (dictionaryPage != null) {
        try {
            dictionary = dictionaryPage.getEncoding().initDictionary(columnDescriptor, dictionaryPage);
        }
        catch (IOException e) {
            throw new ParquetDecodingException("could not decode the dictionary for " + columnDescriptor, e);
        }
    }
    else {
        dictionary = null;
    }
    checkArgument(pageReader.getTotalValueCount() > 0, "page is empty");
    totalValueCount = pageReader.getTotalValueCount();
}
 
Example #14
Source File: PrimitiveColumnReader.java    From presto with Apache License 2.0
private ValuesReader readPageV1(DataPageV1 page)
{
    ValuesReader rlReader = page.getRepetitionLevelEncoding().getValuesReader(columnDescriptor, REPETITION_LEVEL);
    ValuesReader dlReader = page.getDefinitionLevelEncoding().getValuesReader(columnDescriptor, DEFINITION_LEVEL);
    repetitionReader = new LevelValuesReader(rlReader);
    definitionReader = new LevelValuesReader(dlReader);
    try {
        ByteBufferInputStream in = toInputStream(page.getSlice());
        rlReader.initFromPage(page.getValueCount(), in);
        dlReader.initFromPage(page.getValueCount(), in);
        return initDataReader(page.getValueEncoding(), page.getValueCount(), in);
    }
    catch (IOException e) {
        throw new ParquetDecodingException("Error reading parquet page " + page + " in column " + columnDescriptor, e);
    }
}
 
Example #15
Source File: PrimitiveColumnReader.java    From presto with Apache License 2.0
private ValuesReader initDataReader(ParquetEncoding dataEncoding, int valueCount, ByteBufferInputStream in)
{
    ValuesReader valuesReader;
    if (dataEncoding.usesDictionary()) {
        if (dictionary == null) {
            throw new ParquetDecodingException("Dictionary is missing for Page");
        }
        valuesReader = dataEncoding.getDictionaryBasedValuesReader(columnDescriptor, VALUES, dictionary);
    }
    else {
        valuesReader = dataEncoding.getValuesReader(columnDescriptor, VALUES);
    }

    try {
        valuesReader.initFromPage(valueCount, in);
        return valuesReader;
    }
    catch (IOException e) {
        throw new ParquetDecodingException("Error reading parquet page in column " + columnDescriptor, e);
    }
}
 
Example #16
Source File: ThriftRecordConverter.java    From parquet-mr with Apache License 2.0
@Override
public void addBinary(final Binary value) {
  final Integer id = enumLookup.get(value);

  if (id == null) {
    throw new ParquetDecodingException("Unrecognized enum value: "
        + value.toStringUsingUTF8()
        + " known values: "
        + enumLookup
        + " in " + this.field);
  }

  events.add(new ParquetProtocol("readI32() enum") {
    @Override
    public int readI32() throws TException {
      return id;
    }
  });
}
 
Example #17
Source File: DeltaBinaryPackingValuesReader.java    From parquet-mr with Apache License 2.0
private void loadNewBlockToBuffer() throws IOException {
  try {
    minDeltaInCurrentBlock = BytesUtils.readZigZagVarLong(in);
  } catch (IOException e) {
    throw new ParquetDecodingException("can not read min delta in current block", e);
  }

  readBitWidthsForMiniBlocks();

  // mini block is atomic for reading, we read a mini block when there are more values left
  int i;
  for (i = 0; i < config.miniBlockNumInABlock && valuesBuffered < totalValueCount; i++) {
    BytePackerForLong packer = Packer.LITTLE_ENDIAN.newBytePackerForLong(bitWidths[i]);
    unpackMiniBlock(packer);
  }

  // calculate values from the deltas unpacked for the current block
  int valueUnpacked = i * config.miniBlockSizeInValues;
  for (int j = valuesBuffered - valueUnpacked; j < valuesBuffered; j++) {
    valuesBuffer[j] += minDeltaInCurrentBlock + valuesBuffer[j - 1];
  }
}
 
Example #18
Source File: ColumnChunkPageReadStore.java    From parquet-mr with Apache License 2.0
@Override
public DictionaryPage readDictionaryPage() {
  if (compressedDictionaryPage == null) {
    return null;
  }
  try {
    DictionaryPage decompressedPage = new DictionaryPage(
      decompressor.decompress(compressedDictionaryPage.getBytes(), compressedDictionaryPage.getUncompressedSize()),
      compressedDictionaryPage.getDictionarySize(),
      compressedDictionaryPage.getEncoding());
    if (compressedDictionaryPage.getCrc().isPresent()) {
      decompressedPage.setCrc(compressedDictionaryPage.getCrc().getAsInt());
    }
    return decompressedPage;
  } catch (IOException e) {
    throw new ParquetDecodingException("Could not decompress dictionary page", e);
  }
}
 
Example #19
Source File: ProtoMessageConverter.java    From parquet-mr with Apache License 2.0
public ListConverter(Message.Builder parentBuilder, Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) {
  LogicalTypeAnnotation logicalTypeAnnotation = parquetType.getLogicalTypeAnnotation();
  if (!(logicalTypeAnnotation instanceof LogicalTypeAnnotation.ListLogicalTypeAnnotation) || parquetType.isPrimitive()) {
    throw new ParquetDecodingException("Expected LIST wrapper. Found: " + logicalTypeAnnotation + " instead.");
  }

  GroupType rootWrapperType = parquetType.asGroupType();
  if (!rootWrapperType.containsField("list") || rootWrapperType.getType("list").isPrimitive()) {
    throw new ParquetDecodingException("Expected repeated 'list' group inside LIST wrapperr but got: " + rootWrapperType);
  }

  GroupType listType = rootWrapperType.getType("list").asGroupType();
  if (!listType.containsField("element")) {
    throw new ParquetDecodingException("Expected 'element' inside repeated list group but got: " + listType);
  }

  Type elementType = listType.getType("element");
  converter = newMessageConverter(parentBuilder, fieldDescriptor, elementType);
}
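
The three checks above correspond to the standard three-level LIST layout: a LIST-annotated group wrapping a repeated "list" group whose "element" field holds the values. A minimal conforming schema, built with parquet-mr's MessageTypeParser (the my_list name and int32 element type are illustrative):

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ListSchemaSketch {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message Example {\n" +
        "  optional group my_list (LIST) {\n" +
        "    repeated group list {\n" +
        "      optional int32 element;\n" +
        "    }\n" +
        "  }\n" +
        "}");
    System.out.println(schema);
  }
}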
 
Example #20
Source File: TupleReadSupport.java    From parquet-mr with Apache License 2.0
@Override
public RecordMaterializer<Tuple> prepareForRead(
    Configuration configuration,
    Map<String, String> keyValueMetaData,
    MessageType fileSchema,
    ReadContext readContext) {
  MessageType requestedSchema = readContext.getRequestedSchema();
  Schema requestedPigSchema = getPigSchema(configuration);

  if (requestedPigSchema == null) {
    throw new ParquetDecodingException("Missing Pig schema: ParquetLoader sets the schema in the job conf");
  }
  boolean elephantBirdCompatible = configuration.getBoolean(PARQUET_PIG_ELEPHANT_BIRD_COMPATIBLE, false);
  boolean columnIndexAccess = configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);
  if (elephantBirdCompatible) {
    LOG.info("Numbers will default to 0 instead of NULL; Boolean will be converted to Int");
  }
  return new TupleRecordMaterializer(requestedSchema, requestedPigSchema, elephantBirdCompatible, columnIndexAccess);
}
 
Example #21
Source File: ByteStreamSplitValuesReader.java    From parquet-mr with Apache License 2.0
@Override
public void skip(int n) {
  if (n < 0 || indexInStream + n > valuesCount) {
    String errorMessage = String.format(
            "Cannot skip this many elements. Current index: %d. Skip %d. Total number of elements: %d",
            indexInStream, n, valuesCount);
    throw new ParquetDecodingException(errorMessage);
  }
  indexInStream += n;
}
 
Example #22
Source File: PlainValuesReader.java    From parquet-mr with Apache License 2.0
@Override
public void skip(int n) {
  try {
    in.skipBytes(n * 8);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not skip " + n + " longs", e);
  }
}
 
Example #23
Source File: PlainValuesReader.java    From parquet-mr with Apache License 2.0
@Override
public int readInteger() {
  try {
    return in.readInt();
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read int", e);
  }
}
 
Example #24
Source File: AvroConverters.java    From parquet-mr with Apache License 2.0
public FieldStringableConverter(ParentValueContainer parent,
                                Class<?> stringableClass) {
  super(parent);
  stringableName = stringableClass.getName();
  try {
    this.ctor = stringableClass.getConstructor(String.class);
  } catch (NoSuchMethodException e) {
    throw new ParquetDecodingException(
        "Unable to get String constructor for " + stringableName, e);
  }
}
 
Example #25
Source File: PlainValuesReader.java    From parquet-mr with Apache License 2.0
@Override
public void skip(int n) {
  try {
    skipBytesFully(n * 8);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not skip " + n + " double values", e);
  }
}
 
Example #26
Source File: RunLengthBitPackingHybridValuesReader.java    From parquet-mr with Apache License 2.0
@Override
public int readInteger() {
  try {
    return decoder.readInt();
  } catch (IOException e) {
    throw new ParquetDecodingException(e);
  }
}
 
Example #27
Source File: BitPackingValuesReader.java    From parquet-mr with Apache License 2.0
/**
 * {@inheritDoc}
 * @see org.apache.parquet.column.values.ValuesReader#readInteger()
 */
@Override
public int readInteger() {
  try {
    return bitPackingReader.read();
  } catch (IOException e) {
    throw new ParquetDecodingException(e);
  }
}
 
Example #28
Source File: ParquetInputFormat.java    From parquet-mr with Apache License 2.0
private static void checkSorted(List<BlockMetaData> rowGroupBlocks) {
  long previousOffset = 0L;
  for (BlockMetaData rowGroup : rowGroupBlocks) {
    long currentOffset = rowGroup.getStartingPos();
    if (currentOffset < previousOffset) {
      throw new ParquetDecodingException("row groups are not sorted: previous row group starts at " + previousOffset + ", current row group starts at " + currentOffset);
    }
    previousOffset = currentOffset; // track the previous start so the check is meaningful
  }
}
 
Example #29
Source File: PlainValuesReader.java    From parquet-mr with Apache License 2.0
@Override
public double readDouble() {
  try {
    return in.readDouble();
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read double", e);
  }
}
 
Example #30
Source File: PlainValuesReader.java    From parquet-mr with Apache License 2.0
@Override
public void skip(int n) {
  try {
    skipBytesFully(n * 4);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not skip " + n + " floats", e);
  }
}