org.apache.parquet.io.ParquetEncodingException Java Examples

The following examples show how to use org.apache.parquet.io.ParquetEncodingException. They are taken from open-source projects; the source file and originating project are noted above each example.
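
ParquetEncodingException is unchecked (it extends ParquetRuntimeException), and parquet-mr throws it when a write-side encoding step fails, typically wrapping a checked IOException or TException cause. Below is a minimal sketch of where it surfaces; the writeRecords() helper is hypothetical and stands in for the real call sites shown in the examples that follow:

import org.apache.parquet.io.ParquetEncodingException;

public class EncodingErrorDemo {
  public static void main(String[] args) {
    try {
      writeRecords(); // stands in for any code path that drives a Parquet writer
    } catch (ParquetEncodingException e) {
      // Unchecked, so callers are not forced to handle it; the examples below
      // use it to rethrow checked IOException/TException causes.
      System.err.println("encoding failed: " + e.getMessage());
    }
  }

  private static void writeRecords() {
    // Hypothetical failure; both the String and the (String, Throwable)
    // constructors appear in the examples below.
    throw new ParquetEncodingException("example failure");
  }
}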
Example #1
Source File: ParquetWriteProtocol.java    From parquet-mr with Apache License 2.0
@Override
public void writeFieldBegin(TField field) throws TException {
  if (field.type == TType.STOP) {
    return;
  }
  try {
    currentType = thriftFieldIdToParquetField[field.id];
    if (currentType == null) {
      throw new ParquetEncodingException("field " + field.id + " was not found in " + thriftType + " and " + schema.getType());
    }
    final int index = currentType.getIndex();
    recordConsumer.startField(currentType.getName(), index);
    currentProtocol = children[index];
  } catch (ArrayIndexOutOfBoundsException e) {
    throw new ParquetEncodingException("field " + field.id + " was not found in " + thriftType + " and " + schema.getType());
  }
}
 
Example #2
Source File: ColumnWriterBase.java    From parquet-mr with Apache License 2.0
/**
 * Finalizes the column chunk, possibly adding extra pages if needed (dictionary, ...).
 * Called right after writePage.
 */
void finalizeColumnChunk() {
  final DictionaryPage dictionaryPage = dataColumn.toDictPageAndClose();
  if (dictionaryPage != null) {
    if (DEBUG)
      LOG.debug("write dictionary");
    try {
      pageWriter.writeDictionaryPage(dictionaryPage);
    } catch (IOException e) {
      throw new ParquetEncodingException("could not write dictionary page for " + path, e);
    }
    dataColumn.resetDictionary();
  }

  if (bloomFilterWriter != null && bloomFilter != null) {
    bloomFilterWriter.writeBloomFilter(bloomFilter);
  }
}
 
Example #3
Source File: ColumnWriterBase.java    From parquet-mr with Apache License 2.0
/**
 * Writes the current data to a new page in the page store
 */
void writePage() {
  if (valueCount == 0) {
    throw new ParquetEncodingException("writing empty page");
  }
  this.rowsWrittenSoFar += pageRowCount;
  if (DEBUG)
    LOG.debug("write page");
  try {
    writePage(pageRowCount, valueCount, statistics, repetitionLevelColumn, definitionLevelColumn, dataColumn);
  } catch (IOException e) {
    throw new ParquetEncodingException("could not write page for " + path, e);
  }
  repetitionLevelColumn.reset();
  definitionLevelColumn.reset();
  dataColumn.reset();
  valueCount = 0;
  resetStatistics();
  pageRowCount = 0;
}
 
Example #4
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
static ParquetMetadata mergeFooters(Path root, List<Footer> footers) {
  String rootPath = root.toUri().getPath();
  GlobalMetaData fileMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    String footerPath = footer.getFile().toUri().getPath();
    if (!footerPath.startsWith(rootPath)) {
      throw new ParquetEncodingException(footerPath + " invalid: all the files must be contained in the root " + root);
    }
    footerPath = footerPath.substring(rootPath.length());
    while (footerPath.startsWith("/")) {
      footerPath = footerPath.substring(1);
    }
    fileMetaData = mergeInto(footer.getParquetMetadata().getFileMetaData(), fileMetaData);
    for (BlockMetaData block : footer.getParquetMetadata().getBlocks()) {
      block.setPath(footerPath);
      blocks.add(block);
    }
  }
  return new ParquetMetadata(fileMetaData.merge(), blocks);
}
 
Example #5
Source File: ByteStreamSplitValuesWriter.java    From parquet-mr with Apache License 2.0
public ByteStreamSplitValuesWriter(int elementSizeInBytes, int initialCapacity, int pageSize, ByteBufferAllocator allocator) {
  if (elementSizeInBytes <= 0) {
    throw new ParquetEncodingException(String.format("Element byte size is invalid: %d", elementSizeInBytes));
  }
  this.numStreams = elementSizeInBytes;
  this.elementSizeInBytes = elementSizeInBytes;
  this.byteStreams = new CapacityByteArrayOutputStream[elementSizeInBytes];

  // Round up the capacity hints.
  final int capacityPerStream = (pageSize + this.numStreams - 1) / this.numStreams;
  final int initialCapacityPerStream = (initialCapacity + this.numStreams - 1) / this.numStreams;
  for (int i = 0; i < this.numStreams; ++i) {
    this.byteStreams[i] = new CapacityByteArrayOutputStream(
            initialCapacityPerStream, capacityPerStream, allocator);
  }
}
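
Example #5 splits each value of elementSizeInBytes bytes across numStreams per-byte streams, so the page-size and capacity hints are divided across the streams. The (x + n - 1) / n expression is integer ceiling division, which guarantees the per-stream capacities are never rounded below the hint. A small illustration with made-up numbers:

public class CeilDivDemo {
  public static void main(String[] args) {
    int pageSize = 1_000_000; // hypothetical page size hint
    int numStreams = 8;       // e.g. elementSizeInBytes of a double
    // Rounds up instead of truncating toward zero.
    int capacityPerStream = (pageSize + numStreams - 1) / numStreams;
    System.out.println(capacityPerStream);   // 125000 (divides evenly)
    System.out.println((1_000_001 + 7) / 8); // 125001 (remainder rounds up)
  }
}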
 
Example #6
Source File: ParquetColumnChunkPageWriteStore.java    From Bats with Apache License 2.0
@Override
public void writePage(BytesInput bytes,
                      int valueCount,
                      Statistics statistics,
                      Encoding rlEncoding,
                      Encoding dlEncoding,
                      Encoding valuesEncoding) throws IOException {
  long uncompressedSize = bytes.size();
  // The Parquet library creates bad metadata if the uncompressed or compressed size of a page exceeds Integer.MAX_VALUE
  if (uncompressedSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write page larger than Integer.MAX_VALUE bytes: " +
            uncompressedSize);
  }
  BytesInput compressedBytes = compressor.compress(bytes);
  long compressedSize = compressedBytes.size();
  if (compressedSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write compressed page larger than Integer.MAX_VALUE bytes: "
            + compressedSize);
  }
  parquetMetadataConverter.writeDataPageHeader(
      (int)uncompressedSize,
      (int)compressedSize,
      valueCount,
      statistics,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      buf);
  this.uncompressedLength += uncompressedSize;
  this.compressedLength += compressedSize;
  this.totalValueCount += valueCount;
  this.pageCount += 1;
  this.totalStatistics.mergeStatistics(statistics);
  compressedBytes.writeAllTo(buf);
  rlEncodings.add(rlEncoding);
  dlEncodings.add(dlEncoding);
  dataEncodings.add(valuesEncoding);
}
 
Example #7
Source File: PlainValuesWriter.java    From parquet-mr with Apache License 2.0
@Override
public BytesInput getBytes() {
  try {
    out.flush();
  } catch (IOException e) {
    throw new ParquetEncodingException("could not write page", e);
  }
  if (LOG.isDebugEnabled()) LOG.debug("writing a buffer of size {}", arrayOut.size());
  return BytesInput.from(arrayOut);
}
 
Example #8
Source File: FixedLenByteArrayPlainValuesWriter.java    From parquet-mr with Apache License 2.0
@Override
public final void writeBytes(Binary v) {
  if (v.length() != length) {
    throw new IllegalArgumentException("Fixed Binary size " + v.length() +
        " does not match field type length " + length);
  }
  try {
    v.writeTo(out);
  } catch (IOException e) {
    throw new ParquetEncodingException("could not write fixed bytes", e);
  }
}
 
Example #9
Source File: FixedLenByteArrayPlainValuesWriter.java    From parquet-mr with Apache License 2.0
@Override
public BytesInput getBytes() {
  try {
    out.flush();
  } catch (IOException e) {
    throw new ParquetEncodingException("could not write page", e);
  }
  LOG.debug("writing a buffer of size {}", arrayOut.size());
  return BytesInput.from(arrayOut);
}
 
Example #10
Source File: ColumnWriterV2.java    From parquet-mr with Apache License 2.0
@Override
public BytesInput getBytes() {
  try {
    return encoder.toBytes();
  } catch (IOException e) {
    throw new ParquetEncodingException(e);
  }
}
 
Example #11
Source File: Binary.java    From parquet-mr with Apache License 2.0
private static ByteBuffer encodeUTF8(CharSequence value) {
  try {
    return ENCODER.get().encode(CharBuffer.wrap(value));
  } catch (CharacterCodingException e) {
    throw new ParquetEncodingException("UTF-8 not supported.", e);
  }
}
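
ENCODER.get() in Example #11 suggests a per-thread CharsetEncoder: CharsetEncoder instances keep internal state across encode() calls and are not thread-safe. A hedged sketch of that pattern follows; the actual declaration in Binary.java may differ:

import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;

final class Utf8Encoders {
  // One encoder per thread; sharing a single stateful instance would race.
  static final ThreadLocal<CharsetEncoder> ENCODER =
      ThreadLocal.withInitial(() -> StandardCharsets.UTF_8.newEncoder());
}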
 
Example #12
Source File: MemPageWriter.java    From parquet-mr with Apache License 2.0
@Override
public void writePage(BytesInput bytesInput, int valueCount, Statistics statistics, Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding)
    throws IOException {
  if (valueCount == 0) {
    throw new ParquetEncodingException("illegal page of 0 values");
  }
  memSize += bytesInput.size();
  pages.add(new DataPageV1(BytesInput.copy(bytesInput), valueCount, (int)bytesInput.size(), statistics, rlEncoding, dlEncoding, valuesEncoding));
  totalValueCount += valueCount;
  LOG.debug("page written for {} bytes and {} records", bytesInput.size(), valueCount);
}
 
Example #13
Source File: MemPageWriter.java    From parquet-mr with Apache License 2.0
@Override
public void writePageV2(int rowCount, int nullCount, int valueCount,
    BytesInput repetitionLevels, BytesInput definitionLevels,
    Encoding dataEncoding, BytesInput data, Statistics<?> statistics) throws IOException {
  if (valueCount == 0) {
    throw new ParquetEncodingException("illegal page of 0 values");
  }
  long size = repetitionLevels.size() + definitionLevels.size() + data.size();
  memSize += size;
  pages.add(DataPageV2.uncompressed(rowCount, nullCount, valueCount, copy(repetitionLevels), copy(definitionLevels), dataEncoding, copy(data), statistics));
  totalValueCount += valueCount;
  LOG.debug("page written for {} bytes and {} records", size, valueCount);
}
 
Example #14
Source File: MemPageWriter.java    From parquet-mr with Apache License 2.0
@Override
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
  if (this.dictionaryPage != null) {
    throw new ParquetEncodingException("Only one dictionary page per block");
  }
  this.memSize += dictionaryPage.getBytes().size();
  this.dictionaryPage = dictionaryPage.copy();
  LOG.debug("dictionary page written for {} bytes and {} records", dictionaryPage.getBytes().size(), dictionaryPage.getDictionarySize());
}
 
Example #15
Source File: ColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0
private int toIntWithCheck(long size) {
  if (size > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write page larger than " + Integer.MAX_VALUE + " bytes: " +
            size);
  }
  return (int)size;
}
 
Example #16
Source File: ColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0
@Override
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
  if (this.dictionaryPage != null) {
    throw new ParquetEncodingException("Only one dictionary page is allowed");
  }
  BytesInput dictionaryBytes = dictionaryPage.getBytes();
  int uncompressedSize = (int)dictionaryBytes.size();
  BytesInput compressedBytes = compressor.compress(dictionaryBytes);
  this.dictionaryPage = new DictionaryPage(BytesInput.copy(compressedBytes), uncompressedSize, dictionaryPage.getDictionarySize(), dictionaryPage.getEncoding());
}
 
Example #17
Source File: ScroogeWriteSupport.java    From parquet-mr with Apache License 2.0
@Override
public void write(T record) {
  try {
    record.write(parquetWriteProtocol);
  } catch (TException e) {
    throw new ParquetEncodingException(e);
  }
}
 
Example #18
Source File: ThriftBytesWriteSupport.java    From parquet-mr with Apache License 2.0
@Override
public void write(BytesWritable record) {
  try {
    readToWrite.readOne(protocol(record), parquetWriteProtocol);
  } catch (TException e) {
    throw new ParquetEncodingException(e);
  }
}
 
Example #19
Source File: TBaseWriteSupport.java    From parquet-mr with Apache License 2.0
@Override
public void write(T record) {
  try {
    record.write(parquetWriteProtocol);
  } catch (TException e) {
    throw new ParquetEncodingException(e);
  }
}
 
Example #20
Source File: ParquetThriftStorer.java    From parquet-mr with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public void putNext(Tuple tuple) throws IOException {
  try {
    this.recordWriter.write(null, tuple);
  } catch (InterruptedException e) {
    throw new ParquetEncodingException("Interrupted while writing", e);
  }
}
 
Example #21
Source File: ParquetWriteProtocol.java    From parquet-mr with Apache License 2.0
@Override
public void writeI32(int i32) throws TException {
  start();
  EnumValue value = type.getEnumValueById(i32);
  if (value == null) {
    throw new ParquetEncodingException("Can not find enum value of index " + i32 + " for field:" + columnIO.toString());
  }
  recordConsumer.addBinary(Binary.fromString(value.getName()));
  end();
}
 
Example #22
Source File: DataWritableWriter.java    From parquet-mr with Apache License 2.0
private void writeData(final ArrayWritable arr, final GroupType type) {
  if (arr == null) {
    return;
  }
  final int fieldCount = type.getFieldCount();
  Writable[] values = arr.get();
  for (int field = 0; field < fieldCount; ++field) {
    final Type fieldType = type.getType(field);
    final String fieldName = fieldType.getName();
    final Writable value = values[field];
    if (value == null) {
      continue;
    }
    recordConsumer.startField(fieldName, field);

    if (fieldType.isPrimitive()) {
      writePrimitive(value);
    } else {
      recordConsumer.startGroup();
      if (value instanceof ArrayWritable) {
        if (fieldType.asGroupType().getRepetition().equals(Type.Repetition.REPEATED)) {
          writeArray((ArrayWritable) value, fieldType.asGroupType());
        } else {
          writeData((ArrayWritable) value, fieldType.asGroupType());
        }
      } else if (value != null) {
        throw new ParquetEncodingException("This should be an ArrayWritable or MapWritable: " + value);
      }

      recordConsumer.endGroup();
    }

    recordConsumer.endField(fieldName, field);
  }
}
 
Example #23
Source File: ParquetStorer.java    From parquet-mr with Apache License 2.0
private Schema getSchema() {
  try {
    final String schemaString = getProperties().getProperty(SCHEMA);
    if (schemaString == null) {
      throw new ParquetEncodingException("Can not store relation in Parquet as the schema is unknown");
    }
    return Utils.getSchemaFromString(schemaString);
  } catch (ParserException e) {
    throw new ParquetEncodingException("can not get schema from context", e);
  }
}
 
Example #24
Source File: ParquetStorer.java    From parquet-mr with Apache License 2.0
/**
 * {@inheritDoc}
 */
@Override
public void putNext(Tuple tuple) throws IOException {
  try {
    this.recordWriter.write(null, tuple);
  } catch (InterruptedException e) {
    Thread.interrupted();
    throw new ParquetEncodingException("Interrupted while writing", e);
  }
}
 
Example #25
Source File: PlainValuesWriter.java    From parquet-mr with Apache License 2.0
@Override
public final void writeDouble(double v) {
  try {
    out.writeDouble(v);
  } catch (IOException e) {
    throw new ParquetEncodingException("could not write double", e);
  }
}
 
Example #26
Source File: ParquetColumnChunkPageWriteStore.java    From Bats with Apache License 2.0
private int toIntWithCheck(long size) {
  if (size > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write page larger than " + Integer.MAX_VALUE + " bytes: " +
            size);
  }
  return (int)size;
}
 
Example #27
Source File: ParquetColumnChunkPageWriteStore.java    From Bats with Apache License 2.0
@Override
public void writeDictionaryPage(DictionaryPage dictionaryPage) throws IOException {
  if (this.dictionaryPage != null) {
    throw new ParquetEncodingException("Only one dictionary page is allowed");
  }
  BytesInput dictionaryBytes = dictionaryPage.getBytes();
  int uncompressedSize = (int)dictionaryBytes.size();
  BytesInput compressedBytes = compressor.compress(dictionaryBytes);
  this.dictionaryPage = new DictionaryPage(BytesInput.copy(compressedBytes), uncompressedSize,
      dictionaryPage.getDictionarySize(), dictionaryPage.getEncoding());
}
 
Example #28
Source File: ParquetFilePOJOReaderTest.java    From attic-apex-malhar with Apache License 2.0
@Override
public void write(Object record)
{
  recordConsumer.startMessage();
  for (int i = 0; i < cols.size(); ++i) {
    String val = keyMethodMap.get(i).get(record).toString();
    recordConsumer.startField(cols.get(i).getPath()[0], i);
    switch (cols.get(i).getType()) {
      case BOOLEAN:
        recordConsumer.addBoolean(Boolean.parseBoolean(val));
        break;
      case FLOAT:
        recordConsumer.addFloat(Float.parseFloat(val));
        break;
      case DOUBLE:
        recordConsumer.addDouble(Double.parseDouble(val));
        break;
      case INT32:
        recordConsumer.addInteger(Integer.parseInt(val));
        break;
      case INT64:
        recordConsumer.addLong(Long.parseLong(val));
        break;
      case BINARY:
        recordConsumer.addBinary(stringToBinary(val));
        break;
      default:
        throw new ParquetEncodingException("Unsupported column type: " + cols.get(i).getType());
    }
    recordConsumer.endField(cols.get(i).getPath()[0], i);
  }
  recordConsumer.endMessage();
}
 
Example #29
Source File: DictionaryValuesWriter.java    From parquet-mr with Apache License 2.0
@Override
public BytesInput getBytes() {
  int maxDicId = getDictionarySize() - 1;
  LOG.debug("max dic id {}", maxDicId);
  int bitWidth = BytesUtils.getWidthFromMaxInt(maxDicId);
  int initialSlabSize =
      CapacityByteArrayOutputStream.initialSlabSizeHeuristic(MIN_INITIAL_SLAB_SIZE, maxDictionaryByteSize, 10);

  RunLengthBitPackingHybridEncoder encoder =
      new RunLengthBitPackingHybridEncoder(bitWidth, initialSlabSize, maxDictionaryByteSize, this.allocator);
  encoders.add(encoder);
  IntIterator iterator = encodedValues.iterator();
  try {
    while (iterator.hasNext()) {
      encoder.writeInt(iterator.next());
    }
    // encodes the bit width
    byte[] bytesHeader = new byte[] { (byte) bitWidth };
    BytesInput rleEncodedBytes = encoder.toBytes();
    LOG.debug("rle encoded bytes {}", rleEncodedBytes.size());
    BytesInput bytes = concat(BytesInput.from(bytesHeader), rleEncodedBytes);
    // remember size of dictionary when we last wrote a page
    lastUsedDictionarySize = getDictionarySize();
    lastUsedDictionaryByteSize = dictionaryByteSize;
    return bytes;
  } catch (IOException e) {
    throw new ParquetEncodingException("could not encode the values", e);
  }
}
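
Example #29 also shows the layout of a dictionary-encoded data page body: one byte holding the RLE/bit-packing bit width, followed by the hybrid-encoded dictionary ids. A small sketch of how that header byte is derived, using a made-up dictionary size:

import org.apache.parquet.bytes.BytesUtils;

public class DictHeaderDemo {
  public static void main(String[] args) {
    int dictionarySize = 1000; // hypothetical
    // Minimal number of bits needed to represent the largest id (999).
    int bitWidth = BytesUtils.getWidthFromMaxInt(dictionarySize - 1);
    byte header = (byte) bitWidth; // first byte of the page body
    System.out.println(bitWidth);  // 10
  }
}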
 
Example #30
Source File: DeltaLengthByteArrayValuesWriter.java    From parquet-mr with Apache License 2.0
@Override
public void writeBytes(Binary v) {
  try {
    lengthWriter.writeInteger(v.length());
    v.writeTo(out);
  } catch (IOException e) {
    throw new ParquetEncodingException("could not write bytes", e);
  }
}