org.apache.parquet.column.Encoding Java Examples

The following examples show how to use org.apache.parquet.column.Encoding. Each example is taken from an open-source project; the source file, project, and license are noted above it.
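Before the project examples, here is a minimal, hypothetical orientation snippet (not taken from any of the projects below): it enumerates the encodings parquet-mr defines and asks each one whether it is dictionary-based.

import org.apache.parquet.column.Encoding;

public class EncodingTour {
  public static void main(String[] args) {
    // Walk every encoding parquet-mr defines (PLAIN, RLE, DELTA_BYTE_ARRAY, ...)
    // and report whether it is a dictionary encoding.
    for (Encoding encoding : Encoding.values()) {
      System.out.println(encoding.name() + " usesDictionary=" + encoding.usesDictionary());
    }
  }
}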
Example #1
Source File: EncodingList.java    From parquet-mr with Apache License 2.0
@Override
public boolean equals(Object obj) {
  if (obj instanceof EncodingList) {
    List<org.apache.parquet.column.Encoding> other = ((EncodingList)obj).encodings;
    final int size = other.size();
    if (size != encodings.size()) {
      return false;
    }
    for (int i = 0; i < size; i++) {
      if (!other.get(i).equals(encodings.get(i))) {
        return false;
      }
    }
    return true;
  }
  return false;
}
 
Example #2
Source File: PredicateUtils.java    From presto with Apache License 2.0
private static Optional<DictionaryPage> readDictionaryPage(byte[] data, CompressionCodecName codecName)
{
    try {
        ByteArrayInputStream inputStream = new ByteArrayInputStream(data);
        PageHeader pageHeader = Util.readPageHeader(inputStream);

        if (pageHeader.type != PageType.DICTIONARY_PAGE) {
            return Optional.empty();
        }

        Slice compressedData = wrappedBuffer(data, data.length - inputStream.available(), pageHeader.getCompressed_page_size());
        DictionaryPageHeader dicHeader = pageHeader.getDictionary_page_header();
        ParquetEncoding encoding = getParquetEncoding(Encoding.valueOf(dicHeader.getEncoding().name()));
        int dictionarySize = dicHeader.getNum_values();

        return Optional.of(new DictionaryPage(decompress(codecName, compressedData, pageHeader.getUncompressed_page_size()), dictionarySize, encoding));
    }
    catch (IOException ignored) {
        return Optional.empty();
    }
}
 
Example #3
Source File: TestCorruptDeltaByteArrays.java    From parquet-mr with Apache License 2.0
@Test
public void testCorruptDeltaByteArrayVersions() {
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads("parquet-mr version 1.6.0 (build abcd)", Encoding.DELTA_BYTE_ARRAY));
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads((String) null, Encoding.DELTA_BYTE_ARRAY));
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads((ParsedVersion) null, Encoding.DELTA_BYTE_ARRAY));
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads((SemanticVersion) null, Encoding.DELTA_BYTE_ARRAY));
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads("parquet-mr version 1.8.0-SNAPSHOT (build abcd)", Encoding.DELTA_BYTE_ARRAY));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads("parquet-mr version 1.6.0 (build abcd)", Encoding.DELTA_BINARY_PACKED));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads((String) null, Encoding.DELTA_LENGTH_BYTE_ARRAY));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads((ParsedVersion) null, Encoding.PLAIN));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads((SemanticVersion) null, Encoding.RLE));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads("parquet-mr version 1.8.0-SNAPSHOT (build abcd)", Encoding.RLE_DICTIONARY));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads("parquet-mr version 1.8.0-SNAPSHOT (build abcd)", Encoding.PLAIN_DICTIONARY));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads("parquet-mr version 1.8.0-SNAPSHOT (build abcd)", Encoding.BIT_PACKED));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads("parquet-mr version 1.8.0 (build abcd)", Encoding.DELTA_BYTE_ARRAY));
}
 
Example #4
Source File: PredicateUtils.java    From presto with Apache License 2.0
@VisibleForTesting
@SuppressWarnings("deprecation")
static boolean isOnlyDictionaryEncodingPages(ColumnChunkMetaData columnMetaData)
{
    // Files written with newer versions of Parquet libraries (e.g. parquet-mr 1.9.0) will have EncodingStats available
    // Otherwise, fallback to v1 logic
    EncodingStats stats = columnMetaData.getEncodingStats();
    if (stats != null) {
        return stats.hasDictionaryPages() && !stats.hasNonDictionaryEncodedPages();
    }

    Set<Encoding> encodings = columnMetaData.getEncodings();
    if (encodings.contains(PLAIN_DICTIONARY)) {
        // PLAIN_DICTIONARY was present, which means at least one page was
        // dictionary-encoded and 1.0 encodings are used
        // The only other allowed encodings are RLE and BIT_PACKED which are used for repetition or definition levels
        return Sets.difference(encodings, ImmutableSet.of(PLAIN_DICTIONARY, RLE, BIT_PACKED)).isEmpty();
    }

    return false;
}
 
Example #5
Source File: ParquetColumnChunk.java    From presto with Apache License 2.0
private long readDataPageV1(
        PageHeader pageHeader,
        int uncompressedPageSize,
        int compressedPageSize,
        List<DataPage> pages)
{
    DataPageHeader dataHeaderV1 = pageHeader.getData_page_header();
    pages.add(new DataPageV1(
            getSlice(compressedPageSize),
            dataHeaderV1.getNum_values(),
            uncompressedPageSize,
            getParquetEncoding(Encoding.valueOf(dataHeaderV1.getRepetition_level_encoding().name())),
            getParquetEncoding(Encoding.valueOf(dataHeaderV1.getDefinition_level_encoding().name())),
            getParquetEncoding(Encoding.valueOf(dataHeaderV1.getEncoding().name()))));
    return dataHeaderV1.getNum_values();
}
 
Example #6
Source File: Util.java    From parquet-mr with Apache License 2.0
public static String encodingAsString(Encoding encoding, boolean isDict) {
  switch (encoding) {
    case PLAIN:
      return "_";
    case PLAIN_DICTIONARY:
      // data pages use RLE, dictionary pages use plain
      return isDict ? "_" : "R";
    case RLE_DICTIONARY:
      return "R";
    case DELTA_BINARY_PACKED:
    case DELTA_LENGTH_BYTE_ARRAY:
    case DELTA_BYTE_ARRAY:
      return "D";
    default:
      return "?";
  }
}
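A quick hand-traced reading of the switch above (a hypothetical driver; it assumes encodingAsString is statically imported from the Util class shown here):

import org.apache.parquet.column.Encoding;

public class EncodingAsStringDemo {
  public static void main(String[] args) {
    System.out.println(encodingAsString(Encoding.PLAIN, false));            // "_"
    System.out.println(encodingAsString(Encoding.PLAIN_DICTIONARY, true));  // "_" (dictionary page)
    System.out.println(encodingAsString(Encoding.PLAIN_DICTIONARY, false)); // "R" (data page)
    System.out.println(encodingAsString(Encoding.DELTA_BYTE_ARRAY, false)); // "D"
    System.out.println(encodingAsString(Encoding.BIT_PACKED, false));       // "?" (default branch)
  }
}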
 
Example #7
Source File: ParquetColumnChunk.java    From presto with Apache License 2.0
private long readDataPageV2(
        PageHeader pageHeader,
        int uncompressedPageSize,
        int compressedPageSize,
        List<DataPage> pages)
{
    DataPageHeaderV2 dataHeaderV2 = pageHeader.getData_page_header_v2();
    int dataSize = compressedPageSize - dataHeaderV2.getRepetition_levels_byte_length() - dataHeaderV2.getDefinition_levels_byte_length();
    pages.add(new DataPageV2(
            dataHeaderV2.getNum_rows(),
            dataHeaderV2.getNum_nulls(),
            dataHeaderV2.getNum_values(),
            getSlice(dataHeaderV2.getRepetition_levels_byte_length()),
            getSlice(dataHeaderV2.getDefinition_levels_byte_length()),
            getParquetEncoding(Encoding.valueOf(dataHeaderV2.getEncoding().name())),
            getSlice(dataSize),
            uncompressedPageSize,
            MetadataReader.readStats(
                    fileCreatedBy,
                    Optional.ofNullable(dataHeaderV2.getStatistics()),
                    descriptor.getColumnDescriptor().getPrimitiveType()),
            dataHeaderV2.isIs_compressed()));
    return dataHeaderV2.getNum_values();
}
 
Example #8
Source File: TestPredicateUtils.java    From presto with Apache License 2.0
@Test
@SuppressWarnings("deprecation")
public void testDictionaryEncodingV1()
{
    Set<Encoding> required = ImmutableSet.of(BIT_PACKED);
    Set<Encoding> optional = ImmutableSet.of(BIT_PACKED, RLE);
    Set<Encoding> repeated = ImmutableSet.of(RLE);

    Set<Encoding> notDictionary = ImmutableSet.of(PLAIN);
    Set<Encoding> mixedDictionary = ImmutableSet.of(PLAIN_DICTIONARY, PLAIN);
    Set<Encoding> dictionary = ImmutableSet.of(PLAIN_DICTIONARY);

    assertFalse(isOnlyDictionaryEncodingPages(createColumnMetaDataV1(union(required, notDictionary))), "required notDictionary");
    assertFalse(isOnlyDictionaryEncodingPages(createColumnMetaDataV1(union(optional, notDictionary))), "optional notDictionary");
    assertFalse(isOnlyDictionaryEncodingPages(createColumnMetaDataV1(union(repeated, notDictionary))), "repeated notDictionary");
    assertFalse(isOnlyDictionaryEncodingPages(createColumnMetaDataV1(union(required, mixedDictionary))), "required mixedDictionary");
    assertFalse(isOnlyDictionaryEncodingPages(createColumnMetaDataV1(union(optional, mixedDictionary))), "optional mixedDictionary");
    assertFalse(isOnlyDictionaryEncodingPages(createColumnMetaDataV1(union(repeated, mixedDictionary))), "repeated mixedDictionary");
    assertTrue(isOnlyDictionaryEncodingPages(createColumnMetaDataV1(union(required, dictionary))), "required dictionary");
    assertTrue(isOnlyDictionaryEncodingPages(createColumnMetaDataV1(union(optional, dictionary))), "optional dictionary");
    assertTrue(isOnlyDictionaryEncodingPages(createColumnMetaDataV1(union(repeated, dictionary))), "repeated dictionary");
}
 
Example #9
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private FileMetaData metadata(long... sizes) {
  List<SchemaElement> schema = emptyList();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long offset = 0;
  for (long size : sizes) {
    ColumnChunk columnChunk = new ColumnChunk(offset);
    columnChunk.setMeta_data(new ColumnMetaData(
        INT32,
        Collections.<org.apache.parquet.format.Encoding>emptyList(),
        Collections.<String>emptyList(),
        UNCOMPRESSED, 10L, size * 2, size, offset));
    rowGroups.add(new RowGroup(Arrays.asList(columnChunk), size, 1));
    offset += size;
  }
  return new FileMetaData(1, schema, sizes.length, rowGroups);
}
 
Example #10
Source File: ParquetRecordReader.java    From parquet-mr with Apache License 2.0
private void checkDeltaByteArrayProblem(FileMetaData meta, Configuration conf, BlockMetaData block) {
  // splitting files?
  if (conf.getBoolean(ParquetInputFormat.SPLIT_FILES, true)) {
    // this is okay if not using DELTA_BYTE_ARRAY with the bug
    Set<Encoding> encodings = new HashSet<Encoding>();
    for (ColumnChunkMetaData column : block.getColumns()) {
      encodings.addAll(column.getEncodings());
    }
    for (Encoding encoding : encodings) {
      if (CorruptDeltaByteArrays.requiresSequentialReads(meta.getCreatedBy(), encoding)) {
        throw new ParquetDecodingException("Cannot read data due to " +
            "PARQUET-246: to read safely, set " + SPLIT_FILES + " to false");
      }
    }
  }
}
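The exception message above names its own workaround. A minimal sketch of applying it (assuming a Hadoop Configuration is at hand and using the standard ParquetInputFormat.SPLIT_FILES key):

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.hadoop.ParquetInputFormat;

Configuration conf = new Configuration();
// PARQUET-246: DELTA_BYTE_ARRAY pages written by parquet-mr before 1.8.0 are only
// safe to read sequentially, so disable input-file splitting for such files.
conf.setBoolean(ParquetInputFormat.SPLIT_FILES, false);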
 
Example #11
Source File: ParquetFileReader.java    From parquet-mr with Apache License 2.0
/**
 * Reads and decompresses a dictionary page for the given column chunk.
 *
 * Returns null if the given column chunk has no dictionary page.
 *
 * @param meta a column's ColumnChunkMetaData to read the dictionary from
 * @return an uncompressed DictionaryPage or null
 * @throws IOException if there is an error while reading the dictionary
 */
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
  if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY) &&
      !meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) {
    return null;
  }

  // TODO: this should use getDictionaryPageOffset() but it isn't reliable.
  if (f.getPos() != meta.getStartingPos()) {
    f.seek(meta.getStartingPos());
  }

  PageHeader pageHeader = Util.readPageHeader(f);
  if (!pageHeader.isSetDictionary_page_header()) {
    return null; // TODO: should this complain?
  }

  DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f);
  BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec());

  return new DictionaryPage(
      decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()),
      compressedPage.getDictionarySize(),
      compressedPage.getEncoding());
}
 
Example #12
Source File: ColumnWriterV2.java    From parquet-mr with Apache License 2.0
@Override
void writePage(int rowCount, int valueCount, Statistics<?> statistics, ValuesWriter repetitionLevels,
    ValuesWriter definitionLevels, ValuesWriter values) throws IOException {
  // TODO: rework this API. The bytes shall be retrieved before the encoding (encoding might be different otherwise)
  BytesInput bytes = values.getBytes();
  Encoding encoding = values.getEncoding();
  pageWriter.writePageV2(
      rowCount,
      Math.toIntExact(statistics.getNumNulls()),
      valueCount,
      repetitionLevels.getBytes(),
      definitionLevels.getBytes(),
      encoding,
      bytes,
      statistics);
}
 
Example #13
Source File: AbstractColumnReader.java    From flink with Apache License 2.0
private void readPageV1(DataPageV1 page) throws IOException {
	this.pageValueCount = page.getValueCount();
	ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL);

	// Initialize the decoders.
	if (page.getDlEncoding() != Encoding.RLE && descriptor.getMaxDefinitionLevel() != 0) {
		throw new UnsupportedOperationException("Unsupported encoding: " + page.getDlEncoding());
	}
	int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel());
	this.runLenDecoder = new RunLengthDecoder(bitWidth);
	try {
		BytesInput bytes = page.getBytes();
		ByteBufferInputStream in = bytes.toInputStream();
		rlReader.initFromPage(pageValueCount, in);
		this.runLenDecoder.initFromStream(pageValueCount, in);
		prepareNewPage(page.getValueEncoding(), in);
	} catch (IOException e) {
		throw new IOException("could not read page " + page + " in col " + descriptor, e);
	}
}
 
Example #14
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testParquetMetadataConverterWithoutDictionary()
  throws IOException {
  ParquetMetadata parquetMetaData =
    createParquetMetaData(null, Encoding.PLAIN);

  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData);

  // Flag should be false
  fmd1.row_groups.forEach(rowGroup -> rowGroup.columns.forEach(column -> {
    assertFalse(column.meta_data.isSetDictionary_page_offset());
  }));

  ByteArrayOutputStream metaDataOutputStream = new ByteArrayOutputStream();
  Util.writeFileMetaData(fmd1, metaDataOutputStream);
  ByteArrayInputStream metaDataInputStream =
    new ByteArrayInputStream(metaDataOutputStream.toByteArray());
  FileMetaData fmd2 = Util.readFileMetaData(metaDataInputStream);
  ParquetMetadata pmd2 = converter.fromParquetMetadata(fmd2);

  long dicOffsetConverted =
    pmd2.getBlocks().get(0).getColumns().get(0).getDictionaryPageOffset();

  Assert.assertEquals(0, dicOffsetConverted);
}
 
Example #15
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * start a column inside a block
 * @param descriptor the column descriptor
 * @param valueCount the value count in this column
 * @param compressionCodecName a compression codec name
 * @throws IOException if there is an error while writing
 */
public void startColumn(ColumnDescriptor descriptor,
                        long valueCount,
                        CompressionCodecName compressionCodecName) throws IOException {
  state = state.startColumn();
  encodingStatsBuilder.clear();
  currentEncodings = new HashSet<Encoding>();
  currentChunkPath = ColumnPath.get(descriptor.getPath());
  currentChunkType = descriptor.getPrimitiveType();
  currentChunkCodec = compressionCodecName;
  currentChunkValueCount = valueCount;
  currentChunkFirstDataPage = out.getPos();
  compressedLength = 0;
  uncompressedLength = 0;
  // The statistics will be copied from the first one added at writeDataPage(s) so we have the correct typed one
  currentStatistics = null;

  columnIndexBuilder = ColumnIndexBuilder.getBuilder(currentChunkType, columnIndexTruncateLength);
  offsetIndexBuilder = OffsetIndexBuilder.getBuilder();
  firstPageOffset = -1;
}
 
Example #16
Source File: TestStatisticsFilter.java    From parquet-mr with Apache License 2.0
private static ColumnChunkMetaData getIntColumnMeta(org.apache.parquet.column.statistics.Statistics<?> stats,
    long valueCount) {
  return ColumnChunkMetaData.get(ColumnPath.get("int", "column"),
      PrimitiveTypeName.INT32,
      CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      stats,
      0L, 0L, valueCount, 0L, 0L);
}
 
Example #17
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * Writes a single page
 * @param valueCount count of values
 * @param uncompressedPageSize the size of the data once uncompressed
 * @param bytes the compressed data for the page without header
 * @param statistics the statistics of the page
 * @param rowCount the number of rows in the page
 * @param rlEncoding encoding of the repetition level
 * @param dlEncoding encoding of the definition level
 * @param valuesEncoding encoding of values
 * @throws IOException if any I/O error occurs during writing the file
 */
public void writeDataPage(
    int valueCount, int uncompressedPageSize,
    BytesInput bytes,
    Statistics statistics,
    long rowCount,
    Encoding rlEncoding,
    Encoding dlEncoding,
    Encoding valuesEncoding) throws IOException {
  long beforeHeader = out.getPos();
  innerWriteDataPage(valueCount, uncompressedPageSize, bytes, statistics, rlEncoding, dlEncoding, valuesEncoding);

  offsetIndexBuilder.add((int) (out.getPos() - beforeHeader), rowCount);
}
 
Example #18
Source File: DataPageV1.java    From parquet-mr with Apache License 2.0
/**
 * @param bytes the bytes for this page
 * @param valueCount count of values in this page
 * @param uncompressedSize the uncompressed size of the page
 * @param statistics of the page's values (max, min, num_null)
 * @param rlEncoding the repetition level encoding for this page
 * @param dlEncoding the definition level encoding for this page
 * @param valuesEncoding the values encoding for this page
 */
public DataPageV1(BytesInput bytes, int valueCount, int uncompressedSize, Statistics<?> statistics, Encoding rlEncoding, Encoding dlEncoding, Encoding valuesEncoding) {
  super(Math.toIntExact(bytes.size()), uncompressedSize, valueCount);
  this.bytes = bytes;
  this.statistics = statistics;
  this.rlEncoding = rlEncoding;
  this.dlEncoding = dlEncoding;
  this.valuesEncoding = valuesEncoding;
  this.indexRowCount = -1;
}
 
Example #19
Source File: ColumnChunkMetaData.java    From parquet-mr with Apache License 2.0
public static ColumnChunkMetaData get(
    ColumnPath path,
    PrimitiveType type,
    CompressionCodecName codec,
    EncodingStats encodingStats,
    Set<Encoding> encodings,
    Statistics statistics,
    long firstDataPage,
    long dictionaryPageOffset,
    long valueCount,
    long totalSize,
    long totalUncompressedSize) {
  // to save space we store those always positive longs in ints when they fit.
  if (positiveLongFitsInAnInt(firstDataPage)
      && positiveLongFitsInAnInt(dictionaryPageOffset)
      && positiveLongFitsInAnInt(valueCount)
      && positiveLongFitsInAnInt(totalSize)
      && positiveLongFitsInAnInt(totalUncompressedSize)) {
    return new IntColumnChunkMetaData(
        path, type, codec,
        encodingStats, encodings,
        statistics,
        firstDataPage,
        dictionaryPageOffset,
        valueCount,
        totalSize,
        totalUncompressedSize);
  } else {
    return new LongColumnChunkMetaData(
        path, type, codec,
        encodingStats, encodings,
        statistics,
        firstDataPage,
        dictionaryPageOffset,
        valueCount,
        totalSize,
        totalUncompressedSize);
  }
}
 
Example #20
Source File: DataPageV2.java    From parquet-mr with Apache License 2.0
/**
 * @param rowCount count of rows
 * @param nullCount count of nulls
 * @param valueCount count of values
 * @param firstRowIndex the index of the first row in this page
 * @param repetitionLevels RLE encoded repetition levels
 * @param definitionLevels RLE encoded definition levels
 * @param dataEncoding encoding for the data
 * @param data data encoded with dataEncoding
 * @param statistics optional statistics for this page
 * @return an uncompressed page
 */
public static DataPageV2 uncompressed(
    int rowCount, int nullCount, int valueCount, long firstRowIndex,
    BytesInput repetitionLevels, BytesInput definitionLevels,
    Encoding dataEncoding, BytesInput data,
    Statistics<?> statistics) {
  return new DataPageV2(
      rowCount, nullCount, valueCount, firstRowIndex,
      repetitionLevels, definitionLevels,
      dataEncoding, data,
      Math.toIntExact(repetitionLevels.size() + definitionLevels.size() + data.size()),
      statistics,
      false);
}
 
Example #21
Source File: DataPageV2.java    From parquet-mr with Apache License 2.0
/**
 * @param rowCount count of rows
 * @param nullCount count of nulls
 * @param valueCount count of values
 * @param repetitionLevels RLE encoded repetition levels
 * @param definitionLevels RLE encoded definition levels
 * @param dataEncoding encoding for the data
 * @param data data encoded with dataEncoding and compressed
 * @param uncompressedSize total size uncompressed (rl + dl + data)
 * @param statistics optional statistics for this page
 * @return a compressed page
 */
public static DataPageV2 compressed(
    int rowCount, int nullCount, int valueCount,
    BytesInput repetitionLevels, BytesInput definitionLevels,
    Encoding dataEncoding, BytesInput data,
    int uncompressedSize,
    Statistics<?> statistics) {
  return new DataPageV2(
      rowCount, nullCount, valueCount,
      repetitionLevels, definitionLevels,
      dataEncoding, data,
      uncompressedSize,
      statistics,
      true);
}
 
Example #22
Source File: TestColumnChunkMetaData.java    From parquet-mr with Apache License 2.0
private ColumnChunkMetaData newMD(long big) {
  Set<Encoding> e = new HashSet<Encoding>();
  PrimitiveTypeName t = BINARY;
  ColumnPath p = ColumnPath.get("foo");
  CompressionCodecName c = CompressionCodecName.GZIP;
  BinaryStatistics s = new BinaryStatistics();
  ColumnChunkMetaData md = ColumnChunkMetaData.get(p, t, c, e, s,
                                                   big, 0, 0, 0, 0);
  return md;
}
 
Example #23
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * writes a single page
 * @param valueCount count of values
 * @param uncompressedPageSize the size of the data once uncompressed
 * @param bytes the compressed data for the page without header
 * @param statistics statistics for the page
 * @param rlEncoding encoding of the repetition level
 * @param dlEncoding encoding of the definition level
 * @param valuesEncoding encoding of values
 * @throws IOException if there is an error while writing
 * @deprecated this method does not support writing column indexes; Use
 *             {@link #writeDataPage(int, int, BytesInput, Statistics, long, Encoding, Encoding, Encoding)} instead
 */
@Deprecated
public void writeDataPage(
    int valueCount, int uncompressedPageSize,
    BytesInput bytes,
    Statistics statistics,
    Encoding rlEncoding,
    Encoding dlEncoding,
    Encoding valuesEncoding) throws IOException {
  // We are unable to build indexes without rowCount so skip them for this column
  offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder();
  columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder();
  innerWriteDataPage(valueCount, uncompressedPageSize, bytes, statistics, rlEncoding, dlEncoding, valuesEncoding);
}
 
Example #24
Source File: DictionaryFilter.java    From parquet-mr with Apache License 2.0
@SuppressWarnings("deprecation")
private static boolean hasNonDictionaryPages(ColumnChunkMetaData meta) {
  EncodingStats stats = meta.getEncodingStats();
  if (stats != null) {
    return stats.hasNonDictionaryEncodedPages();
  }

  // without EncodingStats, fall back to testing the encoding list
  Set<Encoding> encodings = new HashSet<Encoding>(meta.getEncodings());
  if (encodings.remove(Encoding.PLAIN_DICTIONARY)) {
    // if remove returned true, PLAIN_DICTIONARY was present, which means at
    // least one page was dictionary encoded and 1.0 encodings are used

    // RLE and BIT_PACKED are only used for repetition or definition levels
    encodings.remove(Encoding.RLE);
    encodings.remove(Encoding.BIT_PACKED);

    if (encodings.isEmpty()) {
      return false; // no encodings other than dictionary or rep/def levels
    }

    return true;

  } else {
    // if PLAIN_DICTIONARY wasn't present, then either the column is not
    // dictionary-encoded, or the 2.0 encoding, RLE_DICTIONARY, was used.
    // for 2.0, this cannot determine whether a page fell back without
    // page encoding stats
    return true;
  }
}
 
Example #25
Source File: TestStatisticsFilter.java    From parquet-mr with Apache License 2.0
private static ColumnChunkMetaData getDoubleColumnMeta(org.apache.parquet.column.statistics.Statistics<?> stats,
    long valueCount) {
  return ColumnChunkMetaData.get(ColumnPath.get("double", "column"),
      PrimitiveTypeName.DOUBLE,
      CompressionCodecName.GZIP,
      new HashSet<Encoding>(Arrays.asList(Encoding.PLAIN)),
      stats,
      0L, 0L, valueCount, 0L, 0L);
}
 
Example #26
Source File: DefaultValuesWriterFactory.java    From parquet-mr with Apache License 2.0
static ValuesWriter dictWriterWithFallBack(ColumnDescriptor path, ParquetProperties parquetProperties, Encoding dictPageEncoding, Encoding dataPageEncoding, ValuesWriter writerToFallBackTo) {
  if (parquetProperties.isDictionaryEnabled(path)) {
    return FallbackValuesWriter.of(
      dictionaryWriter(path, parquetProperties, dictPageEncoding, dataPageEncoding),
      writerToFallBackTo);
  } else {
    return writerToFallBackTo;
  }
}
 
Example #27
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testParquetMetadataConverterWithDictionary()
  throws IOException {
  ParquetMetadata parquetMetaData =
    createParquetMetaData(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN);

  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData);

  // Flag should be true
  fmd1.row_groups.forEach(rowGroup -> rowGroup.columns.forEach(column -> {
    assertTrue(column.meta_data.isSetDictionary_page_offset());
  }));

  ByteArrayOutputStream metaDataOutputStream = new ByteArrayOutputStream();
  Util.writeFileMetaData(fmd1, metaDataOutputStream);
  ByteArrayInputStream metaDataInputStream =
    new ByteArrayInputStream(metaDataOutputStream.toByteArray());
  FileMetaData fmd2 = Util.readFileMetaData(metaDataInputStream);
  ParquetMetadata parquetMetaDataConverted =
    converter.fromParquetMetadata(fmd2);

  long dicOffsetOriginal =
    parquetMetaData.getBlocks().get(0).getColumns().get(0)
      .getDictionaryPageOffset();
  long dicOffsetConverted =
    parquetMetaDataConverted.getBlocks().get(0).getColumns().get(0)
      .getDictionaryPageOffset();

  Assert.assertEquals(dicOffsetOriginal, dicOffsetConverted);
}
 
Example #28
Source File: Util.java    From parquet-mr with Apache License 2.0
public static String encodingStatsAsString(EncodingStats encodingStats) {
  StringBuilder sb = new StringBuilder();
  if (encodingStats.hasDictionaryPages()) {
    for (Encoding encoding: encodingStats.getDictionaryEncodings()) {
      sb.append(encodingAsString(encoding, true));
    }
    sb.append(" ");
  } else {
    sb.append("  ");
  }

  Set<Encoding> encodings = encodingStats.getDataEncodings();
  if (encodings.contains(RLE_DICTIONARY) || encodings.contains(PLAIN_DICTIONARY)) {
    sb.append("R");
  }
  if (encodings.contains(PLAIN)) {
    sb.append("_");
  }
  if (encodings.contains(DELTA_BYTE_ARRAY) ||
      encodings.contains(DELTA_BINARY_PACKED) ||
      encodings.contains(DELTA_LENGTH_BYTE_ARRAY)) {
    sb.append("D");
  }

  // Check for fallback and add a flag
  if (encodingStats.hasDictionaryEncodedPages() && encodingStats.hasNonDictionaryEncodedPages()) {
    sb.append(" F");
  }

  return sb.toString();
}
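Hand-tracing the method above for one illustrative column chunk (an illustration, not captured output): PLAIN-encoded dictionary pages, data pages that mix RLE_DICTIONARY and PLAIN, and therefore both dictionary- and non-dictionary-encoded pages:

// dictionary pages: PLAIN                     -> appends "_" then " "
// data pages contain RLE_DICTIONARY           -> appends "R"
// data pages contain PLAIN                    -> appends "_"
// dictionary and non-dictionary pages coexist -> appends " F" (dictionary fallback)
// result: "_ R_ F"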
 
Example #29
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * writes a single page
 * @param valueCount count of values
 * @param uncompressedPageSize the size of the data once uncompressed
 * @param bytes the compressed data for the page without header
 * @param rlEncoding encoding of the repetition level
 * @param dlEncoding encoding of the definition level
 * @param valuesEncoding encoding of values
 * @throws IOException if there is an error while writing
 */
@Deprecated
public void writeDataPage(
    int valueCount, int uncompressedPageSize,
    BytesInput bytes,
    Encoding rlEncoding,
    Encoding dlEncoding,
    Encoding valuesEncoding) throws IOException {
  state = state.write();
  // We are unable to build indexes without rowCount so skip them for this column
  offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder();
  columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder();
  long beforeHeader = out.getPos();
  LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
  int compressedPageSize = (int)bytes.size();
  metadataConverter.writeDataPageV1Header(
      uncompressedPageSize, compressedPageSize,
      valueCount,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      out);
  long headerSize = out.getPos() - beforeHeader;
  this.uncompressedLength += uncompressedPageSize + headerSize;
  this.compressedLength += compressedPageSize + headerSize;
  LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize);
  bytes.writeAllTo(out);
  encodingStatsBuilder.addDataEncoding(valuesEncoding);
  currentEncodings.add(rlEncoding);
  currentEncodings.add(dlEncoding);
  currentEncodings.add(valuesEncoding);
}
 
Example #30
Source File: TestParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private ColumnChunkMetaData createColumnChunkMetaData() {
  Set<org.apache.parquet.column.Encoding> e = new HashSet<org.apache.parquet.column.Encoding>();
  PrimitiveTypeName t = PrimitiveTypeName.BINARY;
  ColumnPath p = ColumnPath.get("foo");
  CompressionCodecName c = CompressionCodecName.GZIP;
  BinaryStatistics s = new BinaryStatistics();
  ColumnChunkMetaData md = ColumnChunkMetaData.get(p, t, c, e, s,
          0, 0, 0, 0, 0);
  return md;
}