org.apache.parquet.column.Dictionary Java Examples

The following examples show how to use org.apache.parquet.column.Dictionary. Vote up the examples you find useful and vote down the ones you don't; the links above each example lead to the original project or source file. Related API usage is listed in the sidebar.
Example #1
Source File: TestGlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the local dictionaries of the four phonebook fixture files and verifies, per file,
 * the number of columns that carry a dictionary (pair key) and the number of columns
 * reported as skipped (pair value).
 */
@Test
public void testLocalDictionaries() throws IOException {
  try (final BufferAllocator allocator = allocatorRule.newAllocator("test-global-dictionary-builder", 0, Long.MAX_VALUE)) {
    final CompressionCodecFactory codecs =
      CodecFactory.createDirectCodecFactory(conf, new ParquetDirectByteBufferAllocator(allocator), 0);

    // Files 1-3 live directly in the table directory, file 4 in the partition directory.
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> phonebook1 =
      LocalDictionariesReader.readDictionaries(fs, tableDirPath.resolve("phonebook1.parquet"), codecs);
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> phonebook2 =
      LocalDictionariesReader.readDictionaries(fs, tableDirPath.resolve("phonebook2.parquet"), codecs);
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> phonebook3 =
      LocalDictionariesReader.readDictionaries(fs, tableDirPath.resolve("phonebook3.parquet"), codecs);
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> phonebook4 =
      LocalDictionariesReader.readDictionaries(fs, partitionDirPath.resolve("phonebook4.parquet"), codecs);

    // Number of dictionary-encoded columns found in each file.
    assertEquals(2, phonebook1.getKey().size()); // both name and kind are dictionary encoded
    assertEquals(1, phonebook2.getKey().size());
    assertEquals(1, phonebook3.getKey().size());
    assertEquals(1, phonebook4.getKey().size());

    // Number of columns reported as not dictionary encoded in each file.
    assertEquals(0, phonebook1.getValue().size());
    assertEquals(1, phonebook2.getValue().size()); // name is skipped
    assertEquals(1, phonebook3.getValue().size()); // name is skipped
    assertEquals(1, phonebook4.getValue().size()); // name is skipped
  }
}
 
Example #2
Source File: GlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a new version of the global dictionaries for a parquet table.
 * The local dictionaries of every file matched by {@code PARQUET_FILES_FILTER} under
 * {@code tableDir} are read, one dictionary file per column is written below a temporary
 * root directory, and that directory is then turned into the versioned root path.
 * @param codecFactory compression codec factory used while reading the local dictionaries
 * @param fs filesystem
 * @param tableDir root directory for given table that has parquet files
 * @param bufferAllocator memory allocator
 * @return GlobalDictionariesInfo holding the per-column dictionary file paths, the final
 *         dictionary root directory and the new dictionary version.
 * @throws IOException if one of the filesystem or dictionary helpers fails
 */
public static GlobalDictionariesInfo createGlobalDictionaries(CompressionCodecFactory codecFactory,
    FileSystem fs, Path tableDir, BufferAllocator bufferAllocator) throws IOException {
  final Map<ColumnDescriptor, List<Dictionary>> localDictionariesByColumn;
  try (final DirectoryStream<FileAttributes> parquetFiles = fs.list(tableDir, PARQUET_FILES_FILTER)) {
    localDictionariesByColumn = readLocalDictionaries(codecFactory, fs, parquetFiles, bufferAllocator);
  }

  // The new version is one past whatever version is currently recorded for the table.
  final long nextVersion = getDictionaryVersion(fs, tableDir) + 1;
  final Path tmpRootDir = createTempRootDir(fs, tableDir, nextVersion);
  logger.debug("Building global dictionaries for columns {} with version {}", localDictionariesByColumn.keySet(), nextVersion);

  // Write one dictionary file per column from that column's local dictionaries.
  final Map<ColumnDescriptor, Path> dictionaryFiles = Maps.newHashMap();
  for (Map.Entry<ColumnDescriptor, List<Dictionary>> columnEntry : localDictionariesByColumn.entrySet()) {
    final ColumnDescriptor column = columnEntry.getKey();
    final List<Dictionary> columnDictionaries = columnEntry.getValue();
    final Path targetFile = dictionaryFilePath(tmpRootDir, column);
    logger.debug("Creating a new global dictionary for {} with version {}", column.toString(), nextVersion);
    // NOTE(review): the null argument presumably means "no existing dictionary to merge" - confirm.
    createDictionaryFile(fs, targetFile, column, columnDictionaries, null, bufferAllocator);
    dictionaryFiles.put(column, targetFile);
  }

  final Path versionedRootDir = createDictionaryVersionedRootPath(fs, tableDir, nextVersion, tmpRootDir);
  return new GlobalDictionariesInfo(dictionaryFiles, versionedRootDir,  nextVersion);
}
 
Example #3
Source File: ParquetUtil.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the dictionary page from the page source and decodes it for the given column.
 *
 * @param desc descriptor of the column the dictionary belongs to
 * @param pageSource reader the dictionary page is taken from
 * @return the decoded dictionary, or {@code null} when the source has no dictionary page
 * @throws ParquetDecodingException if decoding the dictionary page fails with an IOException
 */
public static Dictionary readDictionary(ColumnDescriptor desc, PageReader pageSource) {
  final DictionaryPage page = pageSource.readDictionaryPage();
  if (page == null) {
    return null;
  }
  try {
    return page.getEncoding().initDictionary(desc, page);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not decode the dictionary for " + desc, e);
  }
}
 
Example #4
Source File: LocalDictionariesReader.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Command-line entry point: prints every dictionary found in the parquet file named by
 * {@code args[0]} on the local filesystem, followed by the columns that are not dictionary
 * encoded. An IOException is logged rather than propagated.
 *
 * @param args {@code args[0]} is the path of the parquet file to inspect
 */
public static void main(String[] args) {
  try (final BufferAllocator rootAllocator = new RootAllocator(VM.getMaxDirectMemory())) {
    final Configuration hadoopConf = new Configuration();
    final FileSystem localFs = HadoopFileSystem.getLocal(hadoopConf);
    final Path parquetFile = Path.of(args[0]);
    final CompressionCodecFactory codecs = CodecFactory.createDirectCodecFactory(
      hadoopConf, new ParquetDirectByteBufferAllocator(rootAllocator), 0);

    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> result =
      readDictionaries(localFs, parquetFile, codecs);
    final Map<ColumnDescriptor, Dictionary> found = result.getLeft();
    for (Map.Entry<ColumnDescriptor, Dictionary> columnAndDictionary : found.entrySet()) {
      printDictionary(columnAndDictionary.getKey(), columnAndDictionary.getValue());
    }
    System.out.println("Binary columns which are not dictionary encoded: " + result.getRight());
  } catch (IOException ioe) {
    logger.error("Failed ", ioe);
  }
}
 
Example #5
Source File: LocalDictionariesReader.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the dictionary page located at the offset recorded in {@code pageHeader},
 * decompresses it and decodes it into a {@link Dictionary} for the given column.
 *
 * @param in stream positioned by this method at the page offset
 * @param column descriptor of the column the dictionary belongs to
 * @param pageHeader dictionary page header together with its offset in the file
 * @param decompressor decompressor applied to the raw page bytes
 * @return the decoded dictionary
 * @throws IOException if the stream ends before the whole compressed page was read,
 *         or if reading, decompressing or decoding fails
 */
public static Dictionary readDictionary(FSInputStream in, ColumnDescriptor column, PageHeaderWithOffset pageHeader, BytesInputDecompressor decompressor) throws IOException {
  in.setPosition(pageHeader.getOffset());
  final byte[] data = new byte[pageHeader.getPageHeader().getCompressed_page_size()];
  // A single read() may legally return fewer bytes than requested, so keep reading
  // until the buffer is full or the stream signals end-of-file.
  int read = 0;
  while (read < data.length) {
    final int chunk = in.read(data, read, data.length - read);
    if (chunk < 0) {
      break;
    }
    read += chunk;
  }
  if (read != data.length) {
    throw new IOException(format("Failed to read dictionary page, read %d bytes, expected %d", read, data.length));
  }
  final DictionaryPage dictionaryPage = new DictionaryPage(
    decompressor.decompress(BytesInput.from(data), pageHeader.getPageHeader().getUncompressed_page_size()),
    pageHeader.getPageHeader().getDictionary_page_header().getNum_values(),
    CONVERTER.getEncoding(pageHeader.getPageHeader().getDictionary_page_header().getEncoding()));
  return dictionaryPage.getEncoding().initDictionary(column, dictionaryPage);
}
 
Example #6
Source File: VectorizedDictionaryEncodedParquetValuesReader.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decodes {@code numValuesToRead} dictionary ids through {@code dict} as longs and writes
 * them into the vector's data buffer at byte position {@code index * typeWidth}, starting
 * at index {@code startOffset}. Every written position is marked not-null.
 * In RLE mode the current value is decoded for each position of the run; in PACKED mode
 * consecutive entries of the packed values buffer are consumed.
 */
void readBatchOfDictionaryEncodedLongs(FieldVector vector, int startOffset, int numValuesToRead, Dictionary dict,
                                       NullabilityHolder nullabilityHolder, int typeWidth) {
  int remaining = numValuesToRead;
  int rowIdx = startOffset;
  while (remaining > 0) {
    // Fetch the next group once the current one is exhausted.
    if (this.currentCount == 0) {
      this.readNextGroup();
    }
    final int batch = Math.min(remaining, this.currentCount);
    final int batchEnd = rowIdx + batch;
    switch (mode) {
      case RLE:
        for (; rowIdx < batchEnd; rowIdx++) {
          vector.getDataBuffer().setLong(rowIdx * typeWidth, dict.decodeToLong(currentValue));
          setNotNull(vector, nullabilityHolder, rowIdx);
        }
        break;
      case PACKED:
        for (; rowIdx < batchEnd; rowIdx++) {
          vector.getDataBuffer()
              .setLong(rowIdx * typeWidth, dict.decodeToLong(packedValuesBuffer[packedValuesBufferIdx++]));
          setNotNull(vector, nullabilityHolder, rowIdx);
        }
        break;
    }
    remaining -= batch;
    currentCount -= batch;
  }
}
 
Example #7
Source File: VectorizedDictionaryEncodedParquetValuesReader.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decodes {@code numValuesToRead} dictionary ids through {@code dict} as longs, multiplies
 * each by 1000 and writes the result into the vector's data buffer at byte position
 * {@code index * typeWidth}, starting at index {@code startOffset}. Every written position
 * is marked not-null.
 * NOTE(review): the factor 1000 presumably converts milliseconds to microseconds - confirm.
 */
void readBatchOfDictionaryEncodedTimestampMillis(
    FieldVector vector, int startOffset, int numValuesToRead,
    Dictionary dict, NullabilityHolder nullabilityHolder, int typeWidth) {
  int remaining = numValuesToRead;
  int rowIdx = startOffset;
  while (remaining > 0) {
    // Fetch the next group once the current one is exhausted.
    if (this.currentCount == 0) {
      this.readNextGroup();
    }
    final int batch = Math.min(remaining, this.currentCount);
    final int batchEnd = rowIdx + batch;
    switch (mode) {
      case RLE:
        for (; rowIdx < batchEnd; rowIdx++) {
          vector.getDataBuffer().setLong(rowIdx * typeWidth, dict.decodeToLong(currentValue) * 1000);
          setNotNull(vector, nullabilityHolder, rowIdx);
        }
        break;
      case PACKED:
        for (; rowIdx < batchEnd; rowIdx++) {
          vector.getDataBuffer()
              .setLong(rowIdx * typeWidth, dict.decodeToLong(packedValuesBuffer[packedValuesBufferIdx++]) * 1000);
          setNotNull(vector, nullabilityHolder, rowIdx);
        }
        break;
    }
    remaining -= batch;
    currentCount -= batch;
  }
}
 
Example #8
Source File: VectorizedDictionaryEncodedParquetValuesReader.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decodes {@code numValuesToRead} dictionary ids through {@code dict} as ints and writes
 * them into the vector's data buffer at byte position {@code index * typeWidth}, starting
 * at index {@code startOffset}. Every written position is marked not-null.
 * In RLE mode the current value is decoded for each position of the run; in PACKED mode
 * consecutive entries of the packed values buffer are consumed.
 */
void readBatchOfDictionaryEncodedIntegers(FieldVector vector, int startOffset, int numValuesToRead, Dictionary dict,
                                          NullabilityHolder nullabilityHolder, int typeWidth) {
  int remaining = numValuesToRead;
  int rowIdx = startOffset;
  while (remaining > 0) {
    // Fetch the next group once the current one is exhausted.
    if (this.currentCount == 0) {
      this.readNextGroup();
    }
    final int batch = Math.min(remaining, this.currentCount);
    final int batchEnd = rowIdx + batch;
    switch (mode) {
      case RLE:
        for (; rowIdx < batchEnd; rowIdx++) {
          vector.getDataBuffer().setInt(rowIdx * typeWidth, dict.decodeToInt(currentValue));
          setNotNull(vector, nullabilityHolder, rowIdx);
        }
        break;
      case PACKED:
        for (; rowIdx < batchEnd; rowIdx++) {
          vector.getDataBuffer()
              .setInt(rowIdx * typeWidth, dict.decodeToInt(packedValuesBuffer[packedValuesBufferIdx++]));
          setNotNull(vector, nullabilityHolder, rowIdx);
        }
        break;
    }
    remaining -= batch;
    currentCount -= batch;
  }
}
 
Example #9
Source File: VectorizedDictionaryEncodedParquetValuesReader.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decodes {@code numValuesToRead} dictionary ids through {@code dict} as floats and writes
 * them into the vector's data buffer at byte position {@code index * typeWidth}, starting
 * at index {@code startOffset}. Every written position is marked not-null.
 * In RLE mode the current value is decoded for each position of the run; in PACKED mode
 * consecutive entries of the packed values buffer are consumed.
 */
void readBatchOfDictionaryEncodedFloats(FieldVector vector, int startOffset, int numValuesToRead, Dictionary dict,
                                        NullabilityHolder nullabilityHolder, int typeWidth) {
  int remaining = numValuesToRead;
  int rowIdx = startOffset;
  while (remaining > 0) {
    // Fetch the next group once the current one is exhausted.
    if (this.currentCount == 0) {
      this.readNextGroup();
    }
    final int batch = Math.min(remaining, this.currentCount);
    final int batchEnd = rowIdx + batch;
    switch (mode) {
      case RLE:
        for (; rowIdx < batchEnd; rowIdx++) {
          vector.getDataBuffer().setFloat(rowIdx * typeWidth, dict.decodeToFloat(currentValue));
          setNotNull(vector, nullabilityHolder, rowIdx);
        }
        break;
      case PACKED:
        for (; rowIdx < batchEnd; rowIdx++) {
          vector.getDataBuffer()
              .setFloat(rowIdx * typeWidth, dict.decodeToFloat(packedValuesBuffer[packedValuesBufferIdx++]));
          setNotNull(vector, nullabilityHolder, rowIdx);
        }
        break;
    }
    remaining -= batch;
    currentCount -= batch;
  }
}
 
Example #10
Source File: VectorizedDictionaryEncodedParquetValuesReader.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decodes {@code numValuesToRead} dictionary ids through {@code dict} as doubles and writes
 * them into the vector's data buffer at byte position {@code index * typeWidth}, starting
 * at index {@code startOffset}. Every written position is marked not-null.
 * In RLE mode the current value is decoded for each position of the run; in PACKED mode
 * consecutive entries of the packed values buffer are consumed.
 */
void readBatchOfDictionaryEncodedDoubles(FieldVector vector, int startOffset, int numValuesToRead, Dictionary dict,
                                         NullabilityHolder nullabilityHolder, int typeWidth) {
  int remaining = numValuesToRead;
  int rowIdx = startOffset;
  while (remaining > 0) {
    // Fetch the next group once the current one is exhausted.
    if (this.currentCount == 0) {
      this.readNextGroup();
    }
    final int batch = Math.min(remaining, this.currentCount);
    final int batchEnd = rowIdx + batch;
    switch (mode) {
      case RLE:
        for (; rowIdx < batchEnd; rowIdx++) {
          vector.getDataBuffer().setDouble(rowIdx * typeWidth, dict.decodeToDouble(currentValue));
          setNotNull(vector, nullabilityHolder, rowIdx);
        }
        break;
      case PACKED:
        for (; rowIdx < batchEnd; rowIdx++) {
          vector.getDataBuffer()
              .setDouble(rowIdx * typeWidth, dict.decodeToDouble(packedValuesBuffer[packedValuesBufferIdx++]));
          setNotNull(vector, nullabilityHolder, rowIdx);
        }
        break;
    }
    remaining -= batch;
    currentCount -= batch;
  }
}
 
Example #11
Source File: VectorizedDictionaryEncodedParquetValuesReader.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Decodes {@code numValuesToRead} dictionary ids through {@code dict} and stores them in
 * the vector, which is cast to {@link DecimalVector}, starting at index {@code startOffset}.
 * The id is decoded as an int when {@code typeWidth == Integer.BYTES} and as a long
 * otherwise; either way the value is handed to the vector as a long. Every written position
 * is marked not-null on {@code nullabilityHolder}.
 */
void readBatchOfDictionaryEncodedIntLongBackedDecimals(FieldVector vector, int typeWidth, int startOffset,
                                                       int numValuesToRead, Dictionary dict,
                                                       NullabilityHolder nullabilityHolder) {
  final boolean intBacked = typeWidth == Integer.BYTES;
  int remaining = numValuesToRead;
  int rowIdx = startOffset;
  while (remaining > 0) {
    // Fetch the next group once the current one is exhausted.
    if (this.currentCount == 0) {
      this.readNextGroup();
    }
    final int batch = Math.min(remaining, this.currentCount);
    final int batchEnd = rowIdx + batch;
    switch (mode) {
      case RLE:
        for (; rowIdx < batchEnd; rowIdx++) {
          final DecimalVector decimals = (DecimalVector) vector;
          final long unscaled = intBacked ? dict.decodeToInt(currentValue) : dict.decodeToLong(currentValue);
          decimals.set(rowIdx, unscaled);
          nullabilityHolder.setNotNull(rowIdx);
        }
        break;
      case PACKED:
        for (; rowIdx < batchEnd; rowIdx++) {
          final DecimalVector decimals = (DecimalVector) vector;
          // Exactly one packed entry is consumed per position, whichever branch is taken.
          final long unscaled = intBacked
              ? dict.decodeToInt(packedValuesBuffer[packedValuesBufferIdx++])
              : dict.decodeToLong(packedValuesBuffer[packedValuesBufferIdx++]);
          decimals.set(rowIdx, unscaled);
          nullabilityHolder.setNotNull(rowIdx);
        }
        break;
    }
    remaining -= batch;
    currentCount -= batch;
  }
}
 
Example #12
Source File: ArrowVectorAccessors.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Picks the accessor for the vector held by {@code holder}: a dictionary accessor when the
 * holder reports the vector as dictionary encoded, a plain accessor otherwise.
 *
 * @param holder source of the vector, its column descriptor and its dictionary
 * @return the accessor matching the holder's encoding
 */
static ArrowVectorAccessor getVectorAccessor(VectorHolder holder) {
  // All holder properties are read up front, regardless of which branch is taken.
  final Dictionary dict = holder.dictionary();
  final boolean dictEncoded = holder.isDictionaryEncoded();
  final ColumnDescriptor descriptor = holder.descriptor();
  final FieldVector fieldVector = holder.vector();
  final PrimitiveType primitiveType = descriptor.getPrimitiveType();
  if (!dictEncoded) {
    return getPlainVectorAccessor(fieldVector);
  }
  return getDictionaryVectorAccessor(dict, descriptor, fieldVector, primitiveType);
}
 
Example #13
Source File: LocalDictionariesReader.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Prints every entry of a dictionary to standard output as {@code id: value}, decoding the
 * value according to the column's primitive type. Types without a case print nothing.
 *
 * @param columnDescriptor column the dictionary belongs to; its type selects the decoder
 * @param localDictionary dictionary whose entries are printed
 */
public static void printDictionary(ColumnDescriptor columnDescriptor, Dictionary localDictionary) {
  System.out.println("Dictionary for column " + columnDescriptor.toString());
  // getMaxId() is the highest valid id, so the bound must be inclusive; the previous
  // exclusive bound silently dropped the last dictionary entry.
  for (int i = 0; i <= localDictionary.getMaxId(); ++i) {
    switch (columnDescriptor.getType()) {
      case INT32:
        System.out.println(format("%d: %d", i, localDictionary.decodeToInt(i)));
        break;
      case INT64:
        System.out.println(format("%d: %d", i, localDictionary.decodeToLong(i)));
        break;
      case INT96:
      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
        // Decode explicitly as UTF-8 instead of relying on the platform default charset.
        System.out.println(format("%d: %s", i, localDictionary.decodeToBinary(i).toStringUsingUTF8()));
        break;
      case FLOAT:
        System.out.println(format("%d: %f", i, localDictionary.decodeToFloat(i)));
        break;
      case DOUBLE:
        System.out.println(format("%d: %f", i, localDictionary.decodeToDouble(i)));
        break;
      case BOOLEAN:
        System.out.println(format("%d: %b", i, localDictionary.decodeToBoolean(i)));
        break;
      default:
        break;
    }
  }
}
 
Example #14
Source File: LocalDictionariesReader.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the dictionaries of the binary columns in the single row group of a parquet file.
 * A column chunk whose first page is a dictionary page contributes its dictionary; any
 * other binary column chunk is reported in the skipped set.
 * NOTE(review): a file with zero row groups would fail on {@code getBlocks().get(0)} - confirm
 * whether such files can reach this method.
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @param codecFactory factory providing the decompressor for each column chunk's codec
 * @return pair of dictionaries found for binary columns and the set of binary columns which are not dictionary encoded.
 * @throws IOException if the file has more than one row group or reading fails
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CompressionCodecFactory codecFactory) throws IOException {
  // Passing the max footer length is not required in this case as the parquet reader would already have failed.
  final ParquetMetadata footer = SingletonParquetFooterCache.readFooter(fs, filePath, ParquetMetadataConverter.NO_FILTER,
    ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  final int rowGroupCount = footer.getBlocks().size();
  if (rowGroupCount > 1) {
    throw new IOException(
      format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
        rowGroupCount, filePath));
  }
  final BlockMetaData rowGroup = footer.getBlocks().get(0);

  // Index the schema's column descriptors by path so chunks can be mapped back to them.
  final Map<ColumnPath, ColumnDescriptor> descriptorsByPath = Maps.newHashMap();
  for (ColumnDescriptor descriptor : footer.getFileMetaData().getSchema().getColumns()) {
    descriptorsByPath.put(ColumnPath.get(descriptor.getPath()), descriptor);
  }

  final Set<ColumnDescriptor> notDictionaryEncoded = Sets.newHashSet(); // present in the file but without a leading dictionary page
  final Map<ColumnDescriptor, Dictionary> found = Maps.newHashMap();
  try(final FSInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
      if (!isBinaryType(chunk.getType())) {
        continue;
      }
      final ColumnDescriptor column = descriptorsByPath.get(chunk.getPath());
      // Only the first page decides: dictionary page -> load it, anything else -> skip the column.
      final PageHeaderWithOffset firstPage = chunk.getPageHeaders().get(0);
      if (PageType.DICTIONARY_PAGE != firstPage.getPageHeader().getType()) {
        notDictionaryEncoded.add(column);
        continue;
      }
      found.put(column, readDictionary(in, column, firstPage, codecFactory.getDecompressor(chunk.getCodec())));
    }
  }
  return new ImmutablePair<>(found, notDictionaryEncoded);
}
 
Example #15
Source File: ColumnIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the dictionary page from the page source and decodes it for the given column.
 *
 * @param desc descriptor of the column the dictionary belongs to
 * @param pageSource reader the dictionary page is taken from
 * @return the decoded dictionary, or {@code null} when the source has no dictionary page
 * @throws ParquetDecodingException if decoding the dictionary page fails with an IOException
 */
private static Dictionary readDictionary(ColumnDescriptor desc, PageReader pageSource) {
  final DictionaryPage page = pageSource.readDictionaryPage();
  if (page == null) {
    return null;
  }
  try {
    return page.getEncoding().initDictionary(desc, page);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not decode the dictionary for " + desc, e);
  }
}
 
Example #16
Source File: GlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Collects, per column, the local dictionaries of all given files. A column that is reported
 * as not dictionary encoded by any file is dropped from the result, including dictionaries
 * gathered for it from earlier files, and is ignored in later files.
 * The {@code allocator} parameter is not used by this method.
 *
 * @return map from column to the list of its local dictionaries, one per file that had one
 * @throws IOException if reading the dictionaries of a file fails
 */
private static Map<ColumnDescriptor, List<Dictionary>> readLocalDictionaries(CompressionCodecFactory codecFactory, FileSystem fs, Iterable<FileAttributes> files, BufferAllocator allocator) throws IOException{
  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // not dictionary encoded in at least one file
  final Map<ColumnDescriptor, List<Dictionary>> collected = Maps.newHashMap();
  for (FileAttributes file : files) {
    logger.debug("Scanning file {}", file.getPath());
    final Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> perFile =
      LocalDictionariesReader.readDictionaries(fs, file.getPath(), codecFactory);

    // Ban every column this file reports as not encoded and discard what was gathered for it.
    columnsToSkip.addAll(perFile.getRight());
    for (ColumnDescriptor banned : perFile.getRight()) {
      collected.remove(banned);
    }

    for (final Map.Entry<ColumnDescriptor, Dictionary> found : perFile.getLeft().entrySet()) {
      final ColumnDescriptor column = found.getKey();
      if (columnsToSkip.contains(column)) {
        continue;
      }
      final List<Dictionary> soFar = collected.get(column);
      if (soFar == null) {
        collected.put(column, Lists.newArrayList(found.getValue()));
      } else {
        soFar.add(found.getValue());
      }
    }
  }
  logger.debug("Skipping columns {}", columnsToSkip);
  return collected;
}
 
Example #17
Source File: GlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a single-column container holding the sorted, de-duplicated union of all binary
 * values from the given dictionaries and, when present, from {@code existingDict}.
 *
 * @param dictionaries local dictionaries whose entries are merged
 * @param existingDict previously built dictionary to merge in, or {@code null}
 * @param columnDescriptor column whose path names the output field
 * @param bufferAllocator allocator backing the returned container
 * @return container with one nullable binary vector and a built schema
 */
private static VectorContainer buildBinaryGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final Field outputField = new Field(SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath(), true, new ArrowType.Binary(), null);
  final VectorContainer output = new VectorContainer(bufferAllocator);
  final VarBinaryVector outputVector = output.addOrGet(outputField);
  outputVector.allocateNew();

  // The sorted set both orders the values and removes duplicates across dictionaries.
  final SortedSet<Binary> merged = new TreeSet<>();
  for (Dictionary local : dictionaries) {
    for (int id = 0; id <= local.getMaxId(); ++id) {
      merged.add(local.decodeToBinary(id));
    }
  }
  if (existingDict != null) {
    final VarBinaryVector previous = existingDict.getValueAccessorById(VarBinaryVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(Binary.fromConstantByteArray(previous.get(row)));
    }
  }

  int written = 0;
  for (Binary value : merged) {
    final byte[] bytes = value.getBytes();
    outputVector.setSafe(written, bytes, 0, bytes.length);
    written++;
  }
  outputVector.setValueCount(written);
  output.setRecordCount(written);
  output.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return output;
}
 
Example #18
Source File: GlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a single-column container holding the sorted, de-duplicated union of all int
 * values from the given dictionaries and, when present, from {@code existingDict}.
 *
 * @param dictionaries local dictionaries whose entries are merged
 * @param existingDict previously built dictionary to merge in, or {@code null}
 * @param columnDescriptor column whose path names the output field
 * @param bufferAllocator allocator backing the returned container
 * @return container with one nullable signed 32-bit int vector and a built schema
 */
private static VectorContainer buildIntegerGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final Field outputField = new Field(SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath(), true, new ArrowType.Int(32, true), null);
  final VectorContainer output = new VectorContainer(bufferAllocator);
  final IntVector outputVector = output.addOrGet(outputField);
  outputVector.allocateNew();

  // The sorted set both orders the values and removes duplicates across dictionaries.
  final SortedSet<Integer> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    for (int id = 0; id <= local.getMaxId(); ++id) {
      merged.add(local.decodeToInt(id));
    }
  }
  if (existingDict != null) {
    final IntVector previous = existingDict.getValueAccessorById(IntVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.get(row));
    }
  }

  int written = 0;
  for (Integer value : merged) {
    outputVector.setSafe(written, value);
    written++;
  }
  outputVector.setValueCount(written);
  output.setRecordCount(written);
  output.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return output;
}
 
Example #19
Source File: GlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a single-column container holding the sorted, de-duplicated union of all long
 * values from the given dictionaries and, when present, from {@code existingDict}.
 *
 * @param dictionaries local dictionaries whose entries are merged
 * @param existingDict previously built dictionary to merge in, or {@code null}
 * @param columnDescriptor column whose path names the output field
 * @param bufferAllocator allocator backing the returned container
 * @return container with one nullable signed 64-bit int vector and a built schema
 */
private static VectorContainer buildLongGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final Field outputField = new Field(SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath(), true, new ArrowType.Int(64, true), null);
  final VectorContainer output = new VectorContainer(bufferAllocator);
  final BigIntVector outputVector = output.addOrGet(outputField);
  outputVector.allocateNew();

  // The sorted set both orders the values and removes duplicates across dictionaries.
  final SortedSet<Long> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    for (int id = 0; id <= local.getMaxId(); ++id) {
      merged.add(local.decodeToLong(id));
    }
  }
  if (existingDict != null) {
    final BigIntVector previous = existingDict.getValueAccessorById(BigIntVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.get(row));
    }
  }

  int written = 0;
  for (Long value : merged) {
    outputVector.setSafe(written, value);
    written++;
  }
  outputVector.setValueCount(written);
  output.setRecordCount(written);
  output.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return output;
}
 
Example #20
Source File: GlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a single-column container holding the sorted, de-duplicated union of all double
 * values from the given dictionaries and, when present, from {@code existingDict}.
 *
 * @param dictionaries local dictionaries whose entries are merged
 * @param existingDict previously built dictionary to merge in, or {@code null}
 * @param columnDescriptor column whose path names the output field
 * @param bufferAllocator allocator backing the returned container
 * @return container with one nullable double-precision vector and a built schema
 */
private static VectorContainer buildDoubleGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final Field outputField = new Field(SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath(), true, new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null);
  final VectorContainer output = new VectorContainer(bufferAllocator);
  final Float8Vector outputVector = output.addOrGet(outputField);
  outputVector.allocateNew();

  // The sorted set both orders the values and removes duplicates across dictionaries.
  final SortedSet<Double> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    for (int id = 0; id <= local.getMaxId(); ++id) {
      merged.add(local.decodeToDouble(id));
    }
  }
  if (existingDict != null) {
    final Float8Vector previous = existingDict.getValueAccessorById(Float8Vector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.get(row));
    }
  }

  int written = 0;
  for (Double value : merged) {
    outputVector.setSafe(written, value);
    written++;
  }
  outputVector.setValueCount(written);
  output.setRecordCount(written);
  output.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return output;
}
 
Example #21
Source File: GlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a single-column container holding the sorted, de-duplicated union of all float
 * values from the given dictionaries and, when present, from {@code existingDict}.
 *
 * @param dictionaries local dictionaries whose entries are merged
 * @param existingDict previously built dictionary to merge in, or {@code null}
 * @param columnDescriptor column whose path names the output field
 * @param bufferAllocator allocator backing the returned container
 * @return container with one nullable single-precision vector and a built schema
 */
private static VectorContainer buildFloatGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final Field outputField = new Field(SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath(), true, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null);
  final VectorContainer output = new VectorContainer(bufferAllocator);
  final Float4Vector outputVector = output.addOrGet(outputField);
  outputVector.allocateNew();

  // The sorted set both orders the values and removes duplicates across dictionaries.
  final SortedSet<Float> merged = Sets.newTreeSet();
  for (Dictionary local : dictionaries) {
    for (int id = 0; id <= local.getMaxId(); ++id) {
      merged.add(local.decodeToFloat(id));
    }
  }
  if (existingDict != null) {
    final Float4Vector previous = existingDict.getValueAccessorById(Float4Vector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.get(row));
    }
  }

  int written = 0;
  for (Float value : merged) {
    outputVector.setSafe(written, value);
    written++;
  }
  outputVector.setValueCount(written);
  output.setRecordCount(written);
  output.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return output;
}
 
Example #22
Source File: ArrowVectorAccessors.java    From iceberg with Apache License 2.0 5 votes vote down vote up
DictionaryLongAccessor(IntVector vector, Dictionary dictionary) {
  super(vector);
  this.offsetVector = vector;
  this.decodedDictionary = IntStream.rangeClosed(0, dictionary.getMaxId())
      .mapToLong(dictionary::decodeToLong)
      .toArray();
}
 
Example #23
Source File: ArrowVectorAccessors.java    From iceberg with Apache License 2.0 5 votes vote down vote up
DictionaryFloatAccessor(IntVector vector, Dictionary dictionary) {
  super(vector);
  this.offsetVector = vector;
  this.decodedDictionary = new float[dictionary.getMaxId() + 1];
  for (int i = 0; i <= dictionary.getMaxId(); i++) {
    decodedDictionary[i] = dictionary.decodeToFloat(i);
  }
}
 
Example #24
Source File: TestDictionary.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Closes the writer into a dictionary page, decodes a copy of that page with the PLAIN
 * encoding for a column named "foo" of the given type, and wraps the resulting dictionary
 * in a values reader.
 */
private DictionaryValuesReader initDicReader(ValuesWriter cw, PrimitiveTypeName type)
    throws IOException {
  final DictionaryPage pageCopy = cw.toDictPageAndClose().copy();
  final ColumnDescriptor fooColumn = new ColumnDescriptor(new String[] {"foo"}, type, 0, 0);
  final Dictionary decoded = PLAIN.initDictionary(fooColumn, pageCopy);
  return new DictionaryValuesReader(decoded);
}
 
Example #25
Source File: ArrowVectorAccessors.java    From iceberg with Apache License 2.0 5 votes vote down vote up
DictionaryDoubleAccessor(IntVector vector, Dictionary dictionary) {
  super(vector);
  this.offsetVector = vector;
  this.decodedDictionary = IntStream.rangeClosed(0, dictionary.getMaxId())
      .mapToDouble(dictionary::decodeToDouble)
      .toArray();
}
 
Example #26
Source File: VectorizedColumnIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Prepares this iterator for a row group: records on the page iterator whether all pages
 * are dictionary encoded, then installs {@code store} as the page source.
 *
 * @param store page reader passed to {@code super.setPageSource}
 * @param allPagesDictEncoded flag forwarded to the vectorized page iterator
 * @return the value of the {@code dictionary} field after the page source has been set
 */
public Dictionary setRowGroupInfo(PageReader store, boolean allPagesDictEncoded) {
  // setPageSource can result in a data page read. If that happens, we need
  // to know in advance whether all the pages in the row group are dictionary encoded or not
  // (hence the flag is set before the page source is installed).
  this.vectorizedPageIterator.setAllPagesDictEncoded(allPagesDictEncoded);
  super.setPageSource(store);
  return dictionary;
}
 
Example #27
Source File: ArrowVectorAccessors.java    From iceberg with Apache License 2.0 5 votes vote down vote up
DictionaryStringAccessor(IntVector vector, Dictionary dictionary) {
  super(vector);
  this.offsetVector = vector;
  this.decodedDictionary = IntStream.rangeClosed(0, dictionary.getMaxId())
      .mapToObj(dictionary::decodeToBinary)
      .map(binary -> UTF8String.fromBytes(binary.getBytes()))
      .toArray(UTF8String[]::new);
}
 
Example #28
Source File: VectorHolder.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a holder for a vector and its metadata. The column descriptor, vector,
 * nullability holder and type are required; the dictionary is not null-checked.
 *
 * @throws NullPointerException if one of the required arguments is null
 */
public VectorHolder(
    ColumnDescriptor columnDescriptor, FieldVector vector, boolean isDictionaryEncoded,
    Dictionary dictionary, NullabilityHolder holder, Type type) {
  // Each required argument is validated and stored in one step; the checks run in the
  // order columnDescriptor, vector, holder, type.
  this.columnDescriptor = Preconditions.checkNotNull(columnDescriptor, "ColumnDescriptor cannot be null");
  this.vector = Preconditions.checkNotNull(vector, "Vector cannot be null");
  this.nullabilityHolder = Preconditions.checkNotNull(holder, "NullabilityHolder cannot be null");
  this.icebergType = Preconditions.checkNotNull(type, "IcebergType cannot be null");
  this.isDictionaryEncoded = isDictionaryEncoded;
  this.dictionary = dictionary;
}
 
Example #29
Source File: ProtoMessageConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Replaces the cached enum descriptors with one entry per dictionary id
 * (0 through {@code getMaxId()}), each obtained by translating the decoded binary value.
 */
@Override
public void setDictionary(Dictionary dictionary) {
  final int entryCount = dictionary.getMaxId() + 1;
  dict = new  Descriptors.EnumValueDescriptor[entryCount];
  for (int id = 0; id < entryCount; id++) {
    dict[id] = translateEnumValue(dictionary.decodeToBinary(id));
  }
}
 
Example #30
Source File: TupleConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Replaces the cached strings with one entry per dictionary id
 * (0 through {@code getMaxId()}), each decoded from its binary value as UTF-8.
 */
@Override
public void setDictionary(Dictionary dictionary) {
  final int entryCount = dictionary.getMaxId() + 1;
  dict = new String[entryCount];
  for (int id = 0; id < entryCount; id++) {
    final Binary raw = dictionary.decodeToBinary(id);
    dict[id] = raw.toStringUsingUTF8();
  }
}