Java Code Examples for org.apache.parquet.column.Dictionary#getMaxId()

The following examples show how to use org.apache.parquet.column.Dictionary#getMaxId(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ArrowVectorAccessors.java    From iceberg with Apache License 2.0 5 votes vote down vote up
DictionaryFloatAccessor(IntVector vector, Dictionary dictionary) {
  super(vector);
  this.offsetVector = vector;
  this.decodedDictionary = new float[dictionary.getMaxId() + 1];
  for (int i = 0; i <= dictionary.getMaxId(); i++) {
    decodedDictionary[i] = dictionary.decodeToFloat(i);
  }
}
 
Example 2
Source File: GlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a single-column container holding the sorted, de-duplicated union of all int values
 * found in the given dictionaries and, when present, in an already existing dictionary.
 *
 * @param dictionaries     dictionaries whose ids {@code 0..getMaxId()} are decoded as ints
 * @param existingDict     previously built dictionary container to merge in, or {@code null}
 * @param columnDescriptor column whose path supplies the output field name
 * @param bufferAllocator  allocator backing the returned container
 * @return a container with one 32-bit signed int vector of the merged values
 */
private static VectorContainer buildIntegerGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String columnName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(columnName, true, new ArrowType.Int(32, true), null);
  final VectorContainer container = new VectorContainer(bufferAllocator);
  final IntVector output = container.addOrGet(field);
  output.allocateNew();

  // A sorted set both orders the values and drops duplicates across dictionaries.
  final SortedSet<Integer> merged = Sets.newTreeSet();
  for (Dictionary dictionary : dictionaries) {
    for (int id = 0; id <= dictionary.getMaxId(); ++id) {
      merged.add(dictionary.decodeToInt(id));
    }
  }

  if (existingDict != null) {
    final IntVector previous = existingDict.getValueAccessorById(IntVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.get(row));
    }
  }

  // Write the merged values out in sorted order.
  int written = 0;
  for (Integer value : merged) {
    output.setSafe(written, value);
    written++;
  }

  output.setValueCount(written);
  container.setRecordCount(written);
  container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return container;
}
 
Example 3
Source File: GlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a single-column container holding the sorted, de-duplicated union of all long values
 * found in the given dictionaries and, when present, in an already existing dictionary.
 *
 * @param dictionaries     dictionaries whose ids {@code 0..getMaxId()} are decoded as longs
 * @param existingDict     previously built dictionary container to merge in, or {@code null}
 * @param columnDescriptor column whose path supplies the output field name
 * @param bufferAllocator  allocator backing the returned container
 * @return a container with one 64-bit signed int vector of the merged values
 */
private static VectorContainer buildLongGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String columnName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(columnName, true, new ArrowType.Int(64, true), null);
  final VectorContainer container = new VectorContainer(bufferAllocator);
  final BigIntVector output = container.addOrGet(field);
  output.allocateNew();

  // A sorted set both orders the values and drops duplicates across dictionaries.
  final SortedSet<Long> merged = Sets.newTreeSet();
  for (Dictionary dictionary : dictionaries) {
    for (int id = 0; id <= dictionary.getMaxId(); ++id) {
      merged.add(dictionary.decodeToLong(id));
    }
  }

  if (existingDict != null) {
    final BigIntVector previous = existingDict.getValueAccessorById(BigIntVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.get(row));
    }
  }

  // Write the merged values out in sorted order.
  int written = 0;
  for (Long value : merged) {
    output.setSafe(written, value);
    written++;
  }

  output.setValueCount(written);
  container.setRecordCount(written);
  container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return container;
}
 
Example 4
Source File: GlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a single-column container holding the sorted, de-duplicated union of all double values
 * found in the given dictionaries and, when present, in an already existing dictionary.
 *
 * @param dictionaries     dictionaries whose ids {@code 0..getMaxId()} are decoded as doubles
 * @param existingDict     previously built dictionary container to merge in, or {@code null}
 * @param columnDescriptor column whose path supplies the output field name
 * @param bufferAllocator  allocator backing the returned container
 * @return a container with one double-precision floating point vector of the merged values
 */
private static VectorContainer buildDoubleGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String columnName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(columnName, true, new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE), null);
  final VectorContainer container = new VectorContainer(bufferAllocator);
  final Float8Vector output = container.addOrGet(field);
  output.allocateNew();

  // A sorted set both orders the values and drops duplicates across dictionaries.
  SortedSet<Double> merged = Sets.newTreeSet();
  for (Dictionary dictionary : dictionaries) {
    for (int id = 0; id <= dictionary.getMaxId(); ++id) {
      merged.add(dictionary.decodeToDouble(id));
    }
  }

  if (existingDict != null) {
    final Float8Vector previous = existingDict.getValueAccessorById(Float8Vector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.get(row));
    }
  }

  // Write the merged values out in sorted order.
  int written = 0;
  for (Double value : merged) {
    output.setSafe(written, value);
    written++;
  }

  output.setValueCount(written);
  container.setRecordCount(written);
  container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return container;
}
 
Example 5
Source File: GlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a single-column container holding the sorted, de-duplicated union of all float values
 * found in the given dictionaries and, when present, in an already existing dictionary.
 *
 * @param dictionaries     dictionaries whose ids {@code 0..getMaxId()} are decoded as floats
 * @param existingDict     previously built dictionary container to merge in, or {@code null}
 * @param columnDescriptor column whose path supplies the output field name
 * @param bufferAllocator  allocator backing the returned container
 * @return a container with one single-precision floating point vector of the merged values
 */
private static VectorContainer buildFloatGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String columnName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(columnName, true, new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE), null);
  final VectorContainer container = new VectorContainer(bufferAllocator);
  final Float4Vector output = container.addOrGet(field);
  output.allocateNew();

  // A sorted set both orders the values and drops duplicates across dictionaries.
  SortedSet<Float> merged = Sets.newTreeSet();
  for (Dictionary dictionary : dictionaries) {
    for (int id = 0; id <= dictionary.getMaxId(); ++id) {
      merged.add(dictionary.decodeToFloat(id));
    }
  }

  if (existingDict != null) {
    final Float4Vector previous = existingDict.getValueAccessorById(Float4Vector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(previous.get(row));
    }
  }

  // Write the merged values out in sorted order.
  int written = 0;
  for (Float value : merged) {
    output.setSafe(written, value);
    written++;
  }

  output.setValueCount(written);
  container.setRecordCount(written);
  container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return container;
}
 
Example 6
Source File: GlobalDictionaryBuilder.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a single-column container holding the sorted, de-duplicated union of all binary values
 * found in the given dictionaries and, when present, in an already existing dictionary.
 *
 * @param dictionaries     dictionaries whose ids {@code 0..getMaxId()} are decoded as binaries
 * @param existingDict     previously built dictionary container to merge in, or {@code null}
 * @param columnDescriptor column whose path supplies the output field name
 * @param bufferAllocator  allocator backing the returned container
 * @return a container with one variable-width binary vector of the merged values
 */
private static VectorContainer buildBinaryGlobalDictionary(List<Dictionary> dictionaries, VectorContainer existingDict, ColumnDescriptor columnDescriptor, BufferAllocator bufferAllocator) {
  final String columnName = SchemaPath.getCompoundPath(columnDescriptor.getPath()).getAsUnescapedPath();
  final Field field = new Field(columnName, true, new ArrowType.Binary(), null);
  final VectorContainer container = new VectorContainer(bufferAllocator);
  final VarBinaryVector output = container.addOrGet(field);
  output.allocateNew();

  // A sorted set both orders the values and drops duplicates across dictionaries.
  final SortedSet<Binary> merged = new TreeSet<>();
  for (Dictionary dictionary : dictionaries) {
    for (int id = 0; id <= dictionary.getMaxId(); ++id) {
      merged.add(dictionary.decodeToBinary(id));
    }
  }

  if (existingDict != null) {
    final VarBinaryVector previous = existingDict.getValueAccessorById(VarBinaryVector.class, 0).getValueVector();
    for (int row = 0; row < existingDict.getRecordCount(); ++row) {
      merged.add(Binary.fromConstantByteArray(previous.get(row)));
    }
  }

  // Write the merged values out in sorted order.
  int written = 0;
  for (Binary value : merged) {
    final byte[] bytes = value.getBytes();
    output.setSafe(written, bytes, 0, bytes.length);
    written++;
  }

  output.setValueCount(written);
  container.setRecordCount(written);
  container.buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return container;
}
 
Example 7
Source File: LocalDictionariesReader.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Prints every entry of a column's local dictionary to standard output, one
 * {@code id: value} line per entry, decoding each value according to the column's type.
 * Entries of types without a matching case are silently skipped.
 *
 * @param columnDescriptor column the dictionary belongs to; its type selects the decoder
 * @param localDictionary  dictionary whose ids {@code 0..getMaxId()} (inclusive) are printed
 */
public static void printDictionary(ColumnDescriptor columnDescriptor, Dictionary localDictionary) {
  System.out.println("Dictionary for column " + columnDescriptor.toString());
  // getMaxId() is the highest valid id, so the bound must be inclusive;
  // a strict '<' here would drop the final dictionary entry.
  for (int i = 0; i <= localDictionary.getMaxId(); ++i) {
    switch (columnDescriptor.getType()) {
      case INT32:
        System.out.println(format("%d: %d", i, localDictionary.decodeToInt(i)));
        break;
      case INT64:
        System.out.println(format("%d: %d", i, localDictionary.decodeToLong(i)));
        break;
      case INT96:
      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
        // NOTE(review): new String(byte[]) uses the platform default charset; confirm whether
        // an explicit charset (e.g. UTF-8) is wanted here.
        System.out.println(format("%d: %s", i, new String(localDictionary.decodeToBinary(i).getBytesUnsafe())));
        break;
      case FLOAT:
        System.out.println(format("%d: %f", i, localDictionary.decodeToFloat(i)));
        break;
      case DOUBLE:
        System.out.println(format("%d: %f", i, localDictionary.decodeToDouble(i)));
        break;
      case BOOLEAN:
        System.out.println(format("%d: %b", i, localDictionary.decodeToBoolean(i)));
        break;
      default:
        break;
    }
  }
}
 
Example 8
Source File: ProtoMessageConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Pre-translates every dictionary entry into its enum value descriptor so later lookups
 * can be served by dictionary id.
 *
 * @param dictionary dictionary whose ids {@code 0..getMaxId()} (inclusive) are translated
 */
@Override
public void setDictionary(Dictionary dictionary) {
  // One slot per dictionary id, ids running from 0 to getMaxId() inclusive.
  dict = new Descriptors.EnumValueDescriptor[dictionary.getMaxId() + 1];
  int id = 0;
  while (id <= dictionary.getMaxId()) {
    dict[id] = translateEnumValue(dictionary.decodeToBinary(id));
    id++;
  }
}
 
Example 9
Source File: TupleConverter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Pre-decodes every dictionary entry into a UTF-8 string, indexed by dictionary id.
 *
 * @param dictionary dictionary whose ids {@code 0..getMaxId()} (inclusive) are decoded
 */
@Override
public void setDictionary(Dictionary dictionary) {
  // One slot per dictionary id, ids running from 0 to getMaxId() inclusive.
  dict = new String[dictionary.getMaxId() + 1];
  int id = 0;
  while (id <= dictionary.getMaxId()) {
    final Binary raw = dictionary.decodeToBinary(id);
    dict[id] = raw.toStringUsingUTF8();
    id++;
  }
}
 
Example 10
Source File: AvroConverters.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Pre-converts every dictionary entry with {@code convert}, indexed by dictionary id.
 *
 * @param dictionary dictionary whose ids {@code 0..getMaxId()} (inclusive) are converted
 */
@Override
@SuppressWarnings("unchecked")
public void setDictionary(Dictionary dictionary) {
  // Generic arrays cannot be created directly, hence the Object[] cast.
  dict = (T[]) new Object[dictionary.getMaxId() + 1];
  int id = 0;
  while (id <= dictionary.getMaxId()) {
    dict[id] = convert(dictionary.decodeToBinary(id));
    id++;
  }
}
 
Example 11
Source File: ArrowVectorAccessors.java    From iceberg with Apache License 2.0 4 votes vote down vote up
private DictionaryDecimalAccessor(IntVector vector, Dictionary dictionary) {
  super(vector);
  this.offsetVector = vector;
  this.parquetDictionary = dictionary;
  this.cache = new Decimal[dictionary.getMaxId() + 1];
}
 
Example 12
Source File: DictionaryFilter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the dictionary page of a column chunk and returns all of its decoded values as a set.
 *
 * @param meta metadata of the column chunk whose dictionary should be expanded
 * @return the set of dictionary values, or {@code null} when the chunk has no dictionary page
 *         or its primitive type is not handled here
 * @throws IOException declared for failures while reading the dictionary page
 */
@SuppressWarnings("unchecked")
private <T extends Comparable<T>> Set<T> expandDictionary(ColumnChunkMetaData meta) throws IOException {
  ColumnDescriptor descriptor = new ColumnDescriptor(meta.getPath().toArray(), meta.getPrimitiveType(), -1, -1);
  DictionaryPage dictionaryPage = dictionaries.readDictionaryPage(descriptor);

  // No dictionary page: the chunk may not be dictionary-encoded.
  if (dictionaryPage == null) {
    return null;
  }

  Dictionary dictionary = dictionaryPage.getEncoding().initDictionary(descriptor, dictionaryPage);
  PrimitiveTypeName typeName = meta.getPrimitiveType().getPrimitiveTypeName();

  // Pick the decode method that matches the column's primitive type.
  IntFunction<Object> decoder;
  switch (typeName) {
  case INT32:
    decoder = dictionary::decodeToInt;
    break;
  case INT64:
    decoder = dictionary::decodeToLong;
    break;
  case FLOAT:
    decoder = dictionary::decodeToFloat;
    break;
  case DOUBLE:
    decoder = dictionary::decodeToDouble;
    break;
  case BINARY:
  case FIXED_LEN_BYTE_ARRAY: // decoded the same way as BINARY
    decoder = dictionary::decodeToBinary;
    break;
  default:
    LOG.warn("Unsupported dictionary type: {}", typeName);
    return null;
  }

  Set<T> expanded = new HashSet<>();
  int id = 0;
  while (id <= dictionary.getMaxId()) {
    expanded.add((T) decoder.apply(id));
    id++;
  }

  return expanded;
}
 
Example 13
Source File: ShowDictionaryCommand.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Prints the dictionary of the selected column for every row group of a single Parquet file.
 * Row groups that have no dictionary page for the column are reported and skipped.
 *
 * @return {@code 0} on success
 * @throws IOException              if the file cannot be opened or read
 * @throws IllegalArgumentException if zero or more than one target is given, or the column's
 *                                  primitive type has no dictionary printer here
 */
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1,
      "Cannot process multiple Parquet files.");

  String source = targets.get(0);

  // try-with-resources: the reader holds an open file and must be closed on every path,
  // including when a decode or precondition below throws.
  try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
    MessageType schema = reader.getFileMetaData().getSchema();
    ColumnDescriptor descriptor = Util.descriptor(column, schema);
    PrimitiveType type = Util.primitive(column, schema);
    Preconditions.checkNotNull(type);

    DictionaryPageReadStore dictionaryReader;
    int rowGroup = 0;
    while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
      DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);

      if (page == null) {
        // The column chunk in this row group is not dictionary-encoded.
        console.info("\nRow group {} has no dictionary for \"{}\"", rowGroup, column);
      } else {
        Dictionary dict = page.getEncoding().initDictionary(descriptor, page);

        console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column);
        for (int i = 0; i <= dict.getMaxId(); i += 1) {
          switch(type.getPrimitiveTypeName()) {
            case BINARY:
              if (type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation) {
                console.info("{}: {}", String.format("%6d", i),
                    Util.humanReadable(dict.decodeToBinary(i).toStringUsingUTF8(), 70));
              } else {
                console.info("{}: {}", String.format("%6d", i),
                    Util.humanReadable(dict.decodeToBinary(i).getBytesUnsafe(), 70));
              }
              break;
            case INT32:
              console.info("{}: {}", String.format("%6d", i),
                dict.decodeToInt(i));
              break;
            case INT64:
              console.info("{}: {}", String.format("%6d", i),
                  dict.decodeToLong(i));
              break;
            case FLOAT:
              console.info("{}: {}", String.format("%6d", i),
                  dict.decodeToFloat(i));
              break;
            case DOUBLE:
              console.info("{}: {}", String.format("%6d", i),
                  dict.decodeToDouble(i));
              break;
            default:
              throw new IllegalArgumentException(
                  "Unknown dictionary type: " + type.getPrimitiveTypeName());
          }
        }
      }

      reader.skipNextRowGroup();

      rowGroup += 1;
    }
  }

  console.info("");

  return 0;
}