Java Code Examples for org.apache.parquet.hadoop.ParquetFileReader#getNextDictionaryReader()

The following examples show how to use org.apache.parquet.hadoop.ParquetFileReader#getNextDictionaryReader().
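getNextDictionaryReader() hands back a DictionaryPageReadStore for the row group the reader is currently positioned at, or null once all row groups have been consumed; advancing to the next row group is done separately, e.g. with skipNextRowGroup(), as Example 3 below shows. Here is a minimal sketch of that pattern; the file name "data.parquet" and the choice of the schema's first column are placeholder assumptions, not taken from the examples.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.DictionaryPageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;

public class DictionaryReaderSketch {
  public static void main(String[] args) throws Exception {
    // placeholder input path
    try (ParquetFileReader reader =
        ParquetFileReader.open(new Configuration(), new Path("data.parquet"))) {
      // inspect the first column of the file's schema
      ColumnDescriptor column = reader.getFileMetaData().getSchema().getColumns().get(0);
      DictionaryPageReadStore dictionaries;
      int rowGroup = 0;
      while ((dictionaries = reader.getNextDictionaryReader()) != null) {
        // readDictionaryPage returns null if the column is not
        // dictionary-encoded in this row group
        DictionaryPage page = dictionaries.readDictionaryPage(column);
        System.out.println("row group " + rowGroup + ": "
            + (page == null ? "no dictionary" : page.getDictionarySize() + " entries"));
        reader.skipNextRowGroup(); // advance the row-group cursor
        rowGroup += 1;
      }
    }
  }
}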
Example 1
Source File: TestDictionaryRowGroupFilter.java    From iceberg with Apache License 2.0
@BeforeClass
public static void createInputFile() throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .build()) {
    GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table"));
    // create 20 copies of each record to ensure dictionary-encoding
    for (int copy = 0; copy < 20; copy += 1) {
      // create 50 records
      for (int i = 0; i < 50; i += 1) {
        builder.set("_id", 30 + i); // min=30, max=79, num-nulls=0
        builder.set("_no_stats", TOO_LONG_FOR_STATS); // value longer than 4k will produce no stats
        builder.set("_required", "req"); // required, always non-null
        builder.set("_all_nulls", null); // never non-null
        builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
        builder.set("_no_nulls", ""); // optional, but always non-null
        builder.set("_non_dict", UUID.randomUUID().toString()); // not dictionary-encoded
        appender.add(builder.build());
      }
    }
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);

  ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(inFile));

  Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
  ROW_GROUP_METADATA = reader.getRowGroups().get(0);
  PARQUET_SCHEMA = reader.getFileMetaData().getSchema();
  DICTIONARY_STORE = reader.getNextDictionaryReader();

  PARQUET_FILE.deleteOnExit();
}
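The assertion above guarantees the writer produced a single row group, so this one getNextDictionaryReader() call captures the dictionaries for all of the test data. The reader is left open, presumably so DICTIONARY_STORE can still read dictionary pages from the file on demand.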
 
Example 2
Source File: TestDictionaryRowGroupFilter.java    From iceberg with Apache License 2.0
@BeforeClass
public static void createInputFile() throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  // build struct field schema
  org.apache.avro.Schema structSchema = AvroSchemaUtil.convert(_structFieldType);

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .build()) {
    GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table"));
    // create 20 copies of each record to ensure dictionary-encoding
    for (int copy = 0; copy < 20; copy += 1) {
      // create one record per id in the [INT_MIN_VALUE, INT_MAX_VALUE] range
      for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) {
        builder.set("_id", INT_MIN_VALUE + i); // min=INT_MIN_VALUE, max=INT_MAX_VALUE, num-nulls=0
        builder.set("_no_stats", TOO_LONG_FOR_STATS); // value longer than 4k will produce no stats
        builder.set("_required", "req"); // required, always non-null
        builder.set("_all_nulls", null); // never non-null
        builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
        builder.set("_no_nulls", ""); // optional, but always non-null
        builder.set("_non_dict", UUID.randomUUID().toString()); // not dictionary-encoded

        Record structNotNull = new Record(structSchema);
        structNotNull.put("_int_field", INT_MIN_VALUE + i);
        builder.set("_struct_not_null", structNotNull); // struct with int

        appender.add(builder.build());
      }
    }
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);

  ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(inFile));

  Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
  rowGroupMetadata = reader.getRowGroups().get(0);
  parquetSchema = reader.getFileMetaData().getSchema();
  dictionaryStore = reader.getNextDictionaryReader();

  PARQUET_FILE.deleteOnExit();
}
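This variant follows the same setup as Example 1, but spans the full id range bounded by the INT_MIN_VALUE and INT_MAX_VALUE test constants and adds a nested _struct_not_null column built from the converted Avro struct schema.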
 
Example 3
Source File: ShowDictionaryCommand.java    From parquet-mr with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1,
      "Cannot process multiple Parquet files.");

  String source = targets.get(0);

  ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source));
  MessageType schema = reader.getFileMetaData().getSchema();
  ColumnDescriptor descriptor = Util.descriptor(column, schema);
  PrimitiveType type = Util.primitive(column, schema);
  Preconditions.checkNotNull(type);

  DictionaryPageReadStore dictionaryReader;
  int rowGroup = 0;
  while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
    DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);
    if (page == null) {
      // the column has no dictionary page in this row group
      console.info("\nRow group {} has no dictionary for \"{}\"", rowGroup, column);
      reader.skipNextRowGroup();
      rowGroup += 1;
      continue;
    }

    Dictionary dict = page.getEncoding().initDictionary(descriptor, page);

    console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column);
    for (int i = 0; i <= dict.getMaxId(); i += 1) {
      switch (type.getPrimitiveTypeName()) {
        case BINARY:
          if (type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation) {
            console.info("{}: {}", String.format("%6d", i),
                Util.humanReadable(dict.decodeToBinary(i).toStringUsingUTF8(), 70));
          } else {
            console.info("{}: {}", String.format("%6d", i),
                Util.humanReadable(dict.decodeToBinary(i).getBytesUnsafe(), 70));
          }
          break;
        case INT32:
          console.info("{}: {}", String.format("%6d", i),
              dict.decodeToInt(i));
          break;
        case INT64:
          console.info("{}: {}", String.format("%6d", i),
              dict.decodeToLong(i));
          break;
        case FLOAT:
          console.info("{}: {}", String.format("%6d", i),
              dict.decodeToFloat(i));
          break;
        case DOUBLE:
          console.info("{}: {}", String.format("%6d", i),
              dict.decodeToDouble(i));
          break;
        default:
          throw new IllegalArgumentException(
              "Unknown dictionary type: " + type.getPrimitiveTypeName());
      }
    }

    reader.skipNextRowGroup();

    rowGroup += 1;
  }

  console.info("");

  return 0;
}
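Note the pairing at the end of the loop: getNextDictionaryReader() does not itself advance the reader, so skipNextRowGroup() is called once per iteration to move the row-group cursor forward; otherwise the call would keep returning the same row group's dictionary store.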