Java Code Examples for org.apache.parquet.hadoop.ParquetFileReader#getNextDictionaryReader()
The following examples show how to use org.apache.parquet.hadoop.ParquetFileReader#getNextDictionaryReader().
The original project, source file, and license are noted above each example.
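Before the full examples, here is a minimal sketch of the typical calling pattern, assuming a local file named example.parquet and using the first leaf column of the schema purely for illustration: getNextDictionaryReader() hands back a DictionaryPageReadStore for the current row group (or null once all row groups have been visited), and skipNextRowGroup() advances the reader to the next one.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.DictionaryPageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class DictionaryReaderSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder path; substitute any Parquet file you want to inspect.
    Path path = new Path("example.parquet");
    try (ParquetFileReader reader =
             ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
      // For illustration only: inspect the dictionary of the first leaf column.
      ColumnDescriptor column = reader.getFileMetaData().getSchema().getColumns().get(0);
      DictionaryPageReadStore dictionaries;
      int rowGroup = 0;
      while ((dictionaries = reader.getNextDictionaryReader()) != null) {
        // readDictionaryPage returns null if the column is not dictionary-encoded
        // in this row group.
        DictionaryPage page = dictionaries.readDictionaryPage(column);
        System.out.println("row group " + rowGroup + ": "
            + (page == null ? "no dictionary" : page.getDictionarySize() + " dictionary entries"));
        reader.skipNextRowGroup(); // advance to the next row group
        rowGroup += 1;
      }
    }
  }
}

The same iterate-and-skip pattern appears in Example 3 below; Examples 1 and 2 read the dictionary store of a single row group only.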
Example 1
Source File: TestDictionaryRowGroupFilter.java From iceberg with Apache License 2.0
@BeforeClass
public static void createInputFile() throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .build()) {
    GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table"));
    // create 20 copies of each record to ensure dictionary-encoding
    for (int copy = 0; copy < 20; copy += 1) {
      // create 50 records
      for (int i = 0; i < 50; i += 1) {
        builder.set("_id", 30 + i); // min=30, max=79, num-nulls=0
        builder.set("_no_stats", TOO_LONG_FOR_STATS); // value longer than 4k will produce no stats
        builder.set("_required", "req"); // required, always non-null
        builder.set("_all_nulls", null); // never non-null
        builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
        builder.set("_no_nulls", ""); // optional, but always non-null
        builder.set("_non_dict", UUID.randomUUID().toString()); // not dictionary-encoded

        appender.add(builder.build());
      }
    }
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);
  ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(inFile));
  Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
  ROW_GROUP_METADATA = reader.getRowGroups().get(0);
  PARQUET_SCHEMA = reader.getFileMetaData().getSchema();
  DICTIONARY_STORE = reader.getNextDictionaryReader();

  PARQUET_FILE.deleteOnExit();
}
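This test setup writes a file with exactly one row group and calls getNextDictionaryReader() once, caching the returned DictionaryPageReadStore in DICTIONARY_STORE so the row-group filter tests can read the column dictionaries without reopening the file.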
Example 2
Source File: TestDictionaryRowGroupFilter.java From iceberg with Apache License 2.0
@BeforeClass
public static void createInputFile() throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  // build struct field schema
  org.apache.avro.Schema structSchema = AvroSchemaUtil.convert(_structFieldType);

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .build()) {
    GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table"));
    // create 20 copies of each record to ensure dictionary-encoding
    for (int copy = 0; copy < 20; copy += 1) {
      // create 50 records
      for (int i = 0; i < INT_MAX_VALUE - INT_MIN_VALUE + 1; i += 1) {
        builder.set("_id", INT_MIN_VALUE + i); // min=30, max=79, num-nulls=0
        builder.set("_no_stats", TOO_LONG_FOR_STATS); // value longer than 4k will produce no stats
        builder.set("_required", "req"); // required, always non-null
        builder.set("_all_nulls", null); // never non-null
        builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
        builder.set("_no_nulls", ""); // optional, but always non-null
        builder.set("_non_dict", UUID.randomUUID().toString()); // not dictionary-encoded

        Record structNotNull = new Record(structSchema);
        structNotNull.put("_int_field", INT_MIN_VALUE + i);
        builder.set("_struct_not_null", structNotNull); // struct with int

        appender.add(builder.build());
      }
    }
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);
  ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(inFile));
  Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
  rowGroupMetadata = reader.getRowGroups().get(0);
  parquetSchema = reader.getFileMetaData().getSchema();
  dictionaryStore = reader.getNextDictionaryReader();

  PARQUET_FILE.deleteOnExit();
}
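This is a variant of the setup in Example 1 that additionally writes a non-null struct field (_struct_not_null); as before, the dictionary store for the single row group is obtained once via getNextDictionaryReader() and kept for the filter tests.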
Example 3
Source File: ShowDictionaryCommand.java From parquet-mr with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1,
      "Cannot process multiple Parquet files.");

  String source = targets.get(0);

  ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source));
  MessageType schema = reader.getFileMetaData().getSchema();
  ColumnDescriptor descriptor = Util.descriptor(column, schema);
  PrimitiveType type = Util.primitive(column, schema);
  Preconditions.checkNotNull(type);

  DictionaryPageReadStore dictionaryReader;
  int rowGroup = 0;
  while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
    DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);
    Dictionary dict = page.getEncoding().initDictionary(descriptor, page);

    console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column, page.getCompressedSize());
    for (int i = 0; i <= dict.getMaxId(); i += 1) {
      switch (type.getPrimitiveTypeName()) {
        case BINARY:
          if (type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.StringLogicalTypeAnnotation) {
            console.info("{}: {}", String.format("%6d", i),
                Util.humanReadable(dict.decodeToBinary(i).toStringUsingUTF8(), 70));
          } else {
            console.info("{}: {}", String.format("%6d", i),
                Util.humanReadable(dict.decodeToBinary(i).getBytesUnsafe(), 70));
          }
          break;
        case INT32:
          console.info("{}: {}", String.format("%6d", i), dict.decodeToInt(i));
          break;
        case INT64:
          console.info("{}: {}", String.format("%6d", i), dict.decodeToLong(i));
          break;
        case FLOAT:
          console.info("{}: {}", String.format("%6d", i), dict.decodeToFloat(i));
          break;
        case DOUBLE:
          console.info("{}: {}", String.format("%6d", i), dict.decodeToDouble(i));
          break;
        default:
          throw new IllegalArgumentException(
              "Unknown dictionary type: " + type.getPrimitiveTypeName());
      }
    }

    reader.skipNextRowGroup();
    rowGroup += 1;
  }

  console.info("");

  return 0;
}
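Unlike the single-row-group test setups above, this command iterates over every row group: getNextDictionaryReader() returns the store for the current row group (or null when none remain), readDictionaryPage() fetches the dictionary page for the requested column, and skipNextRowGroup() advances the reader without decoding any data pages.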