Java Code Examples for org.apache.parquet.schema.MessageType

The following examples show how to use org.apache.parquet.schema.MessageType. These examples are extracted from open source projects; the originating project, source file, and license are noted above each example.
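Before diving in, here is a minimal, self-contained sketch (the class name MessageTypeSketch and the field names are ours, not taken from any project below) of the three ways the examples below typically obtain a MessageType: parsing a schema string, building one fluently with Types, and constructing one directly from field types.

import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static org.apache.parquet.schema.Type.Repetition.REQUIRED;

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;

public class MessageTypeSketch {
  public static void main(String[] args) {
    // 1. Parse a schema from its textual representation.
    MessageType parsed = MessageTypeParser.parseMessageType(
        "message Example { required binary name (UTF8); required int32 id; }");

    // 2. Build the same schema with the fluent Types builder.
    MessageType built = Types.buildMessage()
        .required(BINARY).as(OriginalType.UTF8).named("name")
        .required(INT32).named("id")
        .named("Example");

    // 3. Construct it directly from field types.
    MessageType constructed = new MessageType("Example",
        new PrimitiveType(REQUIRED, BINARY, "name", OriginalType.UTF8),
        new PrimitiveType(REQUIRED, INT32, "id"));

    // All three are equal, and a schema round-trips through toString() and the parser.
    System.out.println(parsed.equals(built) && built.equals(constructed));
    System.out.println(MessageTypeParser.parseMessageType(parsed.toString()).equals(parsed));
  }
}

Several of the tests below (Examples 1 and 7 in particular) verify exactly this equality and round-trip behavior.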
Example 1
Source Project: parquet-mr   Source File: TestParquetParser.java    License: Apache License 2.0
@Test
public void testEmbeddedAnnotations() {
  String message = "message EmbeddedMessage {" +
      "  required binary json (JSON);" +
      "  required binary bson (BSON);" +
      "}\n";

  MessageType parsed = MessageTypeParser.parseMessageType(message);
  MessageType expected = Types.buildMessage()
      .required(BINARY).as(JSON).named("json")
      .required(BINARY).as(BSON).named("bson")
      .named("EmbeddedMessage");

  assertEquals(expected, parsed);
  MessageType reparsed = MessageTypeParser.parseMessageType(parsed.toString());
  assertEquals(expected, reparsed);
}
 
Example 2
Source Project: pxf   Source File: ParquetResolverTest.java    License: Apache License 2.0
private MessageType getParquetSchemaForPrimitiveTypes(Type.Repetition repetition, boolean readCase) {
    List<Type> fields = new ArrayList<>();

    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "s1", OriginalType.UTF8));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "s2", OriginalType.UTF8));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT32, "n1", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.DOUBLE, "d1", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, 16, "dc1", OriginalType.DECIMAL, new DecimalMetadata(38, 18), null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT96, "tm", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.FLOAT, "f", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT64, "bg", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BOOLEAN, "b", null));

    // GPDB has an int16 type but no int8, so on the write path tiny numbers (int8) are still
    // treated as shorts (int16)
    OriginalType tinyType = readCase ? OriginalType.INT_8 : OriginalType.INT_16;
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT32, "tn", tinyType));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT32, "sml", OriginalType.INT_16));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "vc1", OriginalType.UTF8));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "c1", OriginalType.UTF8));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "bin", null));

    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT96, "tmtz", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT96, "tmtz2", null));

    return new MessageType("hive_schema", fields);
}
 
Example 3
Source Project: parquet-mr   Source File: ParquetFileWriter.java    License: Apache License 2.0
/**
 * FOR TESTING ONLY. This supports testing block padding behavior on the local FS.
 *
 * @param configuration Hadoop configuration
 * @param schema the schema of the data
 * @param file the file to write to
 * @param rowAndBlockSize the row group size
 * @param maxPaddingSize the maximum padding
 * @throws IOException if the file can not be created
 */
ParquetFileWriter(Configuration configuration, MessageType schema,
                  Path file, long rowAndBlockSize, int maxPaddingSize)
    throws IOException {
  FileSystem fs = file.getFileSystem(configuration);
  this.schema = schema;
  this.alignment = PaddingAlignment.get(
      rowAndBlockSize, rowAndBlockSize, maxPaddingSize);
  this.out = HadoopStreams.wrap(
      fs.create(file, true, 8192, fs.getDefaultReplication(file), rowAndBlockSize));
  this.encodingStatsBuilder = new EncodingStats.Builder();
  // no truncation is needed for testing
  this.columnIndexTruncateLength = Integer.MAX_VALUE;
  this.pageWriteChecksumEnabled = ParquetOutputFormat.getPageWriteChecksumEnabled(configuration);
  this.crc = pageWriteChecksumEnabled ? new CRC32() : null;
  this.metadataConverter = new ParquetMetadataConverter(ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH);
}
 
Example 4
Source Project: parquet-mr   Source File: TestParquetFileWriter.java    License: Apache License 2.0
/**
 * {@link ParquetFileWriter#mergeFooters(Path, List)} expects a fully-qualified
 * path for the root and crashes if a relative one is provided.
 */
@Test
public void testWriteMetadataFileWithRelativeOutputPath() throws IOException {
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path relativeRoot = new Path("target/_test_relative");
  Path qualifiedRoot = fs.makeQualified(relativeRoot);

  ParquetMetadata mock = Mockito.mock(ParquetMetadata.class);
  FileMetaData fileMetaData = new FileMetaData(
          new MessageType("root1",
              new PrimitiveType(REPEATED, BINARY, "a")),
          new HashMap<String, String>(), "test");
  Mockito.when(mock.getFileMetaData()).thenReturn(fileMetaData);

  List<Footer> footers = new ArrayList<Footer>();
  Footer footer = new Footer(new Path(qualifiedRoot, "one"), mock);
  footers.add(footer);

  // This should not throw an exception
  ParquetFileWriter.writeMetadataFile(conf, relativeRoot, footers, JobSummaryLevel.ALL);
}
 
Example 5
Source Project: parquet-mr   Source File: DumpCommand.java    License: Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
    super.execute(options);

    String[] args = options.getArgs();
    String input = args[0];

    Configuration conf = new Configuration();
    Path inpath = new Path(input);

    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath, NO_FILTER);
    MessageType schema = metaData.getFileMetaData().getSchema();

    boolean showmd = !options.hasOption('m');
    boolean showdt = !options.hasOption('d');
    boolean cropoutput = !options.hasOption('n');

    Set<String> showColumns = null;
    if (options.hasOption('c')) {
        String[] cols = options.getOptionValues('c');
        showColumns = new HashSet<String>(Arrays.asList(cols));
    }

    PrettyPrintWriter out = prettyPrintWriter(cropoutput);
    dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
 
Example 6
Source Project: parquet-mr   Source File: TestColumnIO.java    License: Apache License 2.0
@Test
public void testReadUsingProjectedSchema() {
  MessageType originalSchema = new MessageType("schema",
          new PrimitiveType(REQUIRED, INT32, "a"),
          new PrimitiveType(REQUIRED, INT32, "b")
  );
  MessageType projectedSchema = new MessageType("schema",
          new PrimitiveType(OPTIONAL, INT32, "b")
  );
  MemPageStore store = new MemPageStore(1);
  SimpleGroupFactory groupFactory = new SimpleGroupFactory(originalSchema);
  writeGroups(originalSchema, store, groupFactory.newGroup().append("a", 1).append("b", 2));

  {
    List<Group> groups = new ArrayList<>();
    groups.addAll(readGroups(store, originalSchema, projectedSchema, 1));
    Object[][] expected = {
            {2},
    };
    validateGroups(groups, expected);
  }
}
 
Example 7
Source Project: parquet-mr   Source File: TestParquetParser.java    License: Apache License 2.0
@Test
public void testIDs() {
  String message =
      "message Message {\n" +
      "  required binary string (UTF8) = 6;\n" +
      "  required int32 i=1;\n" +
      "  required binary s2= 3;\n" +
      "  required binary s3 =4;\n" +
      "}\n";

  MessageType parsed = parseMessageType(message);
  MessageType expected = buildMessage()
      .required(BINARY).as(OriginalType.UTF8).id(6).named("string")
      .required(INT32).id(1).named("i")
      .required(BINARY).id(3).named("s2")
      .required(BINARY).id(4).named("s3")
      .named("Message");

  assertEquals(expected, parsed);
  MessageType reparsed = parseMessageType(parsed.toString());
  assertEquals(expected, reparsed);
}
 
Example 8
Source Project: flink   Source File: ParquetRowInputFormatTest.java    License: Apache License 2.0
@Test
public void testReadRowFromSimpleRecord() throws IOException {
	Tuple3<Class<? extends SpecificRecord>, SpecificRecord, Row> simple = TestUtil.getSimpleRecordTestData();
	Path path = TestUtil.createTempParquetFile(
		tempRoot.getRoot(), TestUtil.SIMPLE_SCHEMA, Arrays.asList(simple.f1, simple.f1));
	MessageType simpleType = SCHEMA_CONVERTER.convert(TestUtil.SIMPLE_SCHEMA);

	ParquetRowInputFormat inputFormat = new ParquetRowInputFormat(path, simpleType);
	inputFormat.setRuntimeContext(TestUtil.getMockRuntimeContext());

	FileInputSplit[] splits = inputFormat.createInputSplits(1);
	assertEquals(1, splits.length);
	inputFormat.open(splits[0]);

	Row row = inputFormat.nextRecord(null);
	assertNotNull(row);
	assertEquals(simple.f2, row);

	row = inputFormat.nextRecord(null);
	assertNotNull(row);
	assertEquals(simple.f2, row);

	assertTrue(inputFormat.reachedEnd());
}
 
Example 9
Source Project: iceberg   Source File: ParquetMetricsRowGroupFilter.java    License: Apache License 2.0
private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, ParquetConversions.converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visitEvaluator(expr, this);
}
 
Example 10
Source Project: flink   Source File: ParquetMapInputFormatTest.java    License: Apache License 2.0
@Test
@SuppressWarnings("unchecked")
public void testProjectedReadMapFromNestedRecord() throws IOException {
	Tuple3<Class<? extends SpecificRecord>, SpecificRecord, Row> nested = TestUtil.getNestedRecordTestData();
	Path path = TestUtil.createTempParquetFile(tempRoot.getRoot(), TestUtil.NESTED_SCHEMA, Collections.singletonList(nested.f1));
	MessageType nestedType = SCHEMA_CONVERTER.convert(TestUtil.NESTED_SCHEMA);
	ParquetMapInputFormat inputFormat = new ParquetMapInputFormat(path, nestedType);

	inputFormat.selectFields(Collections.singletonList("nestedMap").toArray(new String[0]));
	inputFormat.setRuntimeContext(TestUtil.getMockRuntimeContext());

	FileInputSplit[] splits = inputFormat.createInputSplits(1);
	assertEquals(1, splits.length);
	inputFormat.open(splits[0]);

	Map map = inputFormat.nextRecord(null);
	assertNotNull(map);
	assertEquals(1, map.size());

	Map<String, String> mapItem = (Map<String, String>) ((Map) map.get("nestedMap")).get("mapItem");
	assertEquals(2, mapItem.size());
	assertEquals("map", mapItem.get("type"));
	assertEquals("hashMap", mapItem.get("value"));
}
 
Example 11
Source Project: parquet-mr   Source File: ParquetRecordWriter.java    License: Apache License 2.0
/**
 *
 * @param w the file to write to
 * @param writeSupport the class to convert incoming records
 * @param schema the schema of the records
 * @param extraMetaData extra meta data to write in the footer of the file
 * @param blockSize the size of a block in the file (this will be approximate)
 * @param codec the compression codec used to compress the pages
 * @param validating if schema validation should be turned on
 * @param props parquet encoding properties
 */
ParquetRecordWriter(
    ParquetFileWriter w,
    WriteSupport<T> writeSupport,
    MessageType schema,
    Map<String, String> extraMetaData,
    long blockSize,
    CompressionCodecName codec,
    boolean validating,
    ParquetProperties props,
    MemoryManager memoryManager,
    Configuration conf) {
  this.codecFactory = new CodecFactory(conf, props.getPageSizeThreshold());
  internalWriter = new InternalParquetRecordWriter<T>(w, writeSupport, schema,
      extraMetaData, blockSize, codecFactory.getCompressor(codec), validating,
      props);
  this.memoryManager = Objects.requireNonNull(memoryManager, "memoryManager cannot be null");
  memoryManager.addWriter(internalWriter, blockSize);
}
 
Example 12
Source Project: parquet-mr   Source File: TestThriftToParquetFileWriter.java    License: Apache License 2.0
private ParquetReader<Group> createRecordReader(Path parquetFilePath) throws IOException {
  Configuration configuration = new Configuration(true);

  GroupReadSupport readSupport = new GroupReadSupport();
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, parquetFilePath);
  MessageType schema = readFooter.getFileMetaData().getSchema();

  readSupport.init(configuration, null, schema);
  return new ParquetReader<Group>(parquetFilePath, readSupport);
}
 
Example 13
Source Project: parquet-mr   Source File: GlobalMetaData.java    License: Apache License 2.0
/**
 * @param schema the union of the schemas for all the files
 * @param keyValueMetaData the merged app specific metadata
 * @param createdBy the descriptions of the libraries that created the merged files
 *
 * @throws NullPointerException if schema or keyValueMetaData is {@code null}
 */
public GlobalMetaData(MessageType schema, Map<String, Set<String>> keyValueMetaData, Set<String> createdBy) {
  super();
  this.schema = Objects.requireNonNull(schema, "schema cannot be null");
  this.keyValueMetaData = unmodifiableMap(Objects
      .requireNonNull(keyValueMetaData, "keyValueMetaData cannot be null"));
  this.createdBy = createdBy;
}
 
Example 14
Source Project: flink   Source File: ParquetPojoInputFormat.java    License: Apache License 2.0
public ParquetPojoInputFormat(Path filePath, MessageType messageType, PojoTypeInfo<E> pojoTypeInfo) {
	super(filePath, messageType);
	this.pojoTypeClass = pojoTypeInfo.getTypeClass();
	this.typeSerializer = pojoTypeInfo.createSerializer(new ExecutionConfig());
	final Map<String, Field> fieldMap = new HashMap<>();
	findAllFields(pojoTypeClass, fieldMap);
	selectFields(fieldMap.keySet().toArray(new String[0]));
}
 
Example 15
Source Project: Bats   Source File: ParquetRecordWriter.java    License: Apache License 2.0
private void newSchema() throws IOException {
  List<Type> types = Lists.newArrayList();
  for (MaterializedField field : batchSchema) {
    if (field.getName().equalsIgnoreCase(WriterPrel.PARTITION_COMPARATOR_FIELD)) {
      continue;
    }
    types.add(getType(field));
  }
  schema = new MessageType("root", types);

  // We don't want this number to be too small; ideally the block is divided equally across the
  // columns, though it is unlikely that all columns will be the same size.
  // Although rowGroupSize is a long, its value should stay below Integer.MAX_VALUE (2GB), so the
  // result is cast to int, because the underlying byte-array allocation limits array sizes to the
  // int range.
  int initialBlockBufferSize = this.schema.getColumns().size() > 0 ?
      max(MINIMUM_BUFFER_SIZE, blockSize / this.schema.getColumns().size() / 5) : MINIMUM_BUFFER_SIZE;
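  // Worked example (illustrative numbers, not from this code): a 512 MB block with 10 columns
  // gives max(MINIMUM_BUFFER_SIZE, 512 MB / 10 / 5), i.e. roughly 10 MB of initial buffer per column.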
  // We don't want this number to be too small either. Ideally, slightly bigger than the page size,
  // but not bigger than the block buffer
  int initialPageBufferSize = max(MINIMUM_BUFFER_SIZE, min(pageSize + pageSize / 10, initialBlockBufferSize));
  // TODO: Use initialSlabSize from ParquetProperties once Drill is updated to the latest version of the Parquet library
  int initialSlabSize = CapacityByteArrayOutputStream.initialSlabSizeHeuristic(64, pageSize, 10);
  // TODO: Replace ParquetColumnChunkPageWriteStore with ColumnChunkPageWriteStore from the parquet library
  // once PARQUET-1006 is resolved
  pageStore = new ParquetColumnChunkPageWriteStore(codecFactory.getCompressor(codec), schema, initialSlabSize,
      pageSize, new ParquetDirectByteBufferAllocator(oContext));
  ParquetProperties parquetProperties = ParquetProperties.builder()
      .withPageSize(pageSize)
      .withDictionaryEncoding(enableDictionary)
      .withDictionaryPageSize(initialPageBufferSize)
      .withWriterVersion(writerVersion)
      .withAllocator(new ParquetDirectByteBufferAllocator(oContext))
      .withValuesWriterFactory(new DefaultV1ValuesWriterFactory())
      .build();
  store = new ColumnWriteStoreV1(pageStore, parquetProperties);
  MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(this.schema);
  consumer = columnIO.getRecordWriter(store);
  setUp(schema, consumer);
}
 
Example 16
Source Project: flink   Source File: ParquetTableSourceTest.java    License: Apache License 2.0
@Test
public void testGetReturnType() {
	MessageType nestedSchema = SCHEMA_CONVERTER.convert(TestUtil.NESTED_SCHEMA);
	ParquetTableSource parquetTableSource = ParquetTableSource.builder()
		.path("dummy-path")
		.forParquetSchema(nestedSchema)
		.build();

	TypeInformation<Row> returnType = parquetTableSource.getReturnType();
	assertNotNull(returnType);
	assertTrue(returnType instanceof RowTypeInfo);
	RowTypeInfo rowType = (RowTypeInfo) returnType;
	assertEquals(NESTED_ROW_TYPE, rowType);
}
 
Example 17
Source Project: presto   Source File: ParquetRecordWriterUtil.java    License: Apache License 2.0
private static RecordWriter createParquetWriter(Path target, JobConf conf, Properties properties)
        throws IOException
{
    if (conf.get(DataWritableWriteSupport.PARQUET_HIVE_SCHEMA) == null) {
        List<String> columnNames = Splitter.on(',').splitToList(properties.getProperty(IOConstants.COLUMNS));
        List<TypeInfo> columnTypes = getTypeInfosFromTypeString(properties.getProperty(IOConstants.COLUMNS_TYPES));
        MessageType schema = HiveSchemaConverter.convert(columnNames, columnTypes);
        setParquetSchema(conf, schema);
    }

    ParquetOutputFormat<ParquetHiveRecord> outputFormat = new ParquetOutputFormat<>(new DataWritableWriteSupport());

    return new ParquetRecordWriterWrapper(outputFormat, conf, target.toString(), Reporter.NULL, properties);
}
 
Example 18
Source Project: parquet-mr   Source File: CompressionConveterTest.java    License: Apache License 2.0
private String createParquetFile(Configuration conf, Map<String, String> extraMeta, int numRecord, String prefix, String codec,
                                       ParquetProperties.WriterVersion writerVersion, int pageSize, TestDocs testDocs) throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, BINARY, "Backward"),
      new PrimitiveType(REPEATED, BINARY, "Forward")));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file))
    .withConf(conf)
    .withWriterVersion(writerVersion)
    .withExtraMetaData(extraMeta)
    .withDictionaryEncoding("DocId", true)
    .withValidation(true)
    .enablePageWriteChecksum()
    .withPageSize(pageSize)
    .withCompressionCodec(CompressionCodecName.valueOf(codec));
  try (ParquetWriter writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", testDocs.docId[i]);
      g.add("Name", testDocs.name[i]);
      g.add("Gender", testDocs.gender[i]);
      Group links = g.addGroup("Links");
      links.add(0, testDocs.linkBackward[i]);
      links.add(1, testDocs.linkForward[i]);
      writer.write(g);
    }
  }

  return file;
}
 
Example 19
Source Project: presto   Source File: TestParquetPredicateUtils.java    License: Apache License 2.0
@Test
public void testParquetTupleDomainPrimitiveArray()
{
    HiveColumnHandle columnHandle = createBaseColumn("my_array", 0, HiveType.valueOf("array<int>"), new ArrayType(INTEGER), REGULAR, Optional.empty());
    TupleDomain<HiveColumnHandle> domain = withColumnDomains(ImmutableMap.of(columnHandle, Domain.notNull(new ArrayType(INTEGER))));

    MessageType fileSchema = new MessageType("hive_schema",
            new GroupType(OPTIONAL, "my_array",
                    new GroupType(REPEATED, "bag", new PrimitiveType(OPTIONAL, INT32, "array_element"))));

    Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
    TupleDomain<ColumnDescriptor> tupleDomain = getParquetTupleDomain(descriptorsByPath, domain, fileSchema, true);
    assertTrue(tupleDomain.isAll());
}
 
Example 20
Source Project: parquet-mr   Source File: TestJsonRecordFormatter.java    License: Apache License 2.0
@Test
public void testFlatSchemaWithArrays() throws Exception {
  SimpleRecord simple = new SimpleRecord();
  MessageType schema = new MessageType("schema",
    new PrimitiveType(Type.Repetition.REQUIRED, PrimitiveType.PrimitiveTypeName.BINARY, "reqd"),
    new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.DOUBLE, "opt"),
    new PrimitiveType(Type.Repetition.REPEATED, PrimitiveType.PrimitiveTypeName.INT32, "odd"),
    new PrimitiveType(Type.Repetition.REPEATED, PrimitiveType.PrimitiveTypeName.INT64, "even")
  );

  simple.values.add(kv("reqd", "a required value"));
  simple.values.add(kv("opt", 1.2345));

  simple.values.add(kv("odd", 1));
  simple.values.add(kv("odd", 3));
  simple.values.add(kv("odd", 5));
  simple.values.add(kv("odd", 7));
  simple.values.add(kv("odd", 9));

  simple.values.add(kv("even", 2));
  simple.values.add(kv("even", 4));
  simple.values.add(kv("even", 6));
  simple.values.add(kv("even", 8));
  simple.values.add(kv("even", 10));

  String expected = asJsonString(
    obj(
      entry("reqd", "a required value"),
      entry("opt", 1.2345),
      entry("odd", array(1, 3, 5, 7, 9)),
      entry("even", array(2, 4, 6, 8, 10))
    )
  );

  String actual = JsonRecordFormatter
    .fromSchema(schema)
    .formatRecord(simple);

  assertEquals(expected, actual);
}
 
Example 21
Source Project: flink   Source File: ParquetRecordReaderTest.java    License: Apache License 2.0
@Test
public void testMapGroup() throws IOException {
	Preconditions.checkState(unWrapSchema(NESTED_SCHEMA.getField("spamMap").schema())
		.getType().equals(Schema.Type.MAP));
	ImmutableMap.Builder<String, String> map = ImmutableMap.builder();
	map.put("testKey", "testValue");

	GenericRecord record = new GenericRecordBuilder(NESTED_SCHEMA)
		.set("foo", 32L)
		.set("spamMap", map.build())
		.build();

	Path path = createTempParquetFile(tempRoot.getRoot(), NESTED_SCHEMA, Collections.singletonList(record));
	MessageType readSchema = (new AvroSchemaConverter()).convert(NESTED_SCHEMA);
	ParquetRecordReader<Row> rowReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema);

	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(path.toUri()), testConfig);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);

	rowReader.initialize(fileReader, testConfig);
	assertFalse(rowReader.reachEnd());

	Row row = rowReader.nextRecord();
	assertEquals(7, row.getArity());

	assertEquals(32L, row.getField(0));
	Map<?, ?> result = (Map<?, ?>) row.getField(1);
	assertEquals(result.get("testKey").toString(), "testValue");
	assertTrue(rowReader.reachEnd());
}
 
Example 22
Source Project: dremio-oss   Source File: ParquetRecordWriter.java    License: Apache License 2.0
private MessageType getParquetMessageTypeWithIds(BatchSchema batchSchema, String name) {
  List<Type> types = Lists.newArrayList();
  for (Field field : batchSchema) {
    if (field.getName().equalsIgnoreCase(WriterPrel.PARTITION_COMPARATOR_FIELD)) {
      continue;
    }
    Type childType = getTypeWithId(field, field.getName());
    if (childType != null) {
      types.add(childType);
    }
  }
  Preconditions.checkState(types.size() > 0, "No types for parquet schema");
  return new MessageType(name, types);
}
 
Example 23
Source Project: parquet-mr   Source File: TestColumnReaderImpl.java    License: Apache License 2.0
@Test
public void test() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  MemPageWriter pageWriter = new MemPageWriter();
  ColumnWriterV2 columnWriterV2 = new ColumnWriterV2(col, pageWriter,
      ParquetProperties.builder()
          .withDictionaryPageSize(1024).withWriterVersion(PARQUET_2_0)
          .withPageSize(2048).build());
  for (int i = 0; i < rows; i++) {
    columnWriterV2.write(Binary.fromString("bar" + i % 10), 0, 0);
    if ((i + 1) % 1000 == 0) {
      columnWriterV2.writePage();
    }
  }
  columnWriterV2.writePage();
  columnWriterV2.finalizeColumnChunk();
  List<DataPage> pages = pageWriter.getPages();
  int valueCount = 0;
  int rowCount = 0;
  for (DataPage dataPage : pages) {
    valueCount += dataPage.getValueCount();
    rowCount += ((DataPageV2)dataPage).getRowCount();
  }
  assertEquals(rows, rowCount);
  assertEquals(rows, valueCount);
  MemPageReader pageReader = new MemPageReader(rows, pages.iterator(), pageWriter.getDictionaryPage());
  ValidatingConverter converter = new ValidatingConverter();
  ColumnReader columnReader = new ColumnReaderImpl(col, pageReader, converter, VersionParser.parse(Version.FULL_VERSION));
  for (int i = 0; i < rows; i++) {
    assertEquals(0, columnReader.getCurrentRepetitionLevel());
    assertEquals(0, columnReader.getCurrentDefinitionLevel());
    columnReader.writeCurrentValueToConverter();
    columnReader.consume();
  }
  assertEquals(rows, converter.count);
}
 
Example 24
Source Project: presto   Source File: ParquetTester.java    License: Apache License 2.0
private static void writeParquetColumn(
        JobConf jobConf,
        File outputFile,
        CompressionCodecName compressionCodecName,
        Properties tableProperties,
        SettableStructObjectInspector objectInspector,
        Iterator<?>[] valuesByField,
        Optional<MessageType> parquetSchema,
        boolean singleLevelArray)
        throws Exception
{
    RecordWriter recordWriter = new TestMapredParquetOutputFormat(parquetSchema, singleLevelArray)
            .getHiveRecordWriter(
                    jobConf,
                    new Path(outputFile.toURI()),
                    Text.class,
                    compressionCodecName != UNCOMPRESSED,
                    tableProperties,
                    () -> {});
    Object row = objectInspector.create();
    List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());
    while (stream(valuesByField).allMatch(Iterator::hasNext)) {
        for (int field = 0; field < fields.size(); field++) {
            Object value = valuesByField[field].next();
            objectInspector.setStructFieldData(row, fields.get(field), value);
        }
        ParquetHiveSerDe serde = new ParquetHiveSerDe();
        serde.initialize(jobConf, tableProperties, null);
        Writable record = serde.serialize(row, objectInspector);
        recordWriter.write(record);
    }
    recordWriter.close(false);
}
 
Example 25
Source Project: flink   Source File: ParquetColumnarRowSplitReader.java    License: Apache License 2.0
public ParquetColumnarRowSplitReader(
		boolean utcTimestamp,
		boolean caseSensitive,
		Configuration conf,
		LogicalType[] selectedTypes,
		String[] selectedFieldNames,
		ColumnBatchGenerator generator,
		int batchSize,
		Path path,
		long splitStart,
		long splitLength) throws IOException {
	this.utcTimestamp = utcTimestamp;
	this.selectedTypes = selectedTypes;
	this.batchSize = batchSize;
	// read the footer, then apply the predicate push-down filter to prune row groups
	ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength));
	MessageType fileSchema = footer.getFileMetaData().getSchema();
	FilterCompat.Filter filter = getFilter(conf);
	List<BlockMetaData> blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);

	this.fileSchema = footer.getFileMetaData().getSchema();
	this.requestedSchema = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive);
	this.reader = new ParquetFileReader(
			conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());

	long totalRowCount = 0;
	for (BlockMetaData block : blocks) {
		totalRowCount += block.getRowCount();
	}
	this.totalRowCount = totalRowCount;
	this.nextRow = 0;
	this.rowsInBatch = 0;
	this.rowsReturned = 0;

	checkSchema();

	this.writableVectors = createWritableVectors();
	this.columnarBatch = generator.generate(createReadableVectors());
	this.row = new ColumnarRowData(columnarBatch);
}
 
Example 26
Source Project: parquet-mr   Source File: TestTupleRecordConsumer.java    License: Apache License 2.0
private void testFromGroups(String pigSchemaString, List<Group> input) throws ParserException {
  List<Tuple> tuples = new ArrayList<Tuple>();
  MessageType schema = getMessageType(pigSchemaString);
  RecordMaterializer<Tuple> pigRecordConsumer = newPigRecordConsumer(pigSchemaString);
  GroupWriter groupWriter = new GroupWriter(new RecordConsumerLoggingWrapper(new ConverterConsumer(pigRecordConsumer.getRootConverter(), schema)), schema);

  for (Group group : input) {
    groupWriter.write(group);
    final Tuple tuple = pigRecordConsumer.getCurrentRecord();
    tuples.add(tuple);
    LOG.debug("in: {}\nout:{}", group, tuple);
  }

  List<Group> groups = new ArrayList<Group>();
  GroupRecordConverter recordConsumer = new GroupRecordConverter(schema);
  TupleWriteSupport tupleWriter = newTupleWriter(pigSchemaString, recordConsumer);
  for (Tuple t : tuples) {
    LOG.debug("{}", t);
    tupleWriter.write(t);
    groups.add(recordConsumer.getCurrentRecord());
  }

  assertEquals(input.size(), groups.size());
  for (int i = 0; i < input.size(); i++) {
    Group in = input.get(i);
    LOG.debug("{}", in);
    Group out = groups.get(i);
    assertEquals(in.toString(), out.toString());
  }
}
 
Example 27
Source Project: iceberg   Source File: TypeToMessageType.java    License: Apache License 2.0
public MessageType convert(Schema schema, String name) {
  Types.MessageTypeBuilder builder = Types.buildMessage();

  for (NestedField field : schema.columns()) {
    builder.addField(field(field));
  }

  return builder.named(name);
}
 
Example 28
Source Project: iceberg   Source File: GenericParquetReaders.java    License: Apache License 2.0
@SuppressWarnings("unchecked")
public static ParquetValueReader<GenericRecord> buildReader(Schema expectedSchema,
                                                            MessageType fileSchema) {
  if (hasIds(fileSchema)) {
    return (ParquetValueReader<GenericRecord>)
        TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema,
            new ReadBuilder(fileSchema));
  } else {
    return (ParquetValueReader<GenericRecord>)
        TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema,
            new FallbackReadBuilder(fileSchema));
  }
}
 
Example 29
private void testCase(String testCaseName)
    throws SchemaConversionException, DataConversionException {
  JsonObject test = testCases.get(testCaseName).getAsJsonObject();
  parquetConverter = new JsonIntermediateToParquetGroupConverter();

  MessageType schema = parquetConverter.convertSchema(test.get("schema").getAsJsonArray(), workUnit);
  Group record =
      parquetConverter.convertRecord(schema, test.get("record").getAsJsonObject(), workUnit).iterator().next();
  assertEqualsIgnoreSpaces(schema.toString(), test.get("expectedSchema").getAsString());
  assertEqualsIgnoreSpaces(record.toString(), test.get("expectedRecord").getAsString());
}