org.apache.parquet.schema.MessageType Java Examples

The following examples show how to use org.apache.parquet.schema.MessageType. Each example is taken from an open-source project; the source file and project are noted above each snippet.
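A minimal sketch, before the project examples, of the two ways a MessageType typically originates in the snippets below: built programmatically through the fluent Types.buildMessage() builder, or parsed from Parquet's textual schema representation via MessageTypeParser. The schema name "user" and the fields "id" and "name" are made up purely for illustration.

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.Types;
import static org.apache.parquet.schema.OriginalType.UTF8;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;

public class MessageTypeSketch {
  public static void main(String[] args) {
    // Build the schema with the fluent builder API
    MessageType built = Types.buildMessage()
        .required(INT32).named("id")
        .optional(BINARY).as(UTF8).named("name")
        .named("user");

    // Parse the equivalent schema from its textual representation
    MessageType parsed = MessageTypeParser.parseMessageType(
        "message user {\n" +
        "  required int32 id;\n" +
        "  optional binary name (UTF8);\n" +
        "}");

    // Both routes should yield the same schema (the parser tests below rely on this)
    System.out.println(built.equals(parsed)); // expected: true
    System.out.println(built);                // prints the textual schema back
  }
}

Both routes should produce an equivalent schema; this is the same pattern the parser tests below use when comparing a parsed schema against one built with Types.buildMessage() (see Example #4 and Example #6).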
Example #1
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * {@link ParquetFileWriter#mergeFooters(Path, List)} expects a fully-qualified
 * path for the root and crashes if a relative one is provided.
 */
@Test
public void testWriteMetadataFileWithRelativeOutputPath() throws IOException {
  Configuration conf = new Configuration();
  FileSystem fs = FileSystem.get(conf);
  Path relativeRoot = new Path("target/_test_relative");
  Path qualifiedRoot = fs.makeQualified(relativeRoot);

  ParquetMetadata mock = Mockito.mock(ParquetMetadata.class);
  FileMetaData fileMetaData = new FileMetaData(
          new MessageType("root1",
              new PrimitiveType(REPEATED, BINARY, "a")),
          new HashMap<String, String>(), "test");
  Mockito.when(mock.getFileMetaData()).thenReturn(fileMetaData);

  List<Footer> footers = new ArrayList<Footer>();
  Footer footer = new Footer(new Path(qualifiedRoot, "one"), mock);
  footers.add(footer);

  // This should not throw an exception
  ParquetFileWriter.writeMetadataFile(conf, relativeRoot, footers, JobSummaryLevel.ALL);
}
 
Example #2
Source File: ParquetResolverTest.java    From pxf with Apache License 2.0
private MessageType getParquetSchemaForPrimitiveTypes(Type.Repetition repetition, boolean readCase) {
    List<Type> fields = new ArrayList<>();

    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "s1", OriginalType.UTF8));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "s2", OriginalType.UTF8));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT32, "n1", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.DOUBLE, "d1", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, 16, "dc1", OriginalType.DECIMAL, new DecimalMetadata(38, 18), null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT96, "tm", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.FLOAT, "f", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT64, "bg", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BOOLEAN, "b", null));

    // GPDB only has an int16 type and no int8 type, so on the write path tiny numbers (int8) are still treated as shorts (int16)
    OriginalType tinyType = readCase ? OriginalType.INT_8 : OriginalType.INT_16;
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT32, "tn", tinyType));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT32, "sml", OriginalType.INT_16));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "vc1", OriginalType.UTF8));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "c1", OriginalType.UTF8));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.BINARY, "bin", null));

    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT96, "tmtz", null));
    fields.add(new PrimitiveType(repetition, PrimitiveTypeName.INT96, "tmtz2", null));

    return new MessageType("hive_schema", fields);
}
 
Example #3
Source File: ParquetRowInputFormatTest.java    From flink with Apache License 2.0
@Test
public void testReadRowFromSimpleRecord() throws IOException {
	Tuple3<Class<? extends SpecificRecord>, SpecificRecord, Row> simple = TestUtil.getSimpleRecordTestData();
	Path path = TestUtil.createTempParquetFile(
		tempRoot.getRoot(), TestUtil.SIMPLE_SCHEMA, Arrays.asList(simple.f1, simple.f1));
	MessageType simpleType = SCHEMA_CONVERTER.convert(TestUtil.SIMPLE_SCHEMA);

	ParquetRowInputFormat inputFormat = new ParquetRowInputFormat(path, simpleType);
	inputFormat.setRuntimeContext(TestUtil.getMockRuntimeContext());

	FileInputSplit[] splits = inputFormat.createInputSplits(1);
	assertEquals(1, splits.length);
	inputFormat.open(splits[0]);

	Row row = inputFormat.nextRecord(null);
	assertNotNull(row);
	assertEquals(simple.f2, row);

	row = inputFormat.nextRecord(null);
	assertNotNull(row);
	assertEquals(simple.f2, row);

	assertTrue(inputFormat.reachedEnd());
}
 
Example #4
Source File: TestParquetParser.java    From parquet-mr with Apache License 2.0
@Test
public void testEmbeddedAnnotations() {
  String message = "message EmbeddedMessage {" +
      "  required binary json (JSON);" +
      "  required binary bson (BSON);" +
      "}\n";

  MessageType parsed = MessageTypeParser.parseMessageType(message);
  MessageType expected = Types.buildMessage()
      .required(BINARY).as(JSON).named("json")
      .required(BINARY).as(BSON).named("bson")
      .named("EmbeddedMessage");

  assertEquals(expected, parsed);
  MessageType reparsed = MessageTypeParser.parseMessageType(parsed.toString());
  assertEquals(expected, reparsed);
}
 
Example #5
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * FOR TESTING ONLY. This supports testing block padding behavior on the local FS.
 *
 * @param configuration Hadoop configuration
 * @param schema the schema of the data
 * @param file the file to write to
 * @param rowAndBlockSize the row group size
 * @param maxPaddingSize the maximum padding
 * @throws IOException if the file can not be created
 */
ParquetFileWriter(Configuration configuration, MessageType schema,
                  Path file, long rowAndBlockSize, int maxPaddingSize)
    throws IOException {
  FileSystem fs = file.getFileSystem(configuration);
  this.schema = schema;
  this.alignment = PaddingAlignment.get(
      rowAndBlockSize, rowAndBlockSize, maxPaddingSize);
  this.out = HadoopStreams.wrap(
      fs.create(file, true, 8192, fs.getDefaultReplication(file), rowAndBlockSize));
  this.encodingStatsBuilder = new EncodingStats.Builder();
  // no truncation is needed for testing
  this.columnIndexTruncateLength = Integer.MAX_VALUE;
  this.pageWriteChecksumEnabled = ParquetOutputFormat.getPageWriteChecksumEnabled(configuration);
  this.crc = pageWriteChecksumEnabled ? new CRC32() : null;
  this.metadataConverter = new ParquetMetadataConverter(ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH);
}
 
Example #6
Source File: TestParquetParser.java    From parquet-mr with Apache License 2.0
@Test
public void testIDs() {
  String message =
      "message Message {\n" +
      "  required binary string (UTF8) = 6;\n" +
      "  required int32 i=1;\n" +
      "  required binary s2= 3;\n" +
      "  required binary s3 =4;\n" +
      "}\n";

  MessageType parsed = parseMessageType(message);
  MessageType expected = buildMessage()
      .required(BINARY).as(OriginalType.UTF8).id(6).named("string")
      .required(INT32).id(1).named("i")
      .required(BINARY).id(3).named("s2")
      .required(BINARY).id(4).named("s3")
      .named("Message");

  assertEquals(expected, parsed);
  MessageType reparsed = parseMessageType(parsed.toString());
  assertEquals(expected, reparsed);
}
 
Example #7
Source File: ParquetMetricsRowGroupFilter.java    From iceberg with Apache License 2.0
private boolean eval(MessageType fileSchema, BlockMetaData rowGroup) {
  if (rowGroup.getRowCount() <= 0) {
    return ROWS_CANNOT_MATCH;
  }

  this.stats = Maps.newHashMap();
  this.valueCounts = Maps.newHashMap();
  this.conversions = Maps.newHashMap();
  for (ColumnChunkMetaData col : rowGroup.getColumns()) {
    PrimitiveType colType = fileSchema.getType(col.getPath().toArray()).asPrimitiveType();
    if (colType.getId() != null) {
      int id = colType.getId().intValue();
      stats.put(id, col.getStatistics());
      valueCounts.put(id, col.getValueCount());
      conversions.put(id, ParquetConversions.converterFromParquet(colType));
    }
  }

  return ExpressionVisitors.visitEvaluator(expr, this);
}
 
Example #8
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0
@Test
public void testReadUsingProjectedSchema() {
  MessageType originalSchema = new MessageType("schema",
          new PrimitiveType(REQUIRED, INT32, "a"),
          new PrimitiveType(REQUIRED, INT32, "b")
  );
  MessageType projectedSchema = new MessageType("schema",
          new PrimitiveType(OPTIONAL, INT32, "b")
  );
  MemPageStore store = new MemPageStore(1);
  SimpleGroupFactory groupFactory = new SimpleGroupFactory(originalSchema);
  writeGroups(originalSchema, store, groupFactory.newGroup().append("a", 1).append("b", 2));

  {
    List<Group> groups = new ArrayList<>();
    groups.addAll(readGroups(store, originalSchema, projectedSchema, 1));
    Object[][] expected = {
            {2},
    };
    validateGroups(groups, expected);
  }
}
 
Example #9
Source File: ParquetMapInputFormatTest.java    From flink with Apache License 2.0
@Test
@SuppressWarnings("unchecked")
public void testProjectedReadMapFromNestedRecord() throws IOException {
	Tuple3<Class<? extends SpecificRecord>, SpecificRecord, Row> nested = TestUtil.getNestedRecordTestData();
	Path path = TestUtil.createTempParquetFile(tempRoot.getRoot(), TestUtil.NESTED_SCHEMA, Collections.singletonList(nested.f1));
	MessageType nestedType = SCHEMA_CONVERTER.convert(TestUtil.NESTED_SCHEMA);
	ParquetMapInputFormat inputFormat = new ParquetMapInputFormat(path, nestedType);

	inputFormat.selectFields(Collections.singletonList("nestedMap").toArray(new String[0]));
	inputFormat.setRuntimeContext(TestUtil.getMockRuntimeContext());

	FileInputSplit[] splits = inputFormat.createInputSplits(1);
	assertEquals(1, splits.length);
	inputFormat.open(splits[0]);

	Map map = inputFormat.nextRecord(null);
	assertNotNull(map);
	assertEquals(1, map.size());

	Map<String, String> mapItem = (Map<String, String>) ((Map) map.get("nestedMap")).get("mapItem");
	assertEquals(2, mapItem.size());
	assertEquals("map", mapItem.get("type"));
	assertEquals("hashMap", mapItem.get("value"));
}
 
Example #10
Source File: DumpCommand.java    From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
    super.execute(options);

    String[] args = options.getArgs();
    String input = args[0];

    Configuration conf = new Configuration();
    Path inpath = new Path(input);

    ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inpath, NO_FILTER);
    MessageType schema = metaData.getFileMetaData().getSchema();

    boolean showmd = !options.hasOption('m');
    boolean showdt = !options.hasOption('d');
    boolean cropoutput = !options.hasOption('n');

    Set<String> showColumns = null;
    if (options.hasOption('c')) {
        String[] cols = options.getOptionValues('c');
        showColumns = new HashSet<String>(Arrays.asList(cols));
    }

    PrettyPrintWriter out = prettyPrintWriter(cropoutput);
    dump(out, metaData, schema, inpath, showmd, showdt, showColumns);
}
 
Example #11
Source File: ParquetRecordWriter.java    From parquet-mr with Apache License 2.0
/**
 *
 * @param w the file to write to
 * @param writeSupport the class to convert incoming records
 * @param schema the schema of the records
 * @param extraMetaData extra meta data to write in the footer of the file
 * @param blockSize the size of a block in the file (this will be approximate)
 * @param codec the compression codec used to compress the pages
 * @param validating if schema validation should be turned on
 * @param props parquet encoding properties
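 * @param memoryManager the memory manager that the internal writer is registered with (must not be null)
 * @param conf the Hadoop configuration, used here to create the compression codec factory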
 */
ParquetRecordWriter(
    ParquetFileWriter w,
    WriteSupport<T> writeSupport,
    MessageType schema,
    Map<String, String> extraMetaData,
    long blockSize,
    CompressionCodecName codec,
    boolean validating,
    ParquetProperties props,
    MemoryManager memoryManager,
    Configuration conf) {
  this.codecFactory = new CodecFactory(conf, props.getPageSizeThreshold());
  internalWriter = new InternalParquetRecordWriter<T>(w, writeSupport, schema,
      extraMetaData, blockSize, codecFactory.getCompressor(codec), validating,
      props);
  this.memoryManager = Objects.requireNonNull(memoryManager, "memoryManager cannot be null");
  memoryManager.addWriter(internalWriter, blockSize);
}
 
Example #12
Source File: ScroogeRecordConverter.java    From parquet-mr with Apache License 2.0
public ScroogeRecordConverter(final Class<T> thriftClass, MessageType parquetSchema, StructType thriftType, Configuration conf) {
  super(new ThriftReader<T>() {
    @SuppressWarnings("unchecked")
    ThriftStructCodec<T> codec = (ThriftStructCodec<T>) getCodec(thriftClass);
    @Override
    public T readOneRecord(TProtocol protocol) throws TException {
        return codec.decode(protocol);
    }
  }, thriftClass.getSimpleName(), parquetSchema, thriftType, conf);
}
 
Example #13
Source File: TestThriftSchemaConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testLogicalTypeConversion() throws Exception {
  String expected =
    "message ParquetSchema {\n" +
      "  required int32 test_i16 (INTEGER(16,true)) = 1;" +
      "}";
  ThriftSchemaConverter schemaConverter = new ThriftSchemaConverter();
  final MessageType converted = schemaConverter.convert(TestLogicalType.class);
  assertEquals(MessageTypeParser.parseMessageType(expected), converted);
}
 
Example #14
Source File: PredicateUtils.java    From presto with Apache License 2.0
public static Predicate buildPredicate(MessageType requestedSchema, TupleDomain<ColumnDescriptor> parquetTupleDomain, Map<List<String>, RichColumnDescriptor> descriptorsByPath)
{
    ImmutableList.Builder<RichColumnDescriptor> columnReferences = ImmutableList.builder();
    for (String[] paths : requestedSchema.getPaths()) {
        RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(paths));
        if (descriptor != null) {
            columnReferences.add(descriptor);
        }
    }
    return new TupleDomainParquetPredicate(parquetTupleDomain, columnReferences.build());
}
 
Example #15
Source File: TestThriftToParquetFileWriter.java    From parquet-mr with Apache License 2.0
private ParquetReader<Group> createRecordReader(Path parquetFilePath) throws IOException {
  Configuration configuration = new Configuration(true);

  GroupReadSupport readSupport = new GroupReadSupport();
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, parquetFilePath);
  MessageType schema = readFooter.getFileMetaData().getSchema();

  readSupport.init(configuration, null, schema);
  return new ParquetReader<Group>(parquetFilePath, readSupport);
}
 
Example #16
Source File: SparkModelParser.java    From ignite with Apache License 2.0
/**
 * Load Decision Tree model.
 *
 * @param pathToMdl Path to model.
 * @param learningEnvironment Learning environment.
 */
private static Model loadDecisionTreeModel(String pathToMdl, LearningEnvironment learningEnvironment) {
    try (ParquetFileReader r = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(pathToMdl), new Configuration()))) {
        PageReadStore pages;

        final MessageType schema = r.getFooter().getFileMetaData().getSchema();
        final MessageColumnIO colIO = new ColumnIOFactory().getColumnIO(schema);
        final Map<Integer, NodeData> nodes = new TreeMap<>();

        while (null != (pages = r.readNextRowGroup())) {
            final long rows = pages.getRowCount();
            final RecordReader recordReader = colIO.getRecordReader(pages, new GroupRecordConverter(schema));

            for (int i = 0; i < rows; i++) {
                final SimpleGroup g = (SimpleGroup)recordReader.read();
                NodeData nodeData = extractNodeDataFromParquetRow(g);
                nodes.put(nodeData.id, nodeData);
            }
        }
        return buildDecisionTreeModel(nodes);
    }
    catch (IOException e) {
        String msg = "Error reading parquet file: " + e.getMessage();
        learningEnvironment.logger().log(MLLogger.VerboseLevel.HIGH, msg);
        e.printStackTrace();
    }
    return null;
}
 
Example #17
Source File: ParquetInputFormat.java    From flink with Apache License 2.0
@Override
public void open(FileInputSplit split) throws IOException {
	// reset the flag when open a new split
	this.skipThisSplit = false;
	org.apache.hadoop.conf.Configuration configuration = new org.apache.hadoop.conf.Configuration();
	InputFile inputFile =
		HadoopInputFile.fromPath(new org.apache.hadoop.fs.Path(split.getPath().toUri()), configuration);
	ParquetReadOptions options = ParquetReadOptions.builder().build();
	ParquetFileReader fileReader = new ParquetFileReader(inputFile, options);
	MessageType fileSchema = fileReader.getFileMetaData().getSchema();
	MessageType readSchema = getReadSchema(fileSchema, split.getPath());
	if (skipThisSplit) {
		LOG.warn(String.format(
			"Escaped the file split [%s] due to mismatch of file schema to expected result schema",
			split.getPath().toString()));
	} else {
		this.parquetRecordReader = new ParquetRecordReader<>(new RowReadSupport(), readSchema,
			filterPredicate == null ? FilterCompat.NOOP : FilterCompat.get(filterPredicate));
		this.parquetRecordReader.initialize(fileReader, configuration);
		this.parquetRecordReader.setSkipCorruptedRecord(this.skipCorruptedRecord);

		if (this.recordConsumed == null) {
			this.recordConsumed = getRuntimeContext().getMetricGroup().counter("parquet-records-consumed");
		}

		LOG.debug(String.format("Open ParquetInputFormat with FileInputSplit [%s]", split.getPath().toString()));
	}
}
 
Example #18
Source File: Parquet.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
private <T> WriteSupport<T> getWriteSupport(MessageType type) {
  if (writeSupport != null) {
    return (WriteSupport<T>) writeSupport;
  } else {
    return new AvroWriteSupport<>(
        type,
        ParquetAvro.parquetAvroSchema(AvroSchemaUtil.convert(schema, name)),
        ParquetAvro.DEFAULT_MODEL);
  }
}
 
Example #19
Source File: TestHiveSchemaConverter.java    From parquet-mr with Apache License 2.0
private void testConversion(final String columnNamesStr, final String columnsTypeStr, final String expectedSchema) throws Exception {
  final List<String> columnNames = createHiveColumnsFrom(columnNamesStr);
  final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(columnsTypeStr);
  final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes);
  final MessageType expectedMT = MessageTypeParser.parseMessageType(expectedSchema);
  assertEquals("converting " + columnNamesStr + ": " + columnsTypeStr + " to " + expectedSchema, expectedMT, messageTypeFound);
}
 
Example #20
Source File: ParquetReader.java    From iceberg with Apache License 2.0
public ParquetReader(InputFile input, Schema expectedSchema, ParquetReadOptions options,
                     Function<MessageType, ParquetValueReader<?>> readerFunc, NameMapping nameMapping,
                     Expression filter, boolean reuseContainers, boolean caseSensitive) {
  this.input = input;
  this.expectedSchema = expectedSchema;
  this.options = options;
  this.readerFunc = readerFunc;
  // replace alwaysTrue with null to avoid extra work evaluating a trivial filter
  this.filter = filter == Expressions.alwaysTrue() ? null : filter;
  this.reuseContainers = reuseContainers;
  this.caseSensitive = caseSensitive;
  this.nameMapping = nameMapping;
}
 
Example #21
Source File: ParquetMetadataConverter.java    From parquet-mr with Apache License 2.0
private List<ColumnOrder> getColumnOrders(MessageType schema) {
  List<ColumnOrder> columnOrders = new ArrayList<>();
  // Currently, only TypeDefinedOrder is supported, so we create a column order with TypeDefinedOrder
  // for each column, even if some types (e.g. INT96) have undefined column orders.
  for (int i = 0, n = schema.getPaths().size(); i < n; ++i) {
    ColumnOrder columnOrder = new ColumnOrder();
    columnOrder.setTYPE_ORDER(TYPE_DEFINED_ORDER);
    columnOrders.add(columnOrder);
  }
  return columnOrders;
}
 
Example #22
Source File: SchemaConverter.java    From parquet-mr with Apache License 2.0
/**
 * Creates an Arrow Schema from an Parquet one and returns the mapping
 * @param parquetSchema the provided Parquet Schema
 * @return the mapping between the 2
 */
public SchemaMapping fromParquet(MessageType parquetSchema) {
  List<Type> fields = parquetSchema.getFields();
  List<TypeMapping> mappings = fromParquet(fields);
  List<Field> arrowFields = fields(mappings);
  return new SchemaMapping(new Schema(arrowFields), parquetSchema, mappings);
}
 
Example #23
Source File: ParquetPojoInputFormat.java    From flink with Apache License 2.0
public ParquetPojoInputFormat(Path filePath, MessageType messageType, PojoTypeInfo<E> pojoTypeInfo) {
	super(filePath, messageType);
	this.pojoTypeClass = pojoTypeInfo.getTypeClass();
	this.typeSerializer = pojoTypeInfo.createSerializer(new ExecutionConfig());
	final Map<String, Field> fieldMap = new HashMap<>();
	findAllFields(pojoTypeClass, fieldMap);
	selectFields(fieldMap.keySet().toArray(new String[0]));
}
 
Example #24
Source File: VectorizedSparkParquetReaders.java    From iceberg with Apache License 2.0
public static ColumnarBatchReader buildReader(
    Schema expectedSchema,
    MessageType fileSchema,
    boolean setArrowValidityVector) {
  return (ColumnarBatchReader)
      TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema,
          new VectorizedReaderBuilder(expectedSchema, fileSchema, setArrowValidityVector));
}
 
Example #25
Source File: JsonIntermediateToParquetGroupConverterTest.java    From incubator-gobblin with Apache License 2.0
private void testCase(String testCaseName)
    throws SchemaConversionException, DataConversionException {
  JsonObject test = testCases.get(testCaseName).getAsJsonObject();
  parquetConverter = new JsonIntermediateToParquetGroupConverter();

  MessageType schema = parquetConverter.convertSchema(test.get("schema").getAsJsonArray(), workUnit);
  Group record =
      parquetConverter.convertRecord(schema, test.get("record").getAsJsonObject(), workUnit).iterator().next();
  assertEqualsIgnoreSpaces(schema.toString(), test.get("expectedSchema").getAsString());
  assertEqualsIgnoreSpaces(record.toString(), test.get("expectedRecord").getAsString());
}
 
Example #26
Source File: ParquetColumnarRowSplitReader.java    From flink with Apache License 2.0
public ParquetColumnarRowSplitReader(
		boolean utcTimestamp,
		boolean caseSensitive,
		Configuration conf,
		LogicalType[] selectedTypes,
		String[] selectedFieldNames,
		ColumnBatchGenerator generator,
		int batchSize,
		Path path,
		long splitStart,
		long splitLength) throws IOException {
	this.utcTimestamp = utcTimestamp;
	this.selectedTypes = selectedTypes;
	this.batchSize = batchSize;
	// then we need to apply the predicate push down filter
	ParquetMetadata footer = readFooter(conf, path, range(splitStart, splitStart + splitLength));
	MessageType fileSchema = footer.getFileMetaData().getSchema();
	FilterCompat.Filter filter = getFilter(conf);
	List<BlockMetaData> blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);

	this.fileSchema = footer.getFileMetaData().getSchema();
	this.requestedSchema = clipParquetSchema(fileSchema, selectedFieldNames, caseSensitive);
	this.reader = new ParquetFileReader(
			conf, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());

	long totalRowCount = 0;
	for (BlockMetaData block : blocks) {
		totalRowCount += block.getRowCount();
	}
	this.totalRowCount = totalRowCount;
	this.nextRow = 0;
	this.rowsInBatch = 0;
	this.rowsReturned = 0;

	checkSchema();

	this.writableVectors = createWritableVectors();
	this.columnarBatch = generator.generate(createReadableVectors());
	this.row = new ColumnarRowData(columnarBatch);
}
 
Example #27
Source File: TestTupleRecordConsumer.java    From parquet-mr with Apache License 2.0
private void testFromGroups(String pigSchemaString, List<Group> input) throws ParserException {
  List<Tuple> tuples = new ArrayList<Tuple>();
  MessageType schema = getMessageType(pigSchemaString);
  RecordMaterializer<Tuple> pigRecordConsumer = newPigRecordConsumer(pigSchemaString);
  GroupWriter groupWriter = new GroupWriter(new RecordConsumerLoggingWrapper(new ConverterConsumer(pigRecordConsumer.getRootConverter(), schema)), schema);

  for (Group group : input) {
    groupWriter.write(group);
    final Tuple tuple = pigRecordConsumer.getCurrentRecord();
    tuples.add(tuple);
    LOG.debug("in: {}\nout:{}", group, tuple);
  }

  List<Group> groups = new ArrayList<Group>();
  GroupRecordConverter recordConsumer = new GroupRecordConverter(schema);
  TupleWriteSupport tupleWriter = newTupleWriter(pigSchemaString, recordConsumer);
  for (Tuple t : tuples) {
    LOG.debug("{}", t);
    tupleWriter.write(t);
    groups.add(recordConsumer.getCurrentRecord());
  }

  assertEquals(input.size(), groups.size());
  for (int i = 0; i < input.size(); i++) {
    Group in = input.get(i);
    LOG.debug("{}", in);
    Group out = groups.get(i);
    assertEquals(in.toString(), out.toString());
  }
}
 
Example #28
Source File: TypeToMessageType.java    From iceberg with Apache License 2.0
public MessageType convert(Schema schema, String name) {
  Types.MessageTypeBuilder builder = Types.buildMessage();

  for (NestedField field : schema.columns()) {
    builder.addField(field(field));
  }

  return builder.named(name);
}
 
Example #29
Source File: GenericParquetReaders.java    From iceberg with Apache License 2.0
@SuppressWarnings("unchecked")
public static ParquetValueReader<GenericRecord> buildReader(Schema expectedSchema,
                                                            MessageType fileSchema) {
  if (hasIds(fileSchema)) {
    return (ParquetValueReader<GenericRecord>)
        TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema,
            new ReadBuilder(fileSchema));
  } else {
    return (ParquetValueReader<GenericRecord>)
        TypeWithSchemaVisitor.visit(expectedSchema.asStruct(), fileSchema,
            new FallbackReadBuilder(fileSchema));
  }
}