org.apache.parquet.schema.MessageTypeParser Java Examples

The following examples show how to use org.apache.parquet.schema.MessageTypeParser. Each example is taken from an open-source project; the project and source file are noted above each snippet so you can follow up on the surrounding context.
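Before the project examples, here is the core call in isolation: MessageTypeParser.parseMessageType takes a schema string written in Parquet's message syntax and returns a MessageType. A minimal sketch (the schema text below is illustrative, not taken from any of the projects):

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

MessageType schema = MessageTypeParser.parseMessageType(
    "message example {\n" +
    "  required int32 id;\n" +
    "  optional binary name (UTF8);\n" +
    "}");
// MessageType.toString() renders the schema back in the same
// message syntax, so it can be parsed again to an equal schema.
System.out.println(schema);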
Example #1
Source File: TestThriftSchemaConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testConvertStructCreatedViaDeprecatedConstructor() {
  String expected = "message ParquetSchema {\n" +
    "  required binary a (UTF8) = 1;\n" +
    "  required binary b (UTF8) = 2;\n" +
    "}\n";

  ThriftSchemaConverter converter = new ThriftSchemaConverter();

  StructType structType = new StructType(
    Arrays.asList(
      new ThriftField("a", (short) 1, REQUIRED, new ThriftType.StringType()),
      new ThriftField("b", (short) 2, REQUIRED, new ThriftType.StringType())
    )
  );

  final MessageType converted = converter.convert(structType);
  assertEquals(MessageTypeParser.parseMessageType(expected), converted);
}
 
Example #2
Source File: TestThriftSchemaConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testToMessageType() throws Exception {
  String expected =
          "message ParquetSchema {\n" +
                  "  optional group persons (LIST) = 1 {\n" +
                  "    repeated group persons_tuple {\n" +
                  "      required group name = 1 {\n" +
                  "        optional binary first_name (UTF8) = 1;\n" +
                  "        optional binary last_name (UTF8) = 2;\n" +
                  "      }\n" +
                  "      optional int32 id = 2;\n" +
                  "      optional binary email (UTF8) = 3;\n" +
                  "      optional group phones (LIST) = 4 {\n" +
                  "        repeated group phones_tuple {\n" +
                  "          optional binary number (UTF8) = 1;\n" +
                  "          optional binary type (ENUM) = 2;\n" +
                  "        }\n" +
                  "      }\n" +
                  "    }\n" +
                  "  }\n" +
                  "}";
  ThriftSchemaConverter schemaConverter = new ThriftSchemaConverter();
  final MessageType converted = schemaConverter.convert(AddressBook.class);
  assertEquals(MessageTypeParser.parseMessageType(expected), converted);
}
 
Example #3
Source File: SqlInterpreterTest.java    From zeppelin with Apache License 2.0
public File createParquetFile(int[] values,
                              ParquetProperties.WriterVersion version) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".par");
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType(
          "message test { "
                  + "required int32 int32_field; "
                  + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  ParquetWriter<Group> writer = new ParquetWriter<Group>(
          path,
          new GroupWriteSupport(),
          CompressionCodecName.UNCOMPRESSED,
          1024,   // row group (block) size in bytes
          1024,   // page size in bytes
          512,    // dictionary page size in bytes
          true,   // enable dictionary encoding
          false,  // disable schema/record validation
          version,
          conf);
  for (int i = 0; i < values.length; i++) {
    writer.write(f.newGroup()
            .append("int32_field", values[i]));
  }
  writer.close();
  return file;
}
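This positional constructor is deprecated in newer parquet-mr releases in favor of the builder API. A minimal sketch of an equivalent writer, assuming the ExampleParquetWriter helper from parquet-mr's example module is available (the schema still travels through conf via GroupWriteSupport.setSchema, as above):

ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
    .withConf(conf)  // conf carries the schema set by GroupWriteSupport.setSchema
    .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
    .withWriterVersion(version)
    .build();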
 
Example #4
Source File: TestZstandardCodec.java    From parquet-mr with Apache License 2.0
private RunningJob runMapReduceJob(CompressionCodecName codec, JobConf jobConf, Configuration conf, Path parquetPath) throws IOException, ClassNotFoundException, InterruptedException {
  String writeSchema = "message example {\n" +
    "required int32 line;\n" +
    "required binary content;\n" +
    "}";

  FileSystem fileSystem = parquetPath.getFileSystem(conf);
  fileSystem.delete(parquetPath, true);
  jobConf.setInputFormat(TextInputFormat.class);
  TextInputFormat.addInputPath(jobConf, inputPath);
  jobConf.setNumReduceTasks(0);
  jobConf.setOutputFormat(DeprecatedParquetOutputFormat.class);
  DeprecatedParquetOutputFormat.setCompression(jobConf, codec);
  DeprecatedParquetOutputFormat.setOutputPath(jobConf, parquetPath);
  DeprecatedParquetOutputFormat.setWriteSupportClass(jobConf, GroupWriteSupport.class);
  GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(writeSchema), jobConf);

  jobConf.setMapperClass(TestZstandardCodec.DumpMapper.class);
  return JobClient.runJob(jobConf);
}
 
Example #5
Source File: DeprecatedOutputFormatTest.java    From parquet-mr with Apache License 2.0
private void runMapReduceJob(CompressionCodecName codec) throws IOException, ClassNotFoundException, InterruptedException {

    final FileSystem fileSystem = parquetPath.getFileSystem(conf);
    fileSystem.delete(parquetPath, true);
    fileSystem.delete(outputPath, true);
    {
      jobConf.setInputFormat(TextInputFormat.class);
      TextInputFormat.addInputPath(jobConf, inputPath);
      jobConf.setNumReduceTasks(0);

      jobConf.setOutputFormat(DeprecatedParquetOutputFormat.class);
      DeprecatedParquetOutputFormat.setCompression(jobConf, codec);
      DeprecatedParquetOutputFormat.setOutputPath(jobConf, parquetPath);
      DeprecatedParquetOutputFormat.setWriteSupportClass(jobConf, GroupWriteSupport.class);
      GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(writeSchema), jobConf);

      jobConf.setMapperClass(DeprecatedMapper.class);
      mapRedJob = JobClient.runJob(jobConf);
    }
  }
 
Example #6
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0
@Test
public void testOneOfEach() {
  MessageType oneOfEachSchema = MessageTypeParser.parseMessageType(oneOfEach);
  GroupFactory gf = new SimpleGroupFactory(oneOfEachSchema);
  Group g1 = gf.newGroup()
      .append("a", 1l)
      .append("b", 2)
      .append("c", 3.0f)
      .append("d", 4.0d)
      .append("e", true)
      .append("f", Binary.fromString("6"))
      .append("g", new NanoTime(1234, System.currentTimeMillis() * 1000))
      .append("h", Binary.fromString("abc"));

  testSchema(oneOfEachSchema, Arrays.asList(g1));
}
 
Example #7
Source File: TestParquetParser.java    From parquet-mr with Apache License 2.0
@Test
public void testEmbeddedAnnotations() {
  String message = "message EmbeddedMessage {" +
      "  required binary json (JSON);" +
      "  required binary bson (BSON);" +
      "}\n";

  MessageType parsed = MessageTypeParser.parseMessageType(message);
  MessageType expected = Types.buildMessage()
      .required(BINARY).as(JSON).named("json")
      .required(BINARY).as(BSON).named("bson")
      .named("EmbeddedMessage");

  assertEquals(expected, parsed);
  MessageType reparsed = MessageTypeParser.parseMessageType(parsed.toString());
  assertEquals(expected, reparsed);
}
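This test also demonstrates a useful invariant relied on throughout these examples: MessageType.toString() emits the same message syntax that parseMessageType consumes, so a schema survives a print/parse round trip unchanged.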
 
Example #8
Source File: TestMemColumn.java    From parquet-mr with Apache License 2.0
@Test
public void testMemColumn() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message msg { required group foo { required int64 bar; } }");
  ColumnDescriptor path = schema.getColumnDescription(new String[] {"foo", "bar"});
  MemPageStore memPageStore = new MemPageStore(10);
  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
  columnWriter.write(42L, 0, 0);
  memColumnsStore.endRecord();
  memColumnsStore.flush();

  ColumnReader columnReader = getColumnReader(memPageStore, path, schema);
  for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
    assertEquals(columnReader.getCurrentRepetitionLevel(), 0);
    assertEquals(columnReader.getCurrentDefinitionLevel(), 0);
    assertEquals(columnReader.getLong(), 42);
    columnReader.consume();
  }
}
 
Example #9
Source File: TestMemColumn.java    From parquet-mr with Apache License 2.0
@Test
public void testMemColumnBinary() throws Exception {
  MessageType mt = MessageTypeParser.parseMessageType("message msg { required group foo { required binary bar; } }");
  String[] col = new String[]{"foo", "bar"};
  MemPageStore memPageStore = new MemPageStore(10);

  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  ColumnDescriptor path = mt.getColumnDescription(col);

  ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
  columnWriter.write(Binary.fromString("42"), 0, 0);
  memColumnsStore.endRecord();
  memColumnsStore.flush();

  ColumnReader columnReader = getColumnReader(memPageStore, path, mt);
  for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
    assertEquals(columnReader.getCurrentRepetitionLevel(), 0);
    assertEquals(columnReader.getCurrentDefinitionLevel(), 0);
    assertEquals(columnReader.getBinary().toStringUsingUTF8(), "42");
    columnReader.consume();
  }
}
 
Example #10
Source File: TestMemColumn.java    From parquet-mr with Apache License 2.0
@Test
public void testMemColumnSeveralPages() throws Exception {
  MessageType mt = MessageTypeParser.parseMessageType("message msg { required group foo { required int64 bar; } }");
  String[] col = new String[]{"foo", "bar"};
  MemPageStore memPageStore = new MemPageStore(10);
  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  ColumnDescriptor path = mt.getColumnDescription(col);

  ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
  for (int i = 0; i < 2000; i++) {
    columnWriter.write(42L, 0, 0);
    memColumnsStore.endRecord();
  }
  memColumnsStore.flush();

  ColumnReader columnReader = getColumnReader(memPageStore, path, mt);
  for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
    assertEquals(columnReader.getCurrentRepetitionLevel(), 0);
    assertEquals(columnReader.getCurrentDefinitionLevel(), 0);
    assertEquals(columnReader.getLong(), 42);
    columnReader.consume();
  }
}
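In these three TestMemColumn examples, ColumnWriter.write takes (value, repetitionLevel, definitionLevel). Both levels are 0 here because every field on the path foo.bar is required: required fields do not contribute to the maximum definition level, and with no repeated fields on the path the repetition level is always 0.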
 
Example #11
Source File: TestPigSchemaConverter.java    From parquet-mr with Apache License 2.0
private void testFixedConversion(String schemaString, String pigSchemaString)
    throws Exception {
  Schema expectedPigSchema = Utils.getSchemaFromString(pigSchemaString);
  MessageType parquetSchema = MessageTypeParser.parseMessageType(schemaString);
  Schema pigSchema = pigSchemaConverter.convert(parquetSchema);
  assertEquals("converting " + schemaString + " to " + pigSchemaString,
               expectedPigSchema, pigSchema);
}
 
Example #12
Source File: ParquetFileAccessor.java    From pxf with Apache License 2.0
/**
 * Generates a Parquet schema by parsing the given schema file.
 */
private MessageType readSchemaFile(String schemaFile)
        throws IOException {
    LOG.debug("{}-{}: Using parquet schema from given schema file {}", context.getTransactionId(),
            context.getSegmentId(), schemaFile);
    try (InputStream inputStream = fs.open(new Path(schemaFile))) {
        return MessageTypeParser.parseMessageType(IOUtils.toString(inputStream));
    }
}
 
Example #13
Source File: TestInputFormat.java    From parquet-mr with Apache License 2.0
@Before
public void setUp() {
  blocks = new ArrayList<BlockMetaData>();
  for (int i = 0; i < 10; i++) {
    blocks.add(newBlock(i * 10, 10));
  }
  schema = MessageTypeParser.parseMessageType("message doc { required binary foo; }");
  fileMetaData = new FileMetaData(schema, new HashMap<String, String>(), "parquet-mr");
}
 
Example #14
Source File: TestInputFormat.java    From parquet-mr with Apache License 2.0
private void createParquetFile(File file) throws IOException {
  Path path = new Path(file.toURI());
  Configuration configuration = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;}}");
  String[] columnPath = {"a", "b"};
  ColumnDescriptor c1 = schema.getColumnDescription(columnPath);

  byte[] bytes1 = { 0, 1, 2, 3};
  byte[] bytes2 = { 2, 3, 4, 5};
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;

  BinaryStatistics stats = new BinaryStatistics();

  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  w.startBlock(3);
  w.startColumn(c1, 5, codec);
  w.writeDataPage(2, 4, BytesInput.from(bytes1), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(bytes1), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.startBlock(4);
  w.startColumn(c1, 7, codec);
  w.writeDataPage(7, 4, BytesInput.from(bytes2), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  w.endBlock();
  w.end(new HashMap<String, String>());
}
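For reference, the writeDataPage arguments here are, in order: the value count, the uncompressed page size in bytes, the page bytes, the page statistics, and the encodings used for repetition levels, definition levels, and values.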
 
Example #15
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0
@Test
public void testWriteMode() throws Exception {
  File testFile = temp.newFile();
  MessageType schema = MessageTypeParser.parseMessageType(
      "message m { required group a {required binary b;} required group "
      + "c { required int64 d; }}");
  Configuration conf = new Configuration();

  ParquetFileWriter writer = null;
  boolean exceptionThrown = false;
  Path path = new Path(testFile.toURI());
  try {
    writer = new ParquetFileWriter(conf, schema, path,
        ParquetFileWriter.Mode.CREATE);
  } catch(IOException ioe1) {
    exceptionThrown = true;
  }
  assertTrue(exceptionThrown);
  exceptionThrown = false;
  try {
    writer = new ParquetFileWriter(conf, schema, path,
        OVERWRITE);
  } catch(IOException ioe2) {
    exceptionThrown = true;
  }
  assertFalse(exceptionThrown);
  testFile.delete();
}
 
Example #16
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0
@Test
public void testBloomFilterWriteRead() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
  File testFile = temp.newFile();
  testFile.delete();
  Path path = new Path(testFile.toURI());
  Configuration configuration = new Configuration();
  configuration.set("parquet.bloom.filter.column.names", "foo");
  String[] colPath = {"foo"};
  ColumnDescriptor col = schema.getColumnDescription(colPath);
  BinaryStatistics stats1 = new BinaryStatistics();
  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  w.startBlock(3);
  w.startColumn(col, 5, CODEC);
  w.writeDataPage(2, 4, BytesInput.from(BYTES1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  BloomFilter blockSplitBloomFilter = new BlockSplitBloomFilter(0);
  blockSplitBloomFilter.insertHash(blockSplitBloomFilter.hash(Binary.fromString("hello")));
  blockSplitBloomFilter.insertHash(blockSplitBloomFilter.hash(Binary.fromString("world")));
  w.addBloomFilter("foo", blockSplitBloomFilter);
  w.endBlock();
  w.end(new HashMap<>());
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
  ParquetFileReader r = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path,
    Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(schema.getColumnDescription(colPath)));
  BloomFilterReader bloomFilterReader = r.getBloomFilterDataReader(readFooter.getBlocks().get(0));
  BloomFilter bloomFilter = bloomFilterReader.readBloomFilter(readFooter.getBlocks().get(0).getColumns().get(0));
  assertTrue(bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("hello"))));
  assertTrue(bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("world"))));
}
 
Example #17
Source File: TestPigSchemaConverter.java    From parquet-mr with Apache License 2.0
private void testConversion(String pigSchemaString, String schemaString) throws Exception {
  Schema pigSchema = Utils.getSchemaFromString(pigSchemaString);
  MessageType schema = pigSchemaConverter.convert(pigSchema);
  MessageType expectedMT = MessageTypeParser.parseMessageType(schemaString);
  assertEquals("converting "+pigSchemaString+" to "+schemaString, expectedMT, schema);

  MessageType filtered = pigSchemaConverter.filter(schema, pigSchema, null);
  assertEquals("converting "+pigSchemaString+" to "+schemaString+" and filtering", schema.toString(), filtered.toString());
}
 
Example #18
Source File: ProtoSchemaConverterTest.java    From parquet-mr with Apache License 2.0
/**
 * Converts given pbClass to parquet schema and compares it with expected parquet schema.
 */
private void testConversion(Class<? extends Message> pbClass, String parquetSchemaString, boolean parquetSpecsCompliant) throws
        Exception {
  ProtoSchemaConverter protoSchemaConverter = new ProtoSchemaConverter(parquetSpecsCompliant);
  MessageType schema = protoSchemaConverter.convert(pbClass);
  MessageType expectedMT = MessageTypeParser.parseMessageType(parquetSchemaString);
  assertEquals(expectedMT.toString(), schema.toString());
}
 
Example #19
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0
@Test
public void testWriteReadStatisticsAllNulls() throws Exception {
  // this test assumes statistics will be read
  Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY));

  File testFile = temp.newFile();
  testFile.delete();

  writeSchema = "message example {\n" +
          "required binary content (UTF8);\n" +
          "}";

  Path path = new Path(testFile.toURI());

  MessageType schema = MessageTypeParser.parseMessageType(writeSchema);
  Configuration configuration = new Configuration();
  configuration.setBoolean("parquet.strings.signed-min-max.enabled", true);
  GroupWriteSupport.setSchema(schema, configuration);

  ParquetWriter<Group> writer = new ParquetWriter<Group>(path, configuration, new GroupWriteSupport());

  Group r1 = new SimpleGroup(schema);
  writer.write(r1);
  writer.close();

  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);

  // assert the statistics object is not empty
  org.apache.parquet.column.statistics.Statistics stats = readFooter.getBlocks().get(0).getColumns().get(0).getStatistics();
  assertFalse("is empty: " + stats, stats.isEmpty());
  // assert the number of nulls are correct for the first block
  assertEquals("nulls: " + stats, 1, stats.getNumNulls());
}
 
Example #20
Source File: TestMemoryManager.java    From parquet-mr with Apache License 2.0
@Before
public void setUp() throws Exception {
  parquetOutputFormat = new ParquetOutputFormat(new GroupWriteSupport());

  GroupWriteSupport.setSchema(MessageTypeParser.parseMessageType(writeSchema), conf);
  expectedPoolSize = Math.round((double)
      ManagementFactory.getMemoryMXBean().getHeapMemoryUsage().getMax() *
      MemoryManager.DEFAULT_MEMORY_POOL_RATIO);

  long rowGroupSize = expectedPoolSize / 2;
  conf.setLong(ParquetOutputFormat.BLOCK_SIZE, rowGroupSize);

  // the memory manager is not initialized until a writer is created
  createWriter(0).close(null);
}
 
Example #21
Source File: DeprecatedInputFormatTest.java    From parquet-mr with Apache License 2.0
private void runMapReduceJob(CompressionCodecName codec) throws IOException, ClassNotFoundException, InterruptedException {

    final FileSystem fileSystem = parquetPath.getFileSystem(conf);
    fileSystem.delete(parquetPath, true);
    fileSystem.delete(outputPath, true);
    {
      writeJob = new Job(conf, "write");
      TextInputFormat.addInputPath(writeJob, inputPath);
      writeJob.setInputFormatClass(TextInputFormat.class);
      writeJob.setNumReduceTasks(0);
      ExampleOutputFormat.setCompression(writeJob, codec);
      ExampleOutputFormat.setOutputPath(writeJob, parquetPath);
      writeJob.setOutputFormatClass(ExampleOutputFormat.class);
      writeJob.setMapperClass(ReadMapper.class);
      ExampleOutputFormat.setSchema(
              writeJob,
              MessageTypeParser.parseMessageType(
                      writeSchema));
      writeJob.submit();
      waitForJob(writeJob);
    }
    {
      jobConf.set(ReadSupport.PARQUET_READ_SCHEMA, readSchema);
      jobConf.set(ParquetInputFormat.READ_SUPPORT_CLASS, GroupReadSupport.class.getCanonicalName());
      jobConf.setInputFormat(MyDeprecatedInputFormat.class);
      MyDeprecatedInputFormat.setInputPaths(jobConf, parquetPath);
      jobConf.setOutputFormat(org.apache.hadoop.mapred.TextOutputFormat.class);
      org.apache.hadoop.mapred.TextOutputFormat.setOutputPath(jobConf, outputPath);
      jobConf.setMapperClass(DeprecatedWriteMapper.class);
      jobConf.setNumReduceTasks(0);
      mapRedJob = JobClient.runJob(jobConf);
    }
  }
 
Example #22
Source File: TestHiveSchemaConverter.java    From parquet-mr with Apache License 2.0
private void testConversion(final String columnNamesStr, final String columnsTypeStr, final String expectedSchema) throws Exception {
  final List<String> columnNames = createHiveColumnsFrom(columnNamesStr);
  final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(columnsTypeStr);
  final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes);
  final MessageType expectedMT = MessageTypeParser.parseMessageType(expectedSchema);
  assertEquals("converting " + columnNamesStr + ": " + columnsTypeStr + " to " + expectedSchema, expectedMT, messageTypeFound);
}
 
Example #23
Source File: ParquetRecordReaderWrapper.java    From parquet-mr with Apache License 2.0
/**
 * Gets a ParquetInputSplit corresponding to a split given by Hive.
 *
 * @param oldSplit The split given by Hive
 * @param conf The JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
protected ParquetInputSplit getSplit(
    final InputSplit oldSplit,
    final JobConf conf
    ) throws IOException {
  if (oldSplit instanceof FileSplit) {
    FileSplit fileSplit = (FileSplit) oldSplit;
    final long splitStart = fileSplit.getStart();
    final long splitLength = fileSplit.getLength();
    final Path finalPath = fileSplit.getPath();
    final JobConf cloneJob = hiveBinding.pushProjectionsAndFilters(conf, finalPath.getParent());

    final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath, SKIP_ROW_GROUPS);
    final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    final ReadContext readContext =
        new DataWritableReadSupport()
          .init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());

    schemaSize = MessageTypeParser.parseMessageType(
          readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)
        ).getFieldCount();
    return new ParquetInputSplit(
              finalPath,
              splitStart,
              splitStart + splitLength,
              splitLength,
              fileSplit.getLocations(),
              null);
  } else {
    throw new IllegalArgumentException("Unknown split type: " + oldSplit);
  }
}
 
Example #24
Source File: DataWritableReadSupport.java    From parquet-mr with Apache License 2.0
/**
 * Creates the Hive read support used to interpret Parquet data as Hive records.
 *
 * @param configuration unused
 * @param keyValueMetaData string map of metadata
 * @param fileSchema unused
 * @param readContext contains the requested schema and the schema of the Hive table
 * @return a RecordMaterializer for Hive
 */
@Override
public RecordMaterializer<ArrayWritable> prepareForRead(final Configuration configuration,
    final Map<String, String> keyValueMetaData, final MessageType fileSchema,
        final org.apache.parquet.hadoop.api.ReadSupport.ReadContext readContext) {
  final Map<String, String> metadata = readContext.getReadSupportMetadata();
  if (metadata == null) {
    throw new IllegalStateException("ReadContext not initialized properly. " +
      "Don't know the Hive Schema.");
  }
  final MessageType tableSchema = resolveSchemaAccess(MessageTypeParser.
      parseMessageType(metadata.get(HIVE_SCHEMA_KEY)), fileSchema, configuration);

  return new DataWritableRecordConverter(readContext.getRequestedSchema(), tableSchema);
}
 
Example #25
Source File: TestThriftSchemaConverter.java    From parquet-mr with Apache License 2.0
@Test
public void testLogicalTypeConvertion() throws Exception {
  String expected =
    "message ParquetSchema {\n" +
      "  required int32 test_i16 (INTEGER(16,true)) = 1;" +
      "}";
  ThriftSchemaConverter schemaConverter = new ThriftSchemaConverter();
  final MessageType converted = schemaConverter.convert(TestLogicalType.class);
  assertEquals(MessageTypeParser.parseMessageType(expected), converted);
}
 
Example #26
Source File: GroupReadSupportTest.java    From parquet-mr with Apache License 2.0
@Test
public void testInitWithPartialSchema() {
  GroupReadSupport s = new GroupReadSupport();
  Configuration configuration = new Configuration();
  Map<String, String> keyValueMetaData = new HashMap<String, String>();
  MessageType fileSchema = MessageTypeParser.parseMessageType(fullSchemaStr);
  MessageType partialSchema = MessageTypeParser.parseMessageType(partialSchemaStr);
  configuration.set(ReadSupport.PARQUET_READ_SCHEMA, partialSchemaStr);

  ReadSupport.ReadContext context = s.init(configuration, keyValueMetaData, fileSchema);
  assertEquals(context.getRequestedSchema(), partialSchema);
}
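fullSchemaStr and partialSchemaStr are fields of the test class, not shown in this snippet; as the names suggest, the partial schema is a projection containing a subset of the full schema's fields, which is what setting ReadSupport.PARQUET_READ_SCHEMA requests.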
 
Example #27
Source File: HDFSFactorys.java    From sylph with Apache License 2.0
@Override
public HDFSFactory getOrCreate()
{
    requireNonNull(schema, "schema is null");
    requireNonNull(tableName, "tableName is required, e.g. xxx_log");
    requireNonNull(sinkConfig.getWriteDir(), "writeTableDir is required, e.g. hdfs:///tmp/hive/xxx_log");

    String schemaString = buildSchema(schema.getFields());
    MessageType type = MessageTypeParser.parseMessageType(schemaString);
    return new ParquetFactory(sinkConfig.getWriteDir(), tableName, parquetVersion, type);
}
 
Example #28
Source File: TestAvroSchemaConverter.java    From parquet-mr with Apache License 2.0
@Test(expected = IllegalArgumentException.class)
public void testParquetMapWithNonStringKeyFails() throws Exception {
  MessageType parquetSchema = MessageTypeParser.parseMessageType(
      "message myrecord {\n" +
          "  required group mymap (MAP) {\n" +
          "    repeated group map (MAP_KEY_VALUE) {\n" +
          "      required int32 key;\n" +
          "      required int32 value;\n" +
          "    }\n" +
          "  }\n" +
          "}\n"
  );
  new AvroSchemaConverter().convert(parquetSchema);
}
 
Example #29
Source File: TestAvroSchemaConverter.java    From parquet-mr with Apache License 2.0
private void testRoundTripConversion(
    Configuration conf, Schema avroSchema, String schemaString)
    throws Exception {
  AvroSchemaConverter avroSchemaConverter = new AvroSchemaConverter(conf);
  MessageType schema = avroSchemaConverter.convert(avroSchema);
  MessageType expectedMT = MessageTypeParser.parseMessageType(schemaString);
  assertEquals("converting " + schema + " to " + schemaString, expectedMT.toString(),
      schema.toString());
  Schema convertedAvroSchema = avroSchemaConverter.convert(expectedMT);
  assertEquals("converting " + expectedMT + " to " + avroSchema.toString(true),
      avroSchema.toString(), convertedAvroSchema.toString());
}
 
Example #30
Source File: ParquetFilePOJOReaderTest.java    From attic-apex-malhar with Apache License 2.0
private static void writeParquetFile(String rawSchema, File outputParquetFile, List<EventRecord> data)
  throws IOException
{
  Path path = new Path(outputParquetFile.toURI());
  MessageType schema = MessageTypeParser.parseMessageType(rawSchema);
  ParquetPOJOWriter writer = new ParquetPOJOWriter(path, schema, EventRecord.class, true);
  for (EventRecord eventRecord : data) {
    writer.write(eventRecord);
  }
  writer.close();
}