org.apache.parquet.example.data.Group Java Examples

The following examples show how to use org.apache.parquet.example.data.Group. They are drawn from open-source projects; the originating project, source file, and license are noted above each example.
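Before the project examples, here is a minimal, self-contained sketch of the Group API itself (the schema and field names are invented for illustration):

import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class GroupBasics {
  public static void main(String[] args) {
    // Parse a two-column schema and build an in-memory record against it.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required int32 id; required binary name (UTF8); }");
    SimpleGroupFactory factory = new SimpleGroupFactory(schema);
    Group group = factory.newGroup()
        .append("id", 1)
        .append("name", "alice");
    // Fields are read back by name plus a repetition index (0 for the first value).
    System.out.println(group.getInteger("id", 0));  // 1
    System.out.println(group.getString("name", 0)); // alice
  }
}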
Example #1
Source File: TestColumnIndexes.java    From parquet-mr with Apache License 2.0
public Path write(Path directory) throws IOException {
  Path file = new Path(directory, "testColumnIndexes_" + this + ".parquet");
  Random random = new Random(seed);
  int recordCount = random.nextInt(MAX_TOTAL_ROWS) + 1;
  List<Supplier<?>> generators = buildGenerators(recordCount, random);
  Configuration conf = new Configuration();
  ParquetOutputFormat.setColumnIndexTruncateLength(conf, columnIndexTruncateLength);
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
      .withType(SCHEMA)
      .withPageRowCountLimit(pageRowCountLimit)
      .withConf(conf)
      .build()) {
    for (int i = 0; i < recordCount; i++) {
      writer.write(createGroup(generators, random));
    }
  }
  return file;
}
 
Example #2
Source File: ParquetResolver.java    From pxf with Apache License 2.0
/**
 * Constructs and sets the fields of a {@link OneRow}.
 *
 * @param record list of {@link OneField}
 * @return the constructed {@link OneRow}
 * @throws IOException if constructing a row from the fields failed
 */
@Override
public OneRow setFields(List<OneField> record) throws IOException {
    validateSchema();
    Group group = groupFactory.newGroup();
    for (int i = 0; i < record.size(); i++) {
        OneField field = record.get(i);
        ColumnDescriptor columnDescriptor = context.getTupleDescription().get(i);

        /*
         * We need to right trim the incoming value from Greenplum. This is
         * consistent with the behaviour in Hive, where char fields are right
 * trimmed during write. Note that Hive's string and varchar types are
 * not right trimmed, and Hive does not trim tabs or newlines.
         */
        if (columnDescriptor.getDataType() == DataType.BPCHAR && field.val instanceof String) {
            field.val = Utilities.rightTrimWhiteSpace((String) field.val);
        }
        fillGroup(i, field, group, schema.getType(i));
    }
    return new OneRow(null, group);
}
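Utilities.rightTrimWhiteSpace is a pxf helper; per the comment above it must drop trailing spaces while preserving tabs and newlines. A minimal sketch of that behaviour (an assumption, not the pxf implementation):

// Sketch only: removes trailing space characters, leaving tabs and newlines intact.
static String rightTrimSpaces(String s) {
  int end = s.length();
  while (end > 0 && s.charAt(end - 1) == ' ') {
    end--;
  }
  return s.substring(0, end);
}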
 
Example #3
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0
private void validateColumns(String inputFile, List<String> prunePaths) throws IOException {
  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), new Path(inputFile)).withConf(conf).build();
  for (int i = 0; i < numRecord; i++) {
    Group group = reader.read();
    if (!prunePaths.contains("DocId")) {
      assertEquals(1L, group.getLong("DocId", 0));
    }
    if (!prunePaths.contains("Name")) {
      assertEquals("foo", group.getBinary("Name", 0).toStringUsingUTF8());
    }
    if (!prunePaths.contains("Gender")) {
      assertEquals("male", group.getBinary("Gender", 0).toStringUsingUTF8());
    }
    if (!prunePaths.contains("Links")) {
      Group subGroup = group.getGroup("Links", 0);
      if (!prunePaths.contains("Links.Backward")) {
        assertEquals(2L, subGroup.getLong("Backward", 0));
      }
      if (!prunePaths.contains("Links.Forward")) {
        assertEquals(3L, subGroup.getLong("Forward", 0));
      }
    }
  }
  reader.close();
}
 
Example #4
Source File: ParquetResolverTest.java    From pxf with Apache License 2.0
@SuppressWarnings("deprecation")
private List<Group> readParquetFile(String file, long expectedSize, MessageType schema) throws IOException {
    List<Group> result = new ArrayList<>();
    String parquetFile = Objects.requireNonNull(getClass().getClassLoader().getResource("parquet/" + file)).getPath();
    Path path = new Path(parquetFile);

    ParquetFileReader fileReader = new ParquetFileReader(new Configuration(), path, ParquetMetadataConverter.NO_FILTER);
    PageReadStore rowGroup;
    while ((rowGroup = fileReader.readNextRowGroup()) != null) {
        MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
        RecordReader<Group> recordReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));
        long rowCount = rowGroup.getRowCount();
        for (long i = 0; i < rowCount; i++) {
            result.add(recordReader.read());
        }
    }
    fileReader.close();
    assertEquals(expectedSize, result.size());
    return result;
}
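The ParquetFileReader constructor used here is deprecated (hence the @SuppressWarnings). On recent parquet-mr versions the same reader can be obtained through a factory method; a hedged sketch:

// Requires org.apache.parquet.hadoop.util.HadoopInputFile.
ParquetFileReader fileReader = ParquetFileReader.open(
    HadoopInputFile.fromPath(path, new Configuration()));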
 
Example #5
Source File: SqlInterpreterTest.java    From zeppelin with Apache License 2.0
public File createParquetFile(int[] values,
                              ParquetProperties.WriterVersion version) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".par");
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType(
          "message test { "
                  + "required int32 int32_field; "
                  + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  ParquetWriter<Group> writer = new ParquetWriter<Group>(
          path,
          new GroupWriteSupport(),
          CompressionCodecName.UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf);
  for (int i = 0; i < values.length; i++) {
    writer.write(f.newGroup()
            .append("int32_field", values[i]));
  }
  writer.close();
  return file;
}
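The ten-argument ParquetWriter constructor is deprecated in current parquet-mr. The same file could be produced with the example builder; the mapping of the positional arguments below is our reading of the signature, so treat it as a sketch:

ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
    .withConf(conf)
    .withType(schema)                // replaces GroupWriteSupport.setSchema(schema, conf)
    .withCompressionCodec(CompressionCodecName.UNCOMPRESSED)
    .withRowGroupSize(1024)          // block size
    .withPageSize(1024)
    .withDictionaryPageSize(512)
    .withDictionaryEncoding(true)
    .withValidation(false)
    .withWriterVersion(version)
    .build();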
 
Example #6
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0
public static void writeAndTest(WriteContext context) throws IOException {
  // Create the configuration, and then apply the schema to our configuration.
  Configuration configuration = new Configuration();
  GroupWriteSupport.setSchema(context.schema, configuration);
  GroupWriteSupport groupWriteSupport = new GroupWriteSupport();

  // Create the writer properties
  final int blockSize = context.blockSize;
  final int pageSize = context.pageSize;
  final int dictionaryPageSize = pageSize;
  final boolean enableDictionary = context.enableDictionary;
  final boolean enableValidation = context.enableValidation;
  ParquetProperties.WriterVersion writerVersion = context.version;
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;

  ParquetWriter<Group> writer = new ParquetWriter<Group>(context.fsPath,
      groupWriteSupport, codec, blockSize, pageSize, dictionaryPageSize,
      enableDictionary, enableValidation, writerVersion, configuration);

  context.write(writer);
  writer.close();

  context.test();

  context.path.delete();
}
 
Example #7
Source File: ParquetHdfsDataWriterTest.java    From incubator-gobblin with Apache License 2.0
protected List<TestRecord> readParquetFilesGroup(File outputFile)
    throws IOException {
  ParquetReader<Group> reader = null;
  List<Group> records = new ArrayList<>();
  try {
    reader = new ParquetReader<>(new Path(outputFile.toString()), new SimpleReadSupport());
    for (Group value = reader.read(); value != null; value = reader.read()) {
      records.add(value);
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
        System.out.println(ex.getMessage());
      }
    }
  }
  return records.stream().map(value -> new TestRecord(
      value.getInteger(TestConstants.PARTITION_FIELD_NAME, 0),
      value.getLong(TestConstants.SEQUENCE_FIELD_NAME, 0),
      value.getString(TestConstants.PAYLOAD_FIELD_NAME, 0)
  )).collect(Collectors.toList());
}
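Since ParquetReader implements Closeable, the try/finally ceremony above can be collapsed with try-with-resources; an equivalent sketch of the read loop:

List<Group> records = new ArrayList<>();
try (ParquetReader<Group> reader =
    new ParquetReader<>(new Path(outputFile.toString()), new SimpleReadSupport())) {
  for (Group value = reader.read(); value != null; value = reader.read()) {
    records.add(value);
  }
}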
 
Example #8
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0
@Setup
public void writeFile() throws IOException {
  WriteConfigurator writeConfigurator = getWriteConfigurator();
  file = new Path(
      Files.createTempFile("benchmark-filtering_" + characteristic + '_' + writeConfigurator + '_', ".parquet")
          .toAbsolutePath().toString());
  long[] data = generateData();
  characteristic.arrangeData(data);
  try (ParquetWriter<Group> writer = writeConfigurator.configureBuilder(ExampleParquetWriter.builder(file)
      .config(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, SCHEMA.toString())
      .withRowGroupSize(Integer.MAX_VALUE) // Ensure the file has only one row group
      .withWriteMode(OVERWRITE))
      .build()) {
    for (long value : data) {
      Group group = new SimpleGroup(SCHEMA);
      group.add(0, value);
      group.add(1, Binary.fromString(dummyGenerator.nextString()));
      group.add(2, Binary.fromString(dummyGenerator.nextString()));
      group.add(3, Binary.fromString(dummyGenerator.nextString()));
      group.add(4, Binary.fromString(dummyGenerator.nextString()));
      group.add(5, Binary.fromString(dummyGenerator.nextString()));
      writer.write(group);
    }
  }
}
 
Example #9
Source File: PageChecksumReadBenchmarks.java    From parquet-mr with Apache License 2.0
private void readFile(Path file, int nRows, boolean verifyChecksums, Blackhole blackhole)
  throws IOException {
  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file)
      .withConf(configuration)
      .usePageChecksumVerification(verifyChecksums)
      .build()) {
    for (int i = 0; i < nRows; i++) {
      Group group = reader.read();
      blackhole.consume(group.getLong("long_field", 0));
      blackhole.consume(group.getBinary("binary_field", 0));
      Group subgroup = group.getGroup("group", 0);
      blackhole.consume(subgroup.getInteger("int_field", 0));
      blackhole.consume(subgroup.getInteger("int_field", 1));
      blackhole.consume(subgroup.getInteger("int_field", 2));
      blackhole.consume(subgroup.getInteger("int_field", 3));
    }
  }
}
 
Example #10
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0
private void validateGroups(List<Group> groups1, Object[][] e1) {
  Iterator<Group> i1 = groups1.iterator();
  for (int i = 0; i < e1.length; i++) {
    Object[] objects = e1[i];
    Group next = i1.next();
    for (int j = 0; j < objects.length; j++) {
      Object object = objects[j];
      if (object == null) {
        assertEquals(0, next.getFieldRepetitionCount(j));
      } else {
        assertEquals("looking for r[" + i + "][" + j + "][0]=" + object, 1, next.getFieldRepetitionCount(j));
        assertEquals(object, next.getInteger(j, 0));
      }
    }
  }
}
 
Example #11
Source File: TestThriftToParquetFileWriter.java    From parquet-mr with Apache License 2.0
@Test
public void testWriteFile() throws IOException, InterruptedException, TException {
  final AddressBook a = new AddressBook(
      Arrays.asList(
          new Person(
              new Name("Bob", "Roberts"),
              0,
              "[email protected]",
              Arrays.asList(new PhoneNumber("1234567890")))));

  final Path fileToCreate = createFile(a);

  ParquetReader<Group> reader = createRecordReader(fileToCreate);

  Group g = null;
  int i = 0;
  while ((g = reader.read()) != null) {
    assertEquals(a.persons.size(), g.getFieldRepetitionCount("persons"));
    assertEquals(a.persons.get(0).email, g.getGroup("persons", 0).getGroup(0, 0).getString("email", 0));
    // just some sanity check, we're testing the various layers somewhere else
    ++i;
  }
  assertEquals("read 1 record", 1, i);

}
 
Example #12
Source File: TestColumnIO.java    From parquet-mr with Apache License 2.0
@Test
public void testOneOfEach() {
  MessageType oneOfEachSchema = MessageTypeParser.parseMessageType(oneOfEach);
  GroupFactory gf = new SimpleGroupFactory(oneOfEachSchema);
  Group g1 = gf.newGroup()
      .append("a", 1l)
      .append("b", 2)
      .append("c", 3.0f)
      .append("d", 4.0d)
      .append("e", true)
      .append("f", Binary.fromString("6"))
      .append("g", new NanoTime(1234, System.currentTimeMillis() * 1000))
      .append("h", Binary.fromString("abc"));

  testSchema(oneOfEachSchema, Arrays.asList(g1));
}
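The oneOfEach schema string is defined elsewhere in TestColumnIO; judging by the appended values it is presumably along these lines (a hedged reconstruction, not the verbatim source):

private static final String oneOfEach =
    "message Document {\n"
    + "  required int64 a;\n"
    + "  required int32 b;\n"
    + "  required float c;\n"
    + "  required double d;\n"
    + "  required boolean e;\n"
    + "  required binary f;\n"
    + "  required int96 g;\n"                   // NanoTime is stored as int96
    + "  required fixed_len_byte_array(3) h;\n" // wide enough for "abc"
    + "}\n";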
 
Example #13
Source File: DictionaryFilterTest.java    From parquet-mr with Apache License 2.0
private static void prepareFile(WriterVersion version, Path file) throws IOException {
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);
  ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
      .withWriterVersion(version)
      .withCompressionCodec(GZIP)
      .withRowGroupSize(1024 * 1024)
      .withPageSize(1024)
      .enableDictionaryEncoding()
      .withDictionaryPageSize(2 * 1024)
      .withConf(conf)
      .build();
  writeData(f, writer);
}
 
Example #14
Source File: TestColumnIndexes.java    From parquet-mr with Apache License 2.0
private Group createGroup(List<Supplier<?>> generators, Random random) {
  Group group = FACTORY.newGroup();
  for (int column = 0, columnCnt = SCHEMA.getFieldCount(); column < columnCnt; ++column) {
    Type type = SCHEMA.getType(column);
    Supplier<?> generator = generators.get(column);
    // 2% chance of null value for an optional column
    if (generator == null || (type.isRepetition(OPTIONAL) && random.nextInt(50) == 0)) {
      continue;
    }
    switch (type.asPrimitiveType().getPrimitiveTypeName()) {
    case BINARY:
    case FIXED_LEN_BYTE_ARRAY:
    case INT96:
      group.append(type.getName(), (Binary) generator.get());
      break;
    case INT32:
      group.append(type.getName(), (Integer) generator.get());
      break;
    case INT64:
      group.append(type.getName(), (Long) generator.get());
      break;
    case FLOAT:
      group.append(type.getName(), (Float) generator.get());
      break;
    case DOUBLE:
      group.append(type.getName(), (Double) generator.get());
      break;
    case BOOLEAN:
      group.append(type.getName(), (Boolean) generator.get());
      break;
    }
  }
  return group;
}
 
Example #15
Source File: TestFiltered.java    From parquet-mr with Apache License 2.0
@Test
public void testFilterOnString() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // First try matching against the A url in record 1
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://A"))));

  readOne(recordReader, "r2 filtered out", r1);

  // Second try matching against the B url in record 1 - it should fail as we only match
  // against the first instance of a url
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://B"))));

  List<Group> all = readAll(recordReader);
  assertEquals("There should be no matching records: " + all , 0, all.size());

  // Finally try matching against the C url in record 2
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://C"))));

  readOne(recordReader, "r1 filtered out", r2);

}
 
Example #16
Source File: TestRecordLevelFilters.java    From parquet-mr with Apache License 2.0
@Test
public void testAllFilter() throws Exception {
  BinaryColumn name = binaryColumn("name");

  FilterPredicate pred = eq(name, Binary.fromString("no matches"));

  List<Group> found = PhoneBookWriter.readFile(phonebookFile, FilterCompat.get(pred));
  assertEquals(new ArrayList<Group>(), found);
}
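PhoneBookWriter.readFile is a helper private to the parquet-mr tests. With the public API, the same filtered read could be sketched as follows (assuming phonebookFile is a Path and GroupReadSupport is imported):

List<Group> found = new ArrayList<>();
try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), phonebookFile)
    .withFilter(FilterCompat.get(pred))
    .build()) {
  for (Group g = reader.read(); g != null; g = reader.read()) {
    found.add(g);
  }
}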
 
Example #17
Source File: TestFiltered.java    From parquet-mr with Apache License 2.0
@Test
public void testApplyFunctionFilterOnString() {
  MessageColumnIO columnIO = new ColumnIOFactory(true).getColumnIO(schema);
  MemPageStore memPageStore = writeTestRecords(columnIO, 1);

  // First try matching against the A url in record 1
  RecordMaterializer<Group> recordConverter = new GroupRecordConverter(schema);
  RecordReaderImplementation<Group> recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", applyFunctionToString(new StringEndsWithAPredicate()))));

  readOne(recordReader, "r2 filtered out", r1);

  // Second try matching against the B url in record 1 - it should fail as we only match
  // against the first instance of a url
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://B"))));

  List<Group> all = readAll(recordReader);
  assertEquals("There should be no matching records: " + all , 0, all.size());

  // Finally try matching against the C url in record 2
  recordReader = (RecordReaderImplementation<Group>)
      columnIO.getRecordReader(memPageStore, recordConverter,
          FilterCompat.get(column("Name.Url", equalTo("http://C"))));

  readOne(recordReader, "r1 filtered out", r2);

}
 
Example #18
Source File: TestMultipleWriteRead.java    From parquet-mr with Apache License 2.0
private void validateFile(Path file, List<Group> data) throws IOException {
  try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).build()) {
    for (Group group : data) {
      assertEquals(group.toString(), reader.read().toString());
    }
  }
}
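Note that this only proves the file begins with the expected records. A slightly stricter sketch would also assert that the reader is exhausted afterwards:

try (ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), file).build()) {
  for (Group group : data) {
    assertEquals(group.toString(), reader.read().toString());
  }
  // No extra records should remain (requires org.junit.Assert.assertNull).
  assertNull(reader.read());
}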
 
Example #19
Source File: CompressionConveterTest.java    From parquet-mr with Apache License 2.0
private void validateColumns(String file, int numRecord, TestDocs testDocs) throws IOException {
  ParquetReader<Group> reader = ParquetReader.builder(new GroupReadSupport(), new Path(file)).withConf(conf).build();
  for (int i = 0; i < numRecord; i++) {
    Group group = reader.read();
    assertEquals(testDocs.docId[i], group.getLong("DocId", 0));
    assertArrayEquals(testDocs.name[i].getBytes(), group.getBinary("Name", 0).getBytes());
    assertArrayEquals(testDocs.gender[i].getBytes(), group.getBinary("Gender", 0).getBytes());
    Group subGroup = group.getGroup("Links", 0);
    assertArrayEquals(testDocs.linkBackward[i].getBytes(), subGroup.getBinary("Backward", 0).getBytes());
    assertArrayEquals(testDocs.linkForward[i].getBytes(), subGroup.getBinary("Forward", 0).getBytes());
  }
  reader.close();
}
 
Example #20
Source File: TestParquetWriterAppendBlocks.java    From parquet-mr with Apache License 2.0
@Test
public void testBasicBehavior() throws IOException {
  Path combinedFile = newTemp();
  ParquetFileWriter writer = new ParquetFileWriter(
      CONF, FILE_SCHEMA, combinedFile);
  writer.start();
  writer.appendFile(CONF, file1);
  writer.appendFile(CONF, file2);
  writer.end(EMPTY_METADATA);

  LinkedList<Group> expected = new LinkedList<Group>();
  expected.addAll(file1content);
  expected.addAll(file2content);

  ParquetReader<Group> reader = ParquetReader
      .builder(new GroupReadSupport(), combinedFile)
      .build();

  Group next;
  while ((next = reader.read()) != null) {
    Group expectedNext = expected.removeFirst();
    // check each value; equals is not supported for simple records
    Assert.assertEquals("Each id should match",
        expectedNext.getInteger("id", 0), next.getInteger("id", 0));
    Assert.assertEquals("Each string should match",
        expectedNext.getString("string", 0), next.getString("string", 0));
  }

  Assert.assertEquals("All records should be present", 0, expected.size());
}
 
Example #21
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0
private void benchmark(Blackhole blackhole, BaseContext context) throws Exception {
  FilterPredicate filter = FilterApi.eq(BaseContext.COLUMN, context.getRandom().nextLong());
  try (ParquetReader<Group> reader = context.createReaderBuilder()
      .withFilter(FilterCompat.get(filter))
      .build()) {
    blackhole.consume(reader.read());
  }
}
 
Example #22
Source File: ParquetResolverTest.java    From pxf with Apache License 2.0
@Test
public void testGetFields_Primitive_RepeatedString() throws IOException {
    List<Type> columns = new ArrayList<>();
    columns.add(new PrimitiveType(Type.Repetition.REPEATED, PrimitiveTypeName.BINARY, "myString", OriginalType.UTF8));
    schema = new MessageType("TestProtobuf.StringArray", columns);
    context.setMetadata(schema);
    context.setTupleDescription(getColumnDescriptorsFromSchema(schema));
    resolver.initialize(context);

    List<Group> groups = readParquetFile("proto-repeated-string.parquet", 3, schema);
    List<OneField> fields;

    // row 0
    fields = assertRow(groups, 0, 1);
    assertEquals(DataType.TEXT.getOID(), fields.get(0).type);
    assertEquals("[\"hello\",\"world\"]", fields.get(0).val);

    // row 1
    fields = assertRow(groups, 1, 1);
    assertEquals(DataType.TEXT.getOID(), fields.get(0).type);
    assertEquals("[\"good\",\"bye\"]", fields.get(0).val);

    // row 2
    fields = assertRow(groups, 2, 1);
    assertEquals(DataType.TEXT.getOID(), fields.get(0).type);
    assertEquals("[\"one\",\"two\",\"three\"]", fields.get(0).val);

}
 
Example #23
Source File: TestTupleRecordConsumer.java    From parquet-mr with Apache License 2.0
@Test
public void testMaps() throws ExecException, ParserException {
  String pigSchemaString = "a: [(b: chararray)]";
  SimpleGroup g = new SimpleGroup(getMessageType(pigSchemaString));
  Group map = g.addGroup("a");
  map.addGroup("map").append("key", "foo").addGroup("value").append("b", "foo");
  map.addGroup("map").append("key", "bar").addGroup("value").append("b", "bar");

  testFromGroups(pigSchemaString, Arrays.<Group>asList(g));
}
 
Example #24
Source File: PageChecksumDataGenerator.java    From parquet-mr with Apache License 2.0
public void generateData(Path outFile, int nRows, boolean writeChecksums,
                         CompressionCodecName compression) throws IOException {
  if (exists(configuration, outFile)) {
    System.out.println("File already exists " + outFile);
    return;
  }

  ParquetWriter<Group> writer = ExampleParquetWriter.builder(outFile)
    .withConf(configuration)
    .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
    .withCompressionCodec(compression)
    .withDictionaryEncoding(true)
    .withType(SCHEMA)
    .withPageWriteChecksumEnabled(writeChecksums)
    .build();

  GroupFactory groupFactory = new SimpleGroupFactory(SCHEMA);
  Random rand = new Random(42);
  for (int i = 0; i < nRows; i++) {
    Group group = groupFactory.newGroup();
    group
      .append("long_field", (long) i)
      .append("binary_field", randomUUID().toString())
      .addGroup("group")
      // Keep cardinality low (values mod 100) so dictionary encoding is used
      .append("int_field", rand.nextInt() % 100)
      .append("int_field", rand.nextInt() % 100)
      .append("int_field", rand.nextInt() % 100)
      .append("int_field", rand.nextInt() % 100);
    writer.write(group);
  }

  writer.close();
}
 
Example #25
Source File: TestSimpleRecordConverter.java    From parquet-mr with Apache License 2.0
private void createTestParquetFile() throws IOException {
  Path fsPath = new Path(testFile().getPath());
  Configuration conf = new Configuration();

  MessageType schema = createSchema();
  SimpleGroupFactory fact = new SimpleGroupFactory(schema);
  GroupWriteSupport.setSchema(schema, conf);

  try (
    ParquetWriter<Group> writer = new ParquetWriter<>(
      fsPath,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      conf)) {
    writer.write(fact.newGroup()
     .append(INT32_FIELD, 32)
     .append(INT64_FIELD, 64L)
     .append(FLOAT_FIELD, 1.0f)
     .append(DOUBLE_FIELD, 2.0d)
     .append(BINARY_FIELD, Binary.fromString("foobar"))
     .append(FIXED_LEN_BYTE_ARRAY_FIELD,
       Binary.fromConstantByteArray(new byte[]{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 })));
  }
}
 
Example #26
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0
private String createParquetFile(String prefix) throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, INT64, "Backward"),
      new PrimitiveType(REPEATED, INT64, "Forward")));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  try (ParquetWriter<Group> writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", 1L);
      g.add("Name", "foo");
      g.add("Gender", "male");
      Group links = g.addGroup("Links");
      links.add(0, 2L);
      links.add(1, 3L);
      writer.write(g);
    }
  }

  return file;
}
 
Example #27
Source File: ParquetFileAccessor.java    From pxf with Apache License 2.0
/**
 * Reads the next record.
 *
 * @return one record, or null when the split is exhausted
 * @throws IOException if unable to read
 */
@Override
public OneRow readNextObject() throws IOException {
    final long then = System.nanoTime();
    Group group = fileReader.read();
    final long nanos = System.nanoTime() - then;
    totalReadTimeInNanos += nanos;

    if (group != null) {
        rowsRead++;
        return new OneRow(null, group);
    }
    return null;
}
 
Example #28
Source File: TestZstandardCodec.java    From parquet-mr with Apache License 2.0
@Override
public void map(LongWritable key, Text value, OutputCollector<Void, Group> outputCollector, Reporter reporter) throws IOException {
  Group group = factory.newGroup()
    .append("line", (int) key.get())
    .append("content", value.toString());
  outputCollector.collect(null, group);
}
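The factory field is initialized elsewhere in the test (typically in configure()). A hedged guess at its setup, with the schema string inferred from the two appends above:

// Assumption: field names mirror the appends; the actual schema string in
// TestZstandardCodec may differ.
private final SimpleGroupFactory factory = new SimpleGroupFactory(
    MessageTypeParser.parseMessageType(
        "message line { required int32 line; required binary content; }"));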
 
Example #29
Source File: TestConstants.java    From incubator-gobblin with Apache License 2.0
@Override
public Group convertToParquetGroup(TestRecord record) {
  Group group = new SimpleGroup(PARQUET_SCHEMA);
  group.add(PAYLOAD_FIELD_NAME, record.getPayload());
  group.add(SEQUENCE_FIELD_NAME, Long.valueOf(record.getSequence()));
  group.add(PARTITION_FIELD_NAME, record.getPartition());
  return group;
}
 
Example #30
Source File: ParquetFileTest.java    From parquet-mr with Apache License 2.0
private void createTestParquetFile() throws IOException {
  File file = parquetFile();
  Path fsPath = new Path(file.getPath());
  Configuration conf = new Configuration();

  MessageType schema = createSchema();
  SimpleGroupFactory fact = new SimpleGroupFactory(schema);
  GroupWriteSupport.setSchema(schema, conf);

  try (
    ParquetWriter<Group> writer = new ParquetWriter<>(
      fsPath,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      conf)) {
    for (int i = 0; i < 10; i++) {
      final byte[] bytes = new byte[12];
      ThreadLocalRandom.current().nextBytes(bytes);

      writer.write(fact.newGroup()
       .append(INT32_FIELD, 32 + i)
       .append(INT64_FIELD, 64L + i)
       .append(FLOAT_FIELD, 1.0f + i)
       .append(DOUBLE_FIELD, 2.0d + i)
       .append(BINARY_FIELD, Binary.fromString(COLORS[i % COLORS.length]))
       .append(FIXED_LEN_BYTE_ARRAY_FIELD,
         Binary.fromConstantByteArray(bytes)));
    }
  }
}