Java Code Examples for org.apache.parquet.example.data.Group#add()

The following examples show how to use org.apache.parquet.example.data.Group#add(). They are extracted from open source projects; the project and source file for each example are noted above its code.
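Before the project examples, here is a minimal sketch of the two add() overloads the examples below rely on: adding a value by zero-based field index and by field name. The two-column schema and the values are invented purely for illustration.

import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

// Hypothetical schema, for illustration only.
MessageType schema = MessageTypeParser.parseMessageType(
    "message example { required int64 id; required binary name (UTF8); }");

Group group = new SimpleGroup(schema);
group.add(0, 42L);          // add by zero-based field index
group.add("name", "alice"); // add by field name

Both overloads append a value to the indexed or named field; repeated fields simply accumulate the values added to them, as Example 12 shows.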
Example 1
Source File: FilteringBenchmarks.java    From parquet-mr with Apache License 2.0
@Setup
public void writeFile() throws IOException {
  WriteConfigurator writeConfigurator = getWriteConfigurator();
  file = new Path(
      Files.createTempFile("benchmark-filtering_" + characteristic + '_' + writeConfigurator + '_', ".parquet")
          .toAbsolutePath().toString());
  long[] data = generateData();
  characteristic.arrangeData(data);
  try (ParquetWriter<Group> writer = writeConfigurator.configureBuilder(ExampleParquetWriter.builder(file)
      .config(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, SCHEMA.toString())
      .withRowGroupSize(Integer.MAX_VALUE) // Ensure there is only one row group per file
      .withWriteMode(OVERWRITE))
      .build()) {
    for (long value : data) {
      Group group = new SimpleGroup(SCHEMA);
      group.add(0, value);
      group.add(1, Binary.fromString(dummyGenerator.nextString()));
      group.add(2, Binary.fromString(dummyGenerator.nextString()));
      group.add(3, Binary.fromString(dummyGenerator.nextString()));
      group.add(4, Binary.fromString(dummyGenerator.nextString()));
      group.add(5, Binary.fromString(dummyGenerator.nextString()));
      writer.write(group);
    }
  }
}
 
Example 2
Source File: TestTupleRecordConsumer.java    From parquet-mr with Apache License 2.0
@Test
public void testArtSchema() throws ExecException, ParserException {

  String pigSchemaString =
          "DocId:long, " +
          "Links:(Backward:{(long)}, Forward:{(long)}), " +
          "Name:{(Language:{(Code:chararray,Country:chararray)}, Url:chararray)}";

  SimpleGroup g = new SimpleGroup(getMessageType(pigSchemaString));
  g.add("DocId", 1l);
  Group links = g.addGroup("Links");
  links.addGroup("Backward").addGroup("bag").add(0, 1l);
  links.addGroup("Forward").addGroup("bag").add(0, 1l);
  Group name = g.addGroup("Name").addGroup("bag");
  name.addGroup("Language").addGroup("bag").append("Code", "en").append("Country", "US");
  name.add("Url", "http://foo/bar");

  testFromGroups(pigSchemaString, Arrays.<Group>asList(g));
}
 
Example 3
Source File: TestConstants.java    From incubator-gobblin with Apache License 2.0
@Override
public Group convertToParquetGroup(TestRecord record) {
  Group group = new SimpleGroup(PARQUET_SCHEMA);
  group.add(PAYLOAD_FIELD_NAME, record.getPayload());
  group.add(SEQUENCE_FIELD_NAME, Long.valueOf(record.getSequence()));
  group.add(PARTITION_FIELD_NAME, record.getPartition());
  return group;
}
 
Example 4
Source File: TestPruneColumnsCommand.java    From parquet-mr with Apache License 2.0
private String createParquetFile(String prefix) throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, INT64, "Backward"),
      new PrimitiveType(REPEATED, INT64, "Forward")));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file)).withConf(conf);
  try (ParquetWriter<Group> writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", 1l);
      g.add("Name", "foo");
      g.add("Gender", "male");
      Group links = g.addGroup("Links");
      links.add(0, 2l);
      links.add(1, 3l);
      writer.write(g);
    }
  }

  return file;
}
 
Example 5
Source File: TestParquetWriterAppendBlocks.java    From parquet-mr with Apache License 2.0
@Before
public void createSourceData() throws IOException {
  this.file1 = newTemp();
  this.file2 = newTemp();

  ParquetWriter<Group> writer1 = ExampleParquetWriter.builder(file1)
      .withType(FILE_SCHEMA)
      .build();
  ParquetWriter<Group> writer2 = ExampleParquetWriter.builder(file2)
      .withType(FILE_SCHEMA)
      .build();

  for (int i = 0; i < FILE_SIZE; i += 1) {
    Group group1 = GROUP_FACTORY.newGroup();
    group1.add("id", i);
    group1.add("string", UUID.randomUUID().toString());
    writer1.write(group1);
    file1content.add(group1);

    Group group2 = GROUP_FACTORY.newGroup();
    group2.add("id", FILE_SIZE+i);
    group2.add("string", UUID.randomUUID().toString());
    writer2.write(group2);
    file2content.add(group2);
  }

  writer1.close();
  writer2.close();
}
 
Example 6
Source File: TestInputOutputFormatWithPadding.java    From parquet-mr with Apache License 2.0
@Override
protected void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  // writes each character of the line with a UUID
  String line = value.toString();
  for (int i = 0; i < line.length(); i += 1) {
    Group group = GROUP_FACTORY.newGroup();
    group.add(0, Binary.fromString(UUID.randomUUID().toString()));
    group.add(1, Binary.fromString(line.substring(i, i+1)));
    context.write(null, group);
  }
}
 
Example 7
Source File: TestInputFormatColumnProjection.java    From parquet-mr with Apache License 2.0
@Override
protected void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  // writes each character of the line with a UUID
  String line = value.toString();
  for (int i = 0; i < line.length(); i += 1) {
    Group group = GROUP_FACTORY.newGroup();
    group.add(0, Binary.fromString(UUID.randomUUID().toString()));
    group.add(1, Binary.fromString(line.substring(i, i+1)));
    context.write(null, group);
  }
}
 
Example 8
Source File: TestMultipleWriteRead.java    From parquet-mr with Apache License 2.0
@Override
public Group get() {
  Group group = factory.newGroup();
  group.add("id", random.nextInt());
  group.add("name", getString(NAME_MIN_SIZE, NAME_MAX_SIZE));
  Group phoneNumbers = group.addGroup("phone_numbers");
  for (int i = 0, n = random.nextInt(PHONE_NUMBERS_MAX_SIZE); i < n; ++i) {
    Group phoneNumber = phoneNumbers.addGroup(0);
    phoneNumber.add(0, random.nextLong() % (MAX_PHONE_NUMBER - MIN_PHONE_NUMBER) + MIN_PHONE_NUMBER);
  }
  if (random.nextDouble() >= COMMENT_NULL_RATIO) {
    group.add("comment", getString(0, COMMENT_MAX_SIZE));
  }
  return group;
}
 
Example 9
Source File: CompressionConveterTest.java    From parquet-mr with Apache License 2.0
private String createParquetFile(Configuration conf, Map<String, String> extraMeta, int numRecord, String prefix, String codec,
                                       ParquetProperties.WriterVersion writerVersion, int pageSize, TestDocs testDocs) throws IOException {
  MessageType schema = new MessageType("schema",
    new PrimitiveType(REQUIRED, INT64, "DocId"),
    new PrimitiveType(REQUIRED, BINARY, "Name"),
    new PrimitiveType(REQUIRED, BINARY, "Gender"),
    new GroupType(OPTIONAL, "Links",
      new PrimitiveType(REPEATED, BINARY, "Backward"),
      new PrimitiveType(REPEATED, BINARY, "Forward")));

  conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());

  String file = createTempFile(prefix);
  ExampleParquetWriter.Builder builder = ExampleParquetWriter.builder(new Path(file))
    .withConf(conf)
    .withWriterVersion(writerVersion)
    .withExtraMetaData(extraMeta)
    .withDictionaryEncoding("DocId", true)
    .withValidation(true)
    .enablePageWriteChecksum()
    .withPageSize(pageSize)
    .withCompressionCodec(CompressionCodecName.valueOf(codec));
  try (ParquetWriter<Group> writer = builder.build()) {
    for (int i = 0; i < numRecord; i++) {
      SimpleGroup g = new SimpleGroup(schema);
      g.add("DocId", testDocs.docId[i]);
      g.add("Name", testDocs.name[i]);
      g.add("Gender", testDocs.gender[i]);
      Group links = g.addGroup("Links");
      links.add(0, testDocs.linkBackward[i]);
      links.add(1, testDocs.linkForward[i]);
      writer.write(g);
    }
  }

  return file;
}
 
Example 10
Source File: TestFiltersWithMissingColumns.java    From parquet-mr with Apache License 2.0
@Before
public void createDataFile() throws Exception {
  File file = temp.newFile("test.parquet");
  this.path = new Path(file.toString());

  MessageType type = Types.buildMessage()
      .required(INT64).named("id")
      .required(BINARY).as(UTF8).named("data")
      .named("test");

  SimpleGroupFactory factory = new SimpleGroupFactory(type);

  ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
      .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
      .withType(type)
      .build();

  try {
    for (long i = 0; i < 1000; i += 1) {
      Group g = factory.newGroup();
      g.add(0, i);
      g.add(1, "data-" + i);
      writer.write(g);
    }
  } finally {
    writer.close();
  }
}
 
Example 11
Source File: ParquetResolver.java    From pxf with Apache License 2.0
private void fillGroup(int index, OneField field, Group group, Type type) throws IOException {
    if (field.val == null)
        return;
    switch (type.asPrimitiveType().getPrimitiveTypeName()) {
        case BINARY:
            if (type.getOriginalType() == OriginalType.UTF8)
                group.add(index, (String) field.val);
            else
                group.add(index, Binary.fromReusedByteArray((byte[]) field.val));
            break;
        case INT32:
            if (type.getOriginalType() == OriginalType.INT_16)
                group.add(index, (Short) field.val);
            else
                group.add(index, (Integer) field.val);
            break;
        case INT64:
            group.add(index, (Long) field.val);
            break;
        case DOUBLE:
            group.add(index, (Double) field.val);
            break;
        case FLOAT:
            group.add(index, (Float) field.val);
            break;
        case FIXED_LEN_BYTE_ARRAY:
            // From org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriter.DecimalDataWriter#decimalToBinary
            String value = (String) field.val;
            int precision = Math.min(HiveDecimal.MAX_PRECISION, type.asPrimitiveType().getDecimalMetadata().getPrecision());
            int scale = Math.min(HiveDecimal.MAX_SCALE, type.asPrimitiveType().getDecimalMetadata().getScale());
            HiveDecimal hiveDecimal = HiveDecimal.enforcePrecisionScale(
                    HiveDecimal.create(value),
                    precision,
                    scale);

            if (hiveDecimal == null) {
                // When precision is higher than HiveDecimal.MAX_PRECISION
                // and enforcePrecisionScale returns null, it means we
                // cannot store the value in Parquet because we have
                // exceeded the precision. To make the behavior consistent
                // with Hive's behavior when storing on a Parquet-backed
                // table, we store the value as null.
                return;
            }

            byte[] decimalBytes = hiveDecimal.bigIntegerBytesScaled(scale);

            // Estimated number of bytes needed.
            int precToBytes = ParquetFileAccessor.PRECISION_TO_BYTE_COUNT[precision - 1];
            if (precToBytes == decimalBytes.length) {
                // No padding needed.
                group.add(index, Binary.fromReusedByteArray(decimalBytes));
            } else {
                byte[] tgt = new byte[precToBytes];
                if (hiveDecimal.signum() == -1) {
                    // For negative numbers, initialize all bits to 1
                    for (int i = 0; i < precToBytes; i++) {
                        tgt[i] |= 0xFF;
                    }
                }
                System.arraycopy(decimalBytes, 0, tgt, precToBytes - decimalBytes.length, decimalBytes.length); // Padding leading zeroes/ones.
                group.add(index, Binary.fromReusedByteArray(tgt));
            }
            // end -- org.apache.hadoop.hive.ql.io.parquet.write.DataWritableWriter.DecimalDataWriter#decimalToBinary
            break;
        case INT96:  // SQL standard timestamp string value with or without time zone literals: https://www.postgresql.org/docs/9.4/datatype-datetime.html
            String timestamp = (String) field.val;
            if (TIMESTAMP_PATTERN.matcher(timestamp).find()) {
                // Note: this conversion of "timestamp with time zone" loses the time zone information
                // while preserving the correct instant (Parquet doesn't support timestamp with time zone).
                group.add(index, ParquetTypeConverter.getBinaryFromTimestampWithTimeZone(timestamp));
            } else {
                group.add(index, ParquetTypeConverter.getBinaryFromTimestamp(timestamp));
            }
            break;
        case BOOLEAN:
            group.add(index, (Boolean) field.val);
            break;
        default:
            throw new IOException("Not supported type " + type.asPrimitiveType().getPrimitiveTypeName());
    }
}
 
Example 12
Source File: ParquetResolverTest.java    From pxf with Apache License 2.0
@Test
public void testGetFields_Primitive_Repeated_Synthetic() {
    // this test does not read an actual Parquet file, but rather constructs a Group object synthetically
    schema = getParquetSchemaForPrimitiveTypes(Type.Repetition.REPEATED, true);
    // schema has changed, set metadata again
    context.setMetadata(schema);
    context.setTupleDescription(getColumnDescriptorsFromSchema(schema));
    resolver.initialize(context);

    /*
    Corresponding DB column types are:
    TEXT, TEXT, INTEGER, DOUBLE PRECISION, NUMERIC, TIMESTAMP, REAL, BIGINT, BOOLEAN, SMALLINT, SMALLINT, VARCHAR(5), CHAR(3), BYTEA
     */

    Group group = new SimpleGroup(schema);

    group.add(0, "row1-1");
    group.add(0, "row1-2");

    // leave column 1 (t2) unset as part of the test

    group.add(2, 1);
    group.add(2, 2);
    group.add(2, 3);

    group.add(3, 6.0d);
    group.add(3, -16.34d);

    BigDecimal value = new BigDecimal("12345678.9012345987654321"); // the position of the decimal point doesn't matter
    byte fillByte = (byte) (value.signum() < 0 ? 0xFF : 0x00);
    byte[] unscaled = value.unscaledValue().toByteArray();
    byte[] bytes = new byte[16];
    int offset = bytes.length - unscaled.length;
    for (int i = 0; i < bytes.length; i += 1) {
        bytes[i] = (i < offset) ? fillByte : unscaled[i - offset];
    }
    group.add(4, Binary.fromReusedByteArray(bytes));

    group.add(5, ParquetTypeConverter.getBinaryFromTimestamp("2019-03-14 14:10:28"));
    group.add(5, ParquetTypeConverter.getBinaryFromTimestamp("1969-12-30 05:42:23.211211"));

    group.add(6, 7.7f);
    group.add(6, -12345.35354646f);

    group.add(7, 23456789L);
    group.add(7, -123456789012345L);

    group.add(8, true);
    group.add(8, false);

    group.add(9, (short) 1);
    group.add(9, (short) -3);

    group.add(10, (short) 269);
    group.add(10, (short) -313);

    group.add(11, Binary.fromString("Hello"));
    group.add(11, Binary.fromString("World"));

    group.add(12, Binary.fromString("foo"));
    group.add(12, Binary.fromString("bar"));

    byte[] byteArray1 = new byte[]{(byte) 49, (byte) 50, (byte) 51};
    group.add(13, Binary.fromReusedByteArray(byteArray1, 0, 3));
    byte[] byteArray2 = new byte[]{(byte) 52, (byte) 53, (byte) 54};
    group.add(13, Binary.fromReusedByteArray(byteArray2, 0, 3));

    group.add(14, ParquetTypeConverter.getBinaryFromTimestampWithTimeZone("2019-03-14 14:10:28+07"));
    OffsetDateTime offsetDateTime1 = OffsetDateTime.parse("2019-03-14T14:10:28+07:00");
    ZonedDateTime localDateTime1 = offsetDateTime1.atZoneSameInstant(ZoneId.systemDefault());
    String localDateTimeString1 = localDateTime1.format(DateTimeFormatter.ofPattern("[yyyy-MM-dd HH:mm:ss]"));

    group.add(15, ParquetTypeConverter.getBinaryFromTimestampWithTimeZone("2019-03-14 14:10:28-07:30"));
    OffsetDateTime offsetDateTime2 = OffsetDateTime.parse("2019-03-14T14:10:28-07:30");
    ZonedDateTime localDateTime2 = offsetDateTime2.atZoneSameInstant(ZoneId.systemDefault());
    String localDateTimeString2 = localDateTime2.format(DateTimeFormatter.ofPattern("[yyyy-MM-dd HH:mm:ss]"));


    List<Group> groups = new ArrayList<>();
    groups.add(group);
    List<OneField> fields = assertRow(groups, 0, 16);

    assertField(fields, 0, "[\"row1-1\",\"row1-2\"]", DataType.TEXT);
    assertField(fields, 1, "[]", DataType.TEXT);
    assertField(fields, 2, "[1,2,3]", DataType.TEXT);
    assertField(fields, 3, "[6.0,-16.34]", DataType.TEXT);
    assertField(fields, 4, "[123456.789012345987654321]", DataType.TEXT); // scale fixed to 18 in schema
    assertField(fields, 5, "[\"2019-03-14 14:10:28\",\"1969-12-30 05:42:23.211211\"]", DataType.TEXT);
    assertField(fields, 6, "[7.7,-12345.354]", DataType.TEXT); // rounded to the precision of 8
    assertField(fields, 7, "[23456789,-123456789012345]", DataType.TEXT);
    assertField(fields, 8, "[true,false]", DataType.TEXT);
    assertField(fields, 9, "[1,-3]", DataType.TEXT);
    assertField(fields, 10, "[269,-313]", DataType.TEXT);
    assertField(fields, 11, "[\"Hello\",\"World\"]", DataType.TEXT);
    assertField(fields, 12, "[\"foo\",\"bar\"]", DataType.TEXT); // 3 chars only
    Base64.Encoder encoder = Base64.getEncoder(); // byte arrays are Base64 encoded into strings
    String expectedByteArrays = "[\"" + encoder.encodeToString(byteArray1) + "\",\"" + encoder.encodeToString(byteArray2) + "\"]";
    assertField(fields, 13, expectedByteArrays, DataType.TEXT);
    assertField(fields, 14, "[\"" + localDateTimeString1 + "\"]", DataType.TEXT);
    assertField(fields, 15, "[\"" + localDateTimeString2 + "\"]", DataType.TEXT);
}
 
Example 13
Source File: ApacheParquet.java    From sylph with Apache License 2.0 4 votes vote down vote up
private void addValueToGroup(Class<?> dataType, Group group, int index, Object value)
{
    if (value == null || "".equals(value)) {
        return;
    }
    if (dataType == Binary.class) {
        group.add(index, value.toString());
    }
    else if (dataType == byte.class) {
        group.add(index, Byte.valueOf(value.toString()));
    }
    else if (dataType == short.class) {
        group.add(index, Short.valueOf(value.toString()));
    }
    else if (dataType == int.class) {
        group.add(index, Integer.valueOf(value.toString()));
    }
    else if (dataType == long.class) {
        group.add(index, Long.parseLong(value.toString()));
    }
    else if (dataType == double.class) {
        group.add(index, Double.valueOf(value.toString()));
    }
    else if (dataType == float.class) {
        group.add(index, Float.valueOf(value.toString()));
    }
    else if (dataType == Map.class) {
        int mapFieldSize = 0;
        //List<MessageType> mapSchemaList = mapEntrySchema.get(index);
        Group mapFieldGroup = new SimpleGroup(mapTopSchema);
        for (Map.Entry<String, Object> mapFieldEntry : ((Map<String, Object>) value)
                .entrySet()) {
            Group mapEntryKeyValueGroup = new SimpleGroup(kvSchema);
            final String key = mapFieldEntry.getKey();
            final Object vValue = mapFieldEntry.getValue();
            if (vValue != null) {
                mapEntryKeyValueGroup.add("key", key);
                mapFieldSize += key.length();
                mapEntryKeyValueGroup.add("value", vValue.toString());
                mapFieldSize += vValue.toString().length();
                mapFieldGroup.add("key_value", mapEntryKeyValueGroup);
            }
        }
        group.add(index, mapFieldGroup);
    }
    else {
        group.add(index, value.toString());
    }
}