parquet.example.data.Group Java Examples

The following examples show how to use parquet.example.data.Group. They are drawn from open-source projects; the originating project, source file, and license are noted above each example.
Example #1
Source File: ParquetGroup.java    From incubator-gobblin with Apache License 2.0
public String toString(String indent) {
  StringBuilder result = new StringBuilder();
  int i = 0;
  for (Type field : this.schema.getFields()) {
    String name = field.getName();
    List<Object> values = this.data[i];
    for (Object value : values) {
      result.append(indent).append(name);
      if (value == null) {
        result.append(": NULL\n");
      } else if (value instanceof Group) {
        result.append("\n").append(((ParquetGroup) value).toString(indent + "  "));
      } else {
        result.append(": ").append(value.toString()).append("\n");
      }
    }
    i++;
  }
  return result.toString();
}
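
A minimal usage sketch, assuming group is an already-populated ParquetGroup (a hypothetical variable); the indent argument starts empty and grows by two spaces per nesting level.

// Dump every field of the group; nested groups recurse with extra indentation.
System.out.println(group.toString(""));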
 
Example #2
Source File: ParquetHdfsDataWriterTest.java    From incubator-gobblin with Apache License 2.0
protected List<TestRecord> readParquetFilesGroup(File outputFile)
    throws IOException {
  ParquetReader<Group> reader = null;
  List<Group> records = new ArrayList<>();
  try {
    reader = new ParquetReader<>(new Path(outputFile.toString()), new SimpleReadSupport());
    for (Group value = reader.read(); value != null; value = reader.read()) {
      records.add(value);
    }
  } finally {
    if (reader != null) {
      try {
        reader.close();
      } catch (Exception ex) {
        System.out.println(ex.getMessage());
      }
    }
  }
  return records.stream().map(value -> new TestRecord(
      value.getInteger(TestConstants.PARTITION_FIELD_NAME, 0),
      value.getInteger(TestConstants.SEQUENCE_FIELD_NAME, 0),
      value.getString(TestConstants.PAYLOAD_FIELD_NAME, 0)
  )).collect(Collectors.toList());
}
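
Since ParquetReader implements Closeable, the manual try/finally above can be tightened with try-with-resources; a minimal sketch of the same read loop under that assumption:

List<Group> records = new ArrayList<>();
try (ParquetReader<Group> reader =
    new ParquetReader<>(new Path(outputFile.toString()), new SimpleReadSupport())) {
  // read() returns null once the file is exhausted.
  for (Group value = reader.read(); value != null; value = reader.read()) {
    records.add(value);
  }
}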
 
Example #3
Source File: ParquetGroup.java    From incubator-gobblin with Apache License 2.0
/**
 * Add any object of {@link Primitive} or {@link Group} type with a String key.
 * @param key name of the field to add the value under
 * @param object the {@link Primitive} or {@link Group} value to add
 */
public void add(String key, Object object) {
  int fieldIndex = getIndex(key);
  if (object.getClass() == ParquetGroup.class) {
    this.addGroup(key, (Group) object);
  } else {
    this.add(fieldIndex, (Primitive) object);
  }
}
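
A sketch of calling this method with the primitive wrappers from parquet.example.data.simple; the two-field schema here is hypothetical, and the schema-taking ParquetGroup constructor is the one Example #8 relies on.

ParquetGroup group = new ParquetGroup(
    MessageTypeParser.parseMessageType(
        "message doc { required binary name (UTF8); required int32 count; }"));
// BinaryValue and IntegerValue extend Primitive, so both calls take the else branch above.
group.add("name", new BinaryValue(Binary.fromString("example")));
group.add("count", new IntegerValue(1));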
 
Example #4
Source File: JsonIntermediateToParquetGroupConverterTest.java    From incubator-gobblin with Apache License 2.0
private void testCase(String testCaseName)
    throws SchemaConversionException, DataConversionException {
  JsonObject test = testCases.get(testCaseName).getAsJsonObject();
  parquetConverter = new JsonIntermediateToParquetGroupConverter();

  MessageType schema = parquetConverter.convertSchema(test.get("schema").getAsJsonArray(), workUnit);
  Group record =
      parquetConverter.convertRecord(schema, test.get("record").getAsJsonObject(), workUnit).iterator().next();
  assertEqualsIgnoreSpaces(schema.toString(), test.get("expectedSchema").getAsString());
  assertEqualsIgnoreSpaces(record.toString(), test.get("expectedRecord").getAsString());
}
 
Example #5
Source File: TestConstants.java    From incubator-gobblin with Apache License 2.0
@Override
public Group convertToParquetGroup(TestRecord record) {
  Group group = new SimpleGroup(PARQUET_SCHEMA);
  group.add(PAYLOAD_FIELD_NAME, record.getPayload());
  group.add(SEQUENCE_FIELD_NAME, Long.valueOf(record.getSequence()).intValue());
  group.add(PARTITION_FIELD_NAME, record.getPartition());
  return group;
}
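
The three add calls imply a three-field schema. A sketch of what the PARQUET_SCHEMA constant plausibly looks like, assuming the field-name constants resolve to "payload", "sequence", and "partition":

// Hypothetical reconstruction; the actual constant in TestConstants may differ.
public static final MessageType PARQUET_SCHEMA = MessageTypeParser.parseMessageType(
    "message test_record {"
        + " required binary payload (UTF8);"
        + " required int32 sequence;"
        + " required int32 partition;"
        + " }");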
 
Example #6
Source File: ExampleParquetMapReduce.java    From hiped2 with Apache License 2.0
@Override
public void map(Void key,
                Group value,
                Context context) throws IOException, InterruptedException {
  // Field "symbol" is read by name; getValueToString(2, 0) renders the field at
  // index 2 (first repetition) as a String before it is parsed as a double.
  context.write(new Text(value.getString("symbol", 0)),
      new DoubleWritable(Double.valueOf(value.getValueToString(2, 0))));
}
 
Example #7
Source File: ExampleParquetMapReduce.java    From hiped2 with Apache License 2.0
@Override
protected void reduce(Text key, Iterable<DoubleWritable> values, Context context) throws IOException, InterruptedException {
  Mean mean = new Mean();
  for (DoubleWritable val : values) {
    mean.increment(val.get());
  }
  Group group = factory.newGroup()
      .append("symbol", key.toString())
      .append("avg", mean.getResult());
  context.write(null, group);
}
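
A sketch of a driver wiring these map and reduce steps together via the parquet.hadoop.example bindings; the mapper/reducer class names and the output schema string are hypothetical, and the reducer's factory field is presumably a SimpleGroupFactory built from that same schema in setup().

Job job = Job.getInstance(new Configuration(), "stock averages");
job.setJarByClass(ExampleParquetMapReduce.class);
job.setInputFormatClass(ExampleInputFormat.class);    // hands Group values to the mapper
job.setMapperClass(StockMapper.class);                // hypothetical name for the map above
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(DoubleWritable.class);
job.setReducerClass(StockAvgReducer.class);           // hypothetical name for the reduce above
job.setOutputFormatClass(ExampleOutputFormat.class);  // writes Group values back out as Parquet
GroupWriteSupport.setSchema(
    MessageTypeParser.parseMessageType(
        "message avg { required binary symbol (UTF8); required double avg; }"),
    job.getConfiguration());
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.waitForCompletion(true);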
 
Example #8
Source File: ParquetGroup.java    From incubator-gobblin with Apache License 2.0
@Override
public Group addGroup(int fieldIndex) {
  ParquetGroup g = new ParquetGroup(this.schema.getType(fieldIndex).asGroupType());
  this.data[fieldIndex].add(g);
  return g;
}
 
Example #9
Source File: ParquetGroup.java    From incubator-gobblin with Apache License 2.0
public Group getGroup(int fieldIndex, int index) {
  return (Group) this.getValue(fieldIndex, index);
}
 
Example #10
Source File: ParquetGroup.java    From incubator-gobblin with Apache License 2.0
/**
 * Add a {@link Group} given a String key.
 * @param key name of the group field to add the value under
 * @param object the {@link Group} value to add
 */
private void addGroup(String key, Group object) {
  int fieldIndex = getIndex(key);
  // Validates that the field at this index is a group type (asGroupType throws otherwise).
  this.schema.getType(fieldIndex).asGroupType();
  this.data[fieldIndex].add(object);
}
 
Example #11
Source File: JsonIntermediateToParquetGroupConverter.java    From incubator-gobblin with Apache License 2.0
@Override
public Iterable<Group> convertRecord(MessageType outputSchema, JsonObject inputRecord, WorkUnitState workUnit)
    throws DataConversionException {
  return new SingleRecordIterable<>((Group) recordConverter.convert(inputRecord));
}
 
Example #12
Source File: ParquetDataWriterBuilder.java    From incubator-gobblin with Apache License 2.0
/**
 * Build a version-specific {@link ParquetWriter} for the given {@link ParquetWriterConfiguration}.
 * @param writerConfiguration configuration carrying the codec, writer version, sizes, and record format
 * @return a {@link ParquetWriterShim} wrapping the version-specific writer
 * @throws IOException if the underlying writer cannot be created
 */
@Override
public ParquetWriterShim getVersionSpecificWriter(ParquetWriterConfiguration writerConfiguration)
    throws IOException {

  CompressionCodecName codecName = CompressionCodecName.fromConf(writerConfiguration.getCodecName());
  ParquetProperties.WriterVersion writerVersion = ParquetProperties.WriterVersion
      .fromString(writerConfiguration.getWriterVersion());

  Configuration conf = new Configuration();
  ParquetWriter versionSpecificWriter = null;
  switch (writerConfiguration.getRecordFormat()) {
    case GROUP: {
      GroupWriteSupport.setSchema((MessageType) this.schema, conf);
      WriteSupport support = new GroupWriteSupport();
      versionSpecificWriter = new ParquetWriter<Group>(
          writerConfiguration.getAbsoluteStagingFile(),
          support,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.getDictPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate(),
          writerVersion,
          conf);
      break;
    }
    case AVRO:  {
      versionSpecificWriter = new AvroParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Schema) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          conf);
      break;
    }
    case PROTOBUF: {
      versionSpecificWriter = new ProtoParquetWriter(
          writerConfiguration.getAbsoluteStagingFile(),
          (Class<? extends Message>) this.schema,
          codecName,
          writerConfiguration.getBlockSize(),
          writerConfiguration.getPageSize(),
          writerConfiguration.isDictionaryEnabled(),
          writerConfiguration.isValidate());
      break;
    }
    default: throw new RuntimeException("Record format not supported");
  }
  ParquetWriter finalVersionSpecificWriter = versionSpecificWriter;

  return new ParquetWriterShim() {
    @Override
    public void write(Object record)
        throws IOException {
      finalVersionSpecificWriter.write(record);
    }

    @Override
    public void close()
        throws IOException {
      finalVersionSpecificWriter.close();
    }
  };
}
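
A sketch of driving the returned shim, where builder and group are hypothetical stand-ins for the enclosing ParquetDataWriterBuilder and a record matching its schema:

ParquetWriterShim shim = builder.getVersionSpecificWriter(writerConfiguration);
try {
  shim.write(group);
} finally {
  shim.close();  // flushes and finalizes the underlying ParquetWriter
}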
 
Example #13
Source File: ParquetHdfsDataWriterTest.java    From incubator-gobblin with Apache License 2.0
@Override
public RecordMaterializer<Group> prepareForRead(Configuration conf, Map<String, String> metaData,
    MessageType schema, ReadContext context) {
  return new GroupRecordConverter(schema);
}
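
This is the hook behind the SimpleReadSupport that Example #2 instantiates. A sketch of the enclosing class, assuming it follows the standard ReadSupport contract and requests the full file schema:

class SimpleReadSupport extends ReadSupport<Group> {
  @Override
  public ReadContext init(InitContext context) {
    // Request every column; a real implementation could prune the projection here.
    return new ReadContext(context.getFileSchema());
  }

  @Override
  public RecordMaterializer<Group> prepareForRead(Configuration conf, Map<String, String> metaData,
      MessageType schema, ReadContext context) {
    return new GroupRecordConverter(schema);
  }
}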
 
Example #14
Source File: TestReadParquet.java    From parquet-examples with Apache License 2.0
@Override
public void map(LongWritable key, Group value, Context context) throws IOException, InterruptedException {
  NullWritable outKey = NullWritable.get();
  if (expectedFields == null) {
    // Get the file schema (which may differ from the fields in a particular record) from the input split
    String fileSchema = ((ParquetInputSplit) context.getInputSplit()).getFileSchema();
    RecordSchema schema = new RecordSchema(fileSchema);
    expectedFields = schema.getFields();
  }

  // No public accessor to the column values in a Group, so extract them from the string representation
  String line = value.toString();
  String[] fields = line.split("\n");

  StringBuilder csv = new StringBuilder();
  boolean hasContent = false;
  int i = 0;
  // Look for each expected column
  Iterator<FieldDescription> it = expectedFields.iterator();
  while (it.hasNext()) {
    if (hasContent) {
      csv.append(',');
    }
    String name = it.next().name;
    if (fields.length > i) {
      String[] parts = fields[i].split(": ");
      // We assume proper order, but there may be fields missing
      if (parts[0].equals(name)) {
        boolean mustQuote = (parts[1].contains(",") || parts[1].contains("'"));
        if (mustQuote) {
          csv.append('"');
        }
        csv.append(parts[1]);
        if (mustQuote) {
          csv.append('"');
        }
        hasContent = true;
        i++;
      }
    }
  }
  context.write(outKey, new Text(csv.toString()));
}
 
Example #15
Source File: TestReadWriteParquet.java    From parquet-examples with Apache License 2.0
@Override
public void map(LongWritable key, Group value, Context context) throws IOException, InterruptedException {
    context.write(null, value);
}