Java Code Examples for org.apache.avro.file.DataFileWriter#create()

The following examples show how to use org.apache.avro.file.DataFileWriter#create(). Each example is drawn from an open-source project; the source file, project, and license are noted above it.
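Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: wrap a DatumWriter in a DataFileWriter, call create() with the schema and an output target, append records, and close. The schema, record values, and file name below are illustrative, not taken from any of the projects.

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class MinimalDataFileWriterExample {
  public static void main(String[] args) throws IOException {
    // Illustrative record schema with two fields.
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
            + "{\"name\":\"name\",\"type\":\"string\"},"
            + "{\"name\":\"age\",\"type\":\"int\"}]}");

    GenericRecord user = new GenericData.Record(schema);
    user.put("name", "Alice");
    user.put("age", 42);

    // create() writes the container-file header (schema, codec, sync marker)
    // and must be called exactly once, before any append().
    try (DataFileWriter<GenericRecord> writer =
             new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(schema))) {
      writer.create(schema, new File("users.avro"));
      writer.append(user);
    } // close() flushes the final block and completes the file
  }
}

As the examples below show, create() also accepts an OutputStream, which is the usual choice for in-memory buffers and Hadoop FileSystem streams.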
Example 1
Source File: AvroIO.java    From beam with Apache License 2.0
@Override
public void open(WritableByteChannel channel) throws IOException {
  this.schema = new Schema.Parser().parse(getJsonSchema());
  DataFileWriter<?> writer;
  if (getRecordFormatter() == null) {
    writer = reflectWriter = new DataFileWriter<>(new ReflectDatumWriter<>(schema));
  } else {
    writer = genericWriter = new DataFileWriter<>(new GenericDatumWriter<>(schema));
  }
  writer.setCodec(getCodec().getCodec());
  for (Map.Entry<String, Object> entry : getMetadata().entrySet()) {
    Object v = entry.getValue();
    if (v instanceof String) {
      writer.setMeta(entry.getKey(), (String) v);
    } else if (v instanceof Long) {
      writer.setMeta(entry.getKey(), (Long) v);
    } else if (v instanceof byte[]) {
      writer.setMeta(entry.getKey(), (byte[]) v);
    } else {
      throw new IllegalStateException(
          "Metadata value type must be one of String, Long, or byte[]. Found "
              + v.getClass().getSimpleName());
    }
  }
  writer.create(schema, Channels.newOutputStream(channel));
}
 
Example 2
Source File: SmallFilesWrite.java    From hiped2 with Apache License 2.0
public static void writeToAvro(File srcPath,
        OutputStream outputStream)
        throws IOException {
  DataFileWriter<Object> writer =
          new DataFileWriter<Object>(
              new GenericDatumWriter<Object>())
              .setSyncInterval(100);               // emit a sync marker roughly every 100 bytes
  writer.setCodec(CodecFactory.snappyCodec());     // compress data blocks with Snappy
  writer.create(SCHEMA, outputStream);             // write the container header before appending
  for (Object obj : FileUtils.listFiles(srcPath, null, false)) {
    File file = (File) obj;
    String filename = file.getAbsolutePath();
    byte[] content = FileUtils.readFileToByteArray(file);
    GenericRecord record = new GenericData.Record(SCHEMA);  // one record per input file
    record.put(FIELD_FILENAME, filename);
    record.put(FIELD_CONTENTS, ByteBuffer.wrap(content));
    writer.append(record);
    System.out.println(
            file.getAbsolutePath()
            + ": "
            + DigestUtils.md5Hex(content));
  }

  IOUtils.cleanup(null, writer);
  IOUtils.cleanup(null, outputStream);
}
 
Example 3
Source File: AvroFileGenerator.java    From flink-perf with Apache License 2.0
public static void main(String[] args) throws Exception {
	// generate only the Avro file
	if (args.length == 2) {
		ordersPath = args[0];
		outputOrderAvroPath = args[1];
		// Generate file for avro test
		DatumWriter<Order> orderDatumWriter = new SpecificDatumWriter<Order>(Order.class);
		DataFileWriter<Order> dataFileWriter = new DataFileWriter<Order>(orderDatumWriter);
		dataFileWriter.create(Order.getClassSchema(), new File(outputOrderAvroPath));
		Scanner s = new Scanner(new File(ordersPath));
		while (s.hasNextLine()) {
			@SuppressWarnings("resource")
			Scanner lineScanner = new Scanner(s.nextLine()).useDelimiter("\\|");

			Order o = new Order();
			o.setOOrderkey(lineScanner.nextInt());
			o.setOCustkey(lineScanner.nextInt());
			o.setOOrderstatus(lineScanner.next());
			o.setOTotalprice(lineScanner.nextFloat());
			o.setOOrderdate(lineScanner.next());
			o.setOOrderpriority(lineScanner.next());
			o.setOClerk(lineScanner.next());
			o.setOShipproprity(lineScanner.nextInt());
			o.setOComment(lineScanner.next());
			dataFileWriter.append(o);
			lineScanner.close();
		}
		dataFileWriter.flush();
		s.close();
		dataFileWriter.close();
		return;
	} else {
		System.err.println("Usage: <inputFilePath> <outputAvroPath>");
		System.exit(1);
	}
}
 
Example 4
Source File: TestAvroExport.java    From aliyun-maxcompute-data-collectors with Apache License 2.0
/**
 * Create a data file that gets exported to the db.
 * @param fileNum the number of the file (for multi-file export)
 * @param numRecords how many records to write to the file.
 */
protected void createAvroFile(int fileNum, int numRecords,
    ColumnGenerator... extraCols) throws IOException {

  Path tablePath = getTablePath();
  Path filePath = new Path(tablePath, "part" + fileNum);

  Configuration conf = new Configuration();
  if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
    conf.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
  }
  FileSystem fs = FileSystem.get(conf);
  fs.mkdirs(tablePath);
  OutputStream os = fs.create(filePath);

  Schema schema = buildAvroSchema(extraCols);
  DatumWriter<GenericRecord> datumWriter =
    new GenericDatumWriter<GenericRecord>();
  DataFileWriter<GenericRecord> dataFileWriter =
    new DataFileWriter<GenericRecord>(datumWriter);
  dataFileWriter.create(schema, os);

  for (int i = 0; i < numRecords; i++) {
    GenericRecord record = new GenericData.Record(schema);
    record.put("id", i);
    record.put("msg", getMsgPrefix() + i);
    addExtraColumns(record, i, extraCols);
    dataFileWriter.append(record);
  }

  dataFileWriter.close();
  os.close();
}
 
Example 5
Source File: TimelineMetadataUtils.java    From hudi with Apache License 2.0
public static <T extends SpecificRecordBase> Option<byte[]> serializeAvroMetadata(T metadata, Class<T> clazz)
    throws IOException {
  DatumWriter<T> datumWriter = new SpecificDatumWriter<>(clazz);
  DataFileWriter<T> fileWriter = new DataFileWriter<>(datumWriter);
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  fileWriter.create(metadata.getSchema(), baos);
  fileWriter.append(metadata);
  fileWriter.flush();
  return Option.of(baos.toByteArray());
}
 
Example 6
Source File: TestExtractAvroMetadata.java    From nifi with Apache License 2.0
@Test
public void testExtractionWithNonRecordSchema() throws IOException {
    final TestRunner runner = TestRunners.newTestRunner(new ExtractAvroMetadata());
    runner.setProperty(ExtractAvroMetadata.COUNT_ITEMS, "true");

    final Schema schema = new Schema.Parser().parse(new File("src/test/resources/array.avsc"));

    final GenericData.Array<String> data = new GenericData.Array<>(schema, Arrays.asList("one", "two", "three"));
    final DatumWriter<GenericData.Array<String>> datumWriter = new GenericDatumWriter<>(schema);

    final ByteArrayOutputStream out = new ByteArrayOutputStream();
    final DataFileWriter<GenericData.Array<String>> dataFileWriter = new DataFileWriter<>(datumWriter);
    dataFileWriter.create(schema, out);
    dataFileWriter.append(data);
    dataFileWriter.append(data);
    dataFileWriter.close();

    runner.enqueue(out.toByteArray());
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractAvroMetadata.REL_SUCCESS, 1);

    final MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExtractAvroMetadata.REL_SUCCESS).get(0);
    flowFile.assertAttributeExists(ExtractAvroMetadata.SCHEMA_FINGERPRINT_ATTR);
    flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_TYPE_ATTR, Schema.Type.ARRAY.getName());
    flowFile.assertAttributeEquals(ExtractAvroMetadata.SCHEMA_NAME_ATTR, "array");
    flowFile.assertAttributeEquals(ExtractAvroMetadata.ITEM_COUNT_ATTR, "2"); // number of arrays, not elements
}
 
Example 7
Source File: AvroTestTools.java    From incubator-gobblin with Apache License 2.0
private void writeAsAvroBinary(Iterator<GenericRecord> input, Schema schema, FileSystem fs,
    Path outputPath) throws IOException {

  DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<>());

  writer.create(schema, fs.create(outputPath, true));
  while (input.hasNext()) {
    writer.append(input.next());
  }
  writer.close();

  log.info("Successfully wrote avro file to path " + outputPath);
}
 
Example 8
Source File: TestMergeContent.java    From localization_nifi with Apache License 2.0
private ByteArrayOutputStream serializeAvroRecord(Schema schema, GenericRecord user2, DatumWriter<GenericRecord> datumWriter) throws IOException {
    ByteArrayOutputStream out2 = new ByteArrayOutputStream();
    DataFileWriter<GenericRecord> dataFileWriter2 = new DataFileWriter<GenericRecord>(datumWriter);
    dataFileWriter2.create(schema, out2);
    dataFileWriter2.append(user2);
    dataFileWriter2.close();
    return out2;
}
 
Example 9
Source File: PigAvroOutputFormat.java    From Cubert with Apache License 2.0
@Override
public RecordWriter<NullWritable, Object> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {

    if (schema == null)
        throw new IOException("Must provide a schema");

    Configuration conf = context.getConfiguration();

    DataFileWriter<Object> writer = new DataFileWriter<Object>(new PigAvroDatumWriter(schema));

    if (FileOutputFormat.getCompressOutput(context)) {
        int level = conf.getInt(DEFLATE_LEVEL_KEY, DEFAULT_DEFLATE_LEVEL);
        String codecName = conf.get(OUTPUT_CODEC, DEFLATE_CODEC);
        CodecFactory factory = codecName.equals(DEFLATE_CODEC)
            ? CodecFactory.deflateCodec(level)
            : CodecFactory.fromString(codecName);
        writer.setCodec(factory);
    }

    // Take the max, since core-default.xml sets io.file.buffer.size to only 4K
    writer.setSyncInterval(conf.getInt(SYNC_INTERVAL_KEY, Math.max(
            conf.getInt("io.file.buffer.size", DEFAULT_SYNC_INTERVAL), DEFAULT_SYNC_INTERVAL)));

    Path path = getDefaultWorkFile(context, EXT);
    writer.create(schema, path.getFileSystem(conf).create(path));
    return new PigAvroRecordWriter(writer);
}
 
Example 10
Source File: JsonToAvroConverter.java    From celos with Apache License 2.0
@Override
public FixFile convert(TestRun tr, FixFile ff) throws Exception {
    Schema schema = new Schema.Parser().parse(schemaCreator.create(tr).getContent());
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    InputStream input = ff.getContent();
    DataFileWriter<Object> writer;
    try {
        DatumReader<Object> reader = new GenericDatumReader<>(schema);
        DataInputStream din = new DataInputStream(input);
        writer = new DataFileWriter<>(new GenericDatumWriter<>());
        writer.create(schema, baos);
        Decoder decoder = DecoderFactory.get().jsonDecoder(schema, din);
        Object datum;
        while (true) {
            try {
                datum = reader.read(null, decoder);
            } catch (EOFException eofe) {
                break;
            }
            writer.append(datum);
        }
        writer.flush();
    } finally {
        input.close();
    }
    return new FixFile(new ByteArrayInputStream(baos.toByteArray()));
}
 
Example 11
Source File: TestConvertAvroToORC.java    From nifi with Apache License 2.0
@Test
public void test_onTrigger_routing_to_failure_fixed_type() throws Exception {
    String testString = "Hello!";
    GenericData.Record record = TestNiFiOrcUtils.buildAvroRecordWithFixed(testString);

    DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
    DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    fileWriter.create(record.getSchema(), out);
    fileWriter.append(record);
    fileWriter.flush();
    fileWriter.close();
    out.close();

    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test.avro");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();

    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_FAILURE, 1);
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_FAILURE).get(0);
    assertEquals("test.avro", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));

    final InputStream in = new ByteArrayInputStream(resultFlowFile.toByteArray());
    final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) {
        assertTrue(dataFileReader.hasNext());
        GenericRecord testedRecord = dataFileReader.next();

        assertNotNull(testedRecord.get("fixed"));
        assertArrayEquals(testString.getBytes(StandardCharsets.UTF_8), ((GenericData.Fixed) testedRecord.get("fixed")).bytes());
    }
}
 
Example 12
Source File: TestConvertAvroToJSON.java    From nifi with Apache License 2.0
private ByteArrayOutputStream serializeAvroRecord(final Schema schema, final DatumWriter<GenericRecord> datumWriter, final GenericRecord... users) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);
    dataFileWriter.create(schema, out);
    for (final GenericRecord user : users) {
        dataFileWriter.append(user);
    }

    dataFileWriter.close();
    return out;
}
 
Example 13
Source File: TestConvertAvroToORC.java    From nifi with Apache License 2.0
@Test
public void test_onTrigger_primitive_record() throws Exception {
    GenericData.Record record = TestNiFiOrcUtils.buildPrimitiveAvroRecord(10, 20L, true, 30.0f, 40, StandardCharsets.UTF_8.encode("Hello"), "World");

    DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
    DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    fileWriter.create(record.getSchema(), out);
    fileWriter.append(record);
    // Put another record in
    record = TestNiFiOrcUtils.buildPrimitiveAvroRecord(1, 2L, false, 3.0f, 4L, StandardCharsets.UTF_8.encode("I am"), "another record");
    fileWriter.append(record);
    // And one more
    record = TestNiFiOrcUtils.buildPrimitiveAvroRecord(100, 200L, true, 300.0f, 400L, StandardCharsets.UTF_8.encode("Me"), "too!");
    fileWriter.append(record);
    fileWriter.flush();
    fileWriter.close();
    out.close();
    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test.avro");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();

    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);

    // Write the flow file out to disk, since the ORC Reader needs a path
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS test_record (int INT, long BIGINT, boolean BOOLEAN, float FLOAT, double DOUBLE, bytes BINARY, string STRING)"
            + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
    assertEquals("3", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream("target/test1.orc");
    fos.write(resultContents);
    fos.flush();
    fos.close();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    Object o = rows.next(null);
    assertNotNull(o);
    assertTrue(o instanceof OrcStruct);
    TypeInfo resultSchema = TestNiFiOrcUtils.buildPrimitiveOrcSchema();
    StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);

    // Check some fields in the first row
    Object intFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("int"));
    assertTrue(intFieldObject instanceof IntWritable);
    assertEquals(10, ((IntWritable) intFieldObject).get());
    Object stringFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("string"));
    assertTrue(stringFieldObject instanceof Text);
    assertEquals("World", stringFieldObject.toString());

}
 
Example 14
Source File: AvroFileInputOperatorTest.java    From attic-apex-malhar with Apache License 2.0
private void writeAvroFile(File outputFile) throws IOException
{

  DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<GenericRecord>(
      new Schema.Parser().parse(AVRO_SCHEMA));

  DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);
  dataFileWriter.create(new Schema.Parser().parse(AVRO_SCHEMA), outputFile);

  for (GenericRecord record : recordList) {
    dataFileWriter.append(record);
  }

  dataFileWriter.close();

  FileUtils.moveFileToDirectory(new File(outputFile.getAbsolutePath()), new File(testMeta.dir), true);

}
 
Example 15
Source File: TestConvertAvroToORC.java    From nifi with Apache License 2.0
@Test
public void test_onTrigger_nested_complex_record() throws Exception {

    Map<String, List<Double>> mapData1 = new TreeMap<String, List<Double>>() {{
        put("key1", Arrays.asList(1.0, 2.0));
        put("key2", Arrays.asList(3.0, 4.0));
    }};

    Map<String, String> arrayMap11 = new TreeMap<String, String>() {{
        put("key1", "v1");
        put("key2", "v2");
    }};
    Map<String, String> arrayMap12 = new TreeMap<String, String>() {{
        put("key3", "v3");
        put("key4", "v4");
    }};

    GenericData.Record record = TestNiFiOrcUtils.buildNestedComplexAvroRecord(mapData1, Arrays.asList(arrayMap11, arrayMap12));

    DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
    DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    fileWriter.create(record.getSchema(), out);
    fileWriter.append(record);

    // Put another record in
    Map<String, List<Double>> mapData2 = new TreeMap<String, List<Double>>() {{
        put("key1", Arrays.asList(-1.0, -2.0));
        put("key2", Arrays.asList(-3.0, -4.0));
    }};

    Map<String, String> arrayMap21 = new TreeMap<String, String>() {{
        put("key1", "v-1");
        put("key2", "v-2");
    }};
    Map<String, String> arrayMap22 = new TreeMap<String, String>() {{
        put("key3", "v-3");
        put("key4", "v-4");
    }};

    record = TestNiFiOrcUtils.buildNestedComplexAvroRecord(mapData2, Arrays.asList(arrayMap21, arrayMap22));
    fileWriter.append(record);

    fileWriter.flush();
    fileWriter.close();
    out.close();

    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();

    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);

    // Write the flow file out to disk, since the ORC Reader needs a path
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS nested_complex_record " +
            "(myMapOfArray MAP<STRING, ARRAY<DOUBLE>>, myArrayOfMap ARRAY<MAP<STRING, STRING>>)"
            + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
    assertEquals("2", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream("target/test1.orc");
    fos.write(resultContents);
    fos.flush();
    fos.close();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    Object o = rows.next(null);
    assertNotNull(o);
    assertTrue(o instanceof OrcStruct);
    TypeInfo resultSchema = TestNiFiOrcUtils.buildNestedComplexOrcSchema();
    StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);


    // check values
    Object myMapOfArray = inspector.getStructFieldData(o, inspector.getStructFieldRef("myMapOfArray"));
    assertTrue(myMapOfArray instanceof Map);
    Map map = (Map) myMapOfArray;
    Object mapValue = map.get(new Text("key1"));
    assertNotNull(mapValue);
    assertTrue(mapValue instanceof List);
    assertEquals(Arrays.asList(new DoubleWritable(1.0), new DoubleWritable(2.0)), mapValue);

    Object myArrayOfMap = inspector.getStructFieldData(o, inspector.getStructFieldRef("myArrayOfMap"));
    assertTrue(myArrayOfMap instanceof List);
    List list = (List) myArrayOfMap;
    Object el0 = list.get(0);
    assertNotNull(el0);
    assertTrue(el0 instanceof Map);
    assertEquals(new Text("v1"), ((Map) el0).get(new Text("key1")));
}
 
Example 16
Source File: TestConvertAvroToParquet.java    From nifi with Apache License 2.0
@Before
public void setUp() throws Exception {
    processor = new ConvertAvroToParquet();
    runner = TestRunners.newTestRunner(processor);

    Schema schema = new Schema.Parser().parse(Resources.getResource("avro/all-minus-enum.avsc").openStream());

    DataFileWriter<Object> awriter = new DataFileWriter<Object>(new GenericDatumWriter<Object>());
    GenericData.Record nestedRecord = new GenericRecordBuilder(
            schema.getField("mynestedrecord").schema())
            .set("mynestedint", 1).build();

    GenericData.Record record = new GenericRecordBuilder(schema)
            .set("mynull", null)
            .set("myboolean", true)
            .set("myint", 1)
            .set("mylong", 2L)
            .set("myfloat", 3.1f)
            .set("mydouble", 4.1)
            .set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
            .set("mystring", "hello")
            .set("mynestedrecord", nestedRecord)
            .set("myarray", new GenericData.Array<Integer>(Schema.createArray(Schema.create(Schema.Type.INT)), Arrays.asList(1, 2)))
            .set("mymap", ImmutableMap.of("a", 1, "b", 2))
            .set("myfixed", new GenericData.Fixed(Schema.createFixed("ignored", null, null, 1), new byte[] { (byte) 65 }))
            .build();

    awriter.create(schema, tmpAvro);
    awriter.append(record);
    awriter.flush();
    awriter.close();

    DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);
    DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(tmpAvro, datumReader);
    GenericRecord record1 = null;
    while (dataFileReader.hasNext()) {
        record1 = dataFileReader.next(record1);
        records.add(record1);
    }

}
 
Example 17
Source File: TestConvertAvroToORC.java    From localization_nifi with Apache License 2.0
@Test
public void test_onTrigger_complex_record() throws Exception {

    Map<String, Double> mapData1 = new TreeMap<String, Double>() {{
        put("key1", 1.0);
        put("key2", 2.0);
    }};

    GenericData.Record record = TestNiFiOrcUtils.buildComplexAvroRecord(10, mapData1, "DEF", 3.0f, Arrays.asList(10, 20));

    DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
    DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    fileWriter.create(record.getSchema(), out);
    fileWriter.append(record);

    // Put another record in
    Map<String, Double> mapData2 = new TreeMap<String, Double>() {{
        put("key1", 3.0);
        put("key2", 4.0);
    }};

    record = TestNiFiOrcUtils.buildComplexAvroRecord(null, mapData2, "XYZ", 4L, Arrays.asList(100, 200));
    fileWriter.append(record);

    fileWriter.flush();
    fileWriter.close();
    out.close();

    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();

    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);

    // Write the flow file out to disk, since the ORC Reader needs a path
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS complex_record " +
            "(myInt INT, myMap MAP<STRING, DOUBLE>, myEnum STRING, myLongOrFloat UNIONTYPE<BIGINT, FLOAT>, myIntList ARRAY<INT>)"
            + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
    assertEquals("2", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream("target/test1.orc");
    fos.write(resultContents);
    fos.flush();
    fos.close();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    Object o = rows.next(null);
    assertNotNull(o);
    assertTrue(o instanceof OrcStruct);
    TypeInfo resultSchema = TestNiFiOrcUtils.buildComplexOrcSchema();
    StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);

    // Check some fields in the first row
    Object intFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myInt"));
    assertTrue(intFieldObject instanceof IntWritable);
    assertEquals(10, ((IntWritable) intFieldObject).get());

    // This is pretty awkward and messy. The map object is a Map (not a MapWritable) but the keys are writables (in this case Text)
    // and so are the values (DoubleWritables in this case).
    Object mapFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myMap"));
    assertTrue(mapFieldObject instanceof Map);
    Map map = (Map) mapFieldObject;
    Object mapValue = map.get(new Text("key1"));
    assertNotNull(mapValue);
    assertTrue(mapValue instanceof DoubleWritable);
    assertEquals(1.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);

    mapValue = map.get(new Text("key2"));
    assertNotNull(mapValue);
    assertTrue(mapValue instanceof DoubleWritable);
    assertEquals(2.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);
}
 
Example 18
Source File: AvroExternalJarProgram.java    From stratosphere with Apache License 2.0
public static void writeTestData(File testFile, int numRecords) throws IOException {
	
	DatumWriter<MyUser> userDatumWriter = new ReflectDatumWriter<MyUser>(MyUser.class);
	DataFileWriter<MyUser> dataFileWriter = new DataFileWriter<MyUser>(userDatumWriter);
	
	dataFileWriter.create(ReflectData.get().getSchema(MyUser.class), testFile);
	
	
	Generator generator = new Generator();
	
	for (int i = 0; i < numRecords; i++) {
		MyUser user = generator.nextUser();
		dataFileWriter.append(user);
	}
	
	dataFileWriter.close();
}
 
Example 19
Source File: AvroTableFileAsMutationsTest.java    From DataflowTemplates with Apache License 2.0
@Test
public void testAvroToMutationsTransform() throws Exception {
  DdlToAvroSchemaConverter converter = new DdlToAvroSchemaConverter("spannertest", "booleans");
  Ddl ddl =
      Ddl.builder()
          .createTable("Users")
          .column("id")
          .int64()
          .notNull()
          .endColumn()
          .column("first_name")
          .string()
          .size(10)
          .endColumn()
          .column("last_name")
          .type(Type.string())
          .max()
          .endColumn()
          .primaryKey()
          .asc("id")
          .desc("last_name")
          .end()
          .endTable()
          .build();

  Collection<Schema> result = converter.convert(ddl);
  assertThat(result, hasSize(1));
  Schema usersSchema = result.iterator().next();

  GenericRecord user1 = new GenericData.Record(usersSchema);
  user1.put("id", 123L);
  user1.put("first_name", "John");
  user1.put("last_name", "Smith");
  GenericRecord user2 = new GenericData.Record(usersSchema);
  user2.put("id", 456L);
  user2.put("first_name", "Jane");
  user2.put("last_name", "Doe");

  File file = tmpFolder.newFile("users.avro");
  DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(usersSchema);
  DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);
  dataFileWriter.create(usersSchema, file);
  dataFileWriter.append(user1);
  dataFileWriter.append(user2);
  dataFileWriter.close();

  PCollectionView<Ddl> ddlView = p.apply("ddl", Create.of(ddl)).apply(View.asSingleton());

  PCollection<Mutation> mutations =
      p.apply("files/tables", Create.of(ImmutableMap.of(file.toPath().toString(), "Users")))
          .apply(new AvroTableFileAsMutations(ddlView));

  PAssert.that(mutations)
      .containsInAnyOrder(
          Mutation.newInsertOrUpdateBuilder("Users")
              .set("id")
              .to(123L)
              .set("first_name")
              .to("John")
              .set("last_name")
              .to("Smith")
              .build(),
          Mutation.newInsertOrUpdateBuilder("Users")
              .set("id")
              .to(456L)
              .set("first_name")
              .to("Jane")
              .set("last_name")
              .to("Doe")
              .build());
  p.run();
}
 
Example 20
Source File: AvroAsJsonOutputFormat.java    From iow-hadoop-streaming with Apache License 2.0
@Override
public RecordWriter<Text, NullWritable>
    getRecordWriter(FileSystem ignore, JobConf job, String name, Progressable prog)
    throws IOException {

    Schema schema;
    Schema.Parser p = new Schema.Parser();
    String strSchema = job.get("iow.streaming.output.schema");
    if (strSchema == null) {

        String schemaFile = job.get("iow.streaming.output.schema.file", "streaming_output_schema");

        if (job.getBoolean("iow.streaming.schema.use.prefix", false)) {
            // guess schema from file name
            // format is: schema:filename
            // with special keyword default - 'default:filename'

            String[] str = name.split(":");
            if (!str[0].equals("default"))
                schemaFile = str[0];

            name = str[1];
        }

        LOG.info(this.getClass().getSimpleName() + ": Using schema from file: " + schemaFile);
        File f = new File(schemaFile);
        schema = p.parse(f);
    }
    else {
        LOG.info(this.getClass().getSimpleName() + ": Using schema from jobconf.");
        schema = p.parse(strSchema);
    }

    if (schema == null) {
        throw new IOException("Can't find proper output schema");
    }

    DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>());

    configureDataFileWriter(writer, job);

    Path path = FileOutputFormat.getTaskOutputPath(job, name + org.apache.avro.mapred.AvroOutputFormat.EXT);
    writer.create(schema, path.getFileSystem(job).create(path));

    return createRecordWriter(writer, schema);
}