Java Code Examples for org.apache.avro.file.DataFileWriter#append()

The following examples show how to use org.apache.avro.file.DataFileWriter#append(). They are drawn from open source projects; the source file, originating project, and license are noted above each example.
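Before the project examples, here is a minimal, self-contained sketch of the typical DataFileWriter lifecycle: wrap a DatumWriter, call create() with a schema and a target file or stream, call append() once per datum, and close() to flush the final block. The schema, field names, and output file name below are illustrative placeholders rather than code from any of the projects that follow, and the Snappy codec assumes the snappy-java dependency is on the classpath.

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.file.CodecFactory;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class MinimalAppendExample {

  // Hypothetical schema used only for this sketch
  private static final Schema SCHEMA = new Schema.Parser().parse(
      "{\"type\":\"record\",\"name\":\"Example\",\"fields\":["
          + "{\"name\":\"id\",\"type\":\"long\"},"
          + "{\"name\":\"name\",\"type\":\"string\"}]}");

  public static void main(String[] args) throws IOException {
    File out = new File("example.avro");
    try (DataFileWriter<GenericRecord> writer =
             new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(SCHEMA))) {
      writer.setCodec(CodecFactory.snappyCodec()); // optional; must be set before create()
      writer.create(SCHEMA, out);                  // writes the container header
      for (long i = 0; i < 3; i++) {
        GenericRecord record = new GenericData.Record(SCHEMA);
        record.put("id", i);
        record.put("name", "row-" + i);
        writer.append(record);                     // buffers the record into the current block
      }
    } // close() flushes the last block and its sync marker
  }
}

Note that setCodec() must be called before create() (the writer rejects it once open), whereas setSyncInterval() may be adjusted even between appends, as Example 9 below does; append() itself may be called any number of times between create() and close().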
Example 1
Source File: SmallFilesWrite.java    From hiped2 with Apache License 2.0
public static void writeToAvro(File srcPath,
        OutputStream outputStream)
        throws IOException {
  DataFileWriter<Object> writer =
          new DataFileWriter<Object>(
              new GenericDatumWriter<Object>())
              .setSyncInterval(100);                 // emit a sync marker roughly every 100 bytes (small, for demonstration)
  writer.setCodec(CodecFactory.snappyCodec());   // compress data blocks with Snappy
  writer.create(SCHEMA, outputStream);           // write the container header, including the schema
  for (Object obj : FileUtils.listFiles(srcPath, null, false)) {
    File file = (File) obj;
    String filename = file.getAbsolutePath();
    byte[] content = FileUtils.readFileToByteArray(file);
    GenericRecord record = new GenericData.Record(SCHEMA);  // one record per input file
    record.put(FIELD_FILENAME, filename);                   // file name field
    record.put(FIELD_CONTENTS, ByteBuffer.wrap(content));   // raw file contents as bytes
    writer.append(record);                                  // append the record to the container
    System.out.println(
            file.getAbsolutePath()
            + ": "
            + DigestUtils.md5Hex(content));
  }

  IOUtils.cleanup(null, writer);
  IOUtils.cleanup(null, outputStream);
}
 
Example 2
Source File: AvroKeyValueFileWrite.java    From hiped2 with Apache License 2.0
public static void writeToAvro(File inputFile, OutputStream outputStream)
    throws IOException {

  DataFileWriter<GenericRecord> writer =
      new DataFileWriter<GenericRecord>(
          new GenericDatumWriter<GenericRecord>());

  writer.setCodec(CodecFactory.snappyCodec());
  writer.create(SCHEMA, outputStream);

  for (Stock stock : AvroStockUtils.fromCsvFile(inputFile)) {

    AvroKeyValue<CharSequence, Stock> record
        = new AvroKeyValue<CharSequence, Stock>(new GenericData.Record(SCHEMA));
    record.setKey(stock.getSymbol());
    record.setValue(stock);

    writer.append(record.get());
  }

  IOUtils.closeStream(writer);
  IOUtils.closeStream(outputStream);
}
 
Example 3
Source File: AvroStockFileWrite.java    From hiped2 with Apache License 2.0
public static void writeToAvro(File inputFile, OutputStream outputStream)
    throws IOException {

  DataFileWriter<Stock> writer =
      new DataFileWriter<Stock>(
          new SpecificDatumWriter<Stock>());

  writer.setCodec(CodecFactory.snappyCodec());
  writer.create(Stock.SCHEMA$, outputStream);

  for (Stock stock : AvroStockUtils.fromCsvFile(inputFile)) {
    writer.append(stock);
  }

  IOUtils.closeStream(writer);
  IOUtils.closeStream(outputStream);
}
 
Example 4
Source File: Hdfs.java    From pxf with Apache License 2.0
@Override
public void writeAvroFile(String pathToFile, String schemaName,
                          String codecName, IAvroSchema[] data)
        throws Exception {
    Path path = getDatapath(pathToFile);
    OutputStream outStream = fs.create(path, true, bufferSize,
            replicationSize, blockSize);
    Schema schema = new Schema.Parser().parse(new FileInputStream(
            schemaName));
    DatumWriter<GenericRecord> writer = new GenericDatumWriter<>(
            schema);
    DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(
            writer);
    if (!StringUtils.isEmpty(codecName)) {
        dataFileWriter.setCodec(CodecFactory.fromString(codecName));
    }

    dataFileWriter.create(schema, outStream);

    for (IAvroSchema iAvroSchema : data) {
        GenericRecord datum = iAvroSchema.serialize();
        dataFileWriter.append(datum);
    }
    dataFileWriter.close();
}
 
Example 5
Source File: RegressionAdmmTrain.java    From ml-ease with Apache License 2.0
private void computeU(JobConf conf, String uPath, String uplusxPath, Map<String, LinearModel> z) throws IOException
{
  AvroHdfsFileWriter<GenericRecord> writer =
      new AvroHdfsFileWriter<GenericRecord>(conf, uPath, LinearModelAvro.SCHEMA$);
  DataFileWriter<GenericRecord> recordwriter = writer.get();
  // read u+x
  for (Path path : Util.findPartFiles(conf, new Path(uplusxPath)))
  {
    DataFileStream<Object> stream = AvroUtils.getAvroDataStream(conf, path);
    while (stream.hasNext())
    {
      GenericData.Record record = (GenericData.Record) stream.next();
      String partitionID = Util.getStringAvro(record, "key", false);
      if (record.get("uplusx") != null)
      {
        String lambda = Util.getLambda(partitionID);
        LinearModel newu =
            new LinearModel(LibLinearDataset.INTERCEPT_NAME, (List<?>) record.get("uplusx"));
        newu.linearCombine(1.0, -1.0, z.get(lambda));
        GenericData.Record newvaluemap =
            new GenericData.Record(LinearModelAvro.SCHEMA$);
        List modellist = newu.toAvro(LibLinearDataset.INTERCEPT_NAME);
        newvaluemap.put("key", partitionID);
        newvaluemap.put("model", modellist);
        recordwriter.append(newvaluemap);
      }
    }
  }
  recordwriter.close();
}
 
Example 6
Source File: AvroTestTools.java    From incubator-gobblin with Apache License 2.0
private void writeAsAvroBinary(Iterator<GenericRecord> input, Schema schema, FileSystem fs,
    Path outputPath) throws IOException {

  DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>());

  writer.create(schema, fs.create(outputPath, true));
  while (input.hasNext()) {
    writer.append(input.next());
  }
  writer.close();

  log.info("Successfully wrote avro file to path " + outputPath);
}
 
Example 7
Source File: JsonToAvroConverter.java    From celos with Apache License 2.0
@Override
public FixFile convert(TestRun tr, FixFile ff) throws Exception {
    Schema schema = new Schema.Parser().parse(schemaCreator.create(tr).getContent());
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    InputStream input = ff.getContent();
    DataFileWriter<Object> writer;
    try {
        DatumReader<Object> reader = new GenericDatumReader<>(schema);
        DataInputStream din = new DataInputStream(input);
        writer = new DataFileWriter<>(new GenericDatumWriter<>());
        writer.create(schema, baos);
        Decoder decoder = DecoderFactory.get().jsonDecoder(schema, din);
        Object datum;
        while (true) {
            try {
                datum = reader.read(null, decoder);
            } catch (EOFException eofe) {
                break;
            }
            writer.append(datum);
        }
        writer.flush();
    } finally {
        input.close();
    }
    return new FixFile(new ByteArrayInputStream(baos.toByteArray()));
}
 
Example 8
Source File: TestUtil.java    From localization_nifi with Apache License 2.0
private static byte[] bytesFor(List<Record> records) throws IOException {
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DataFileWriter<Record> writer = new DataFileWriter<>(
            AvroUtil.newDatumWriter(records.get(0).getSchema(), Record.class));
    writer.setCodec(CodecFactory.snappyCodec());
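    // create() returns the writer itself, so the reassignment below is purely a fluent-style convenience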
    writer = writer.create(records.get(0).getSchema(), out);

    for (Record record : records) {
        writer.append(record);
    }

    writer.flush();

    return out.toByteArray();
}
 
Example 9
Source File: TestAvroDataFileParser.java    From datacollector with Apache License 2.0
@Test
public void testIncorrectOffset() throws Exception {
  File avroDataFile = SdcAvroTestUtil.createAvroDataFile();
  avroDataFile.delete();
  Schema schema = new Schema.Parser().parse(AVRO_SCHEMA);
  DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
  DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<>(datumWriter);
  dataFileWriter.create(schema, avroDataFile);
  for (int i = 0; i < 5; i++) {
    GenericRecord r = new GenericData.Record(schema);
    r.put("name", NAMES[i % NAMES.length]);
    r.put("id", i);
    dataFileWriter.setSyncInterval(1073741824);
    dataFileWriter.append(r);
    dataFileWriter.sync();
  }
  dataFileWriter.flush();
  dataFileWriter.close();
  DataParserFactoryBuilder dataParserFactoryBuilder = new DataParserFactoryBuilder(getContext(),
    DataParserFormat.AVRO);
  DataParserFactory factory = dataParserFactoryBuilder
      .setMaxDataLen(1024 * 1024)
      .setOverRunLimit(1000 * 1000)
      .setConfig(SCHEMA_SOURCE_KEY, SOURCE)
      .build();
  DataParser dataParser = factory.getParser(avroDataFile, null);
  Map<String, Record> records = new HashMap<>();
  Record record;
  while((record = dataParser.parse()) != null) {
    records.put(dataParser.getOffset(), record);
  }
  Assert.assertEquals(String.valueOf(records), 5, records.size());
  Assert.assertEquals(0, records.get("141::1").get("/id").getValueAsInteger());
  Assert.assertEquals(1, records.get("166::1").get("/id").getValueAsInteger());
  Assert.assertEquals(2, records.get("190::1").get("/id").getValueAsInteger());
  Assert.assertEquals(3, records.get("215::1").get("/id").getValueAsInteger());
  Assert.assertEquals(4, records.get("239::1").get("/id").getValueAsInteger());
}
 
Example 10
Source File: GenerateIds.java    From datafu with Apache License 2.0
private void createDataForDate(FileSystem fs, Path outputPath, Date date) throws IOException
{
  // make sure output path exists
  if (!fs.exists(outputPath))
  {
    fs.mkdirs(outputPath);
  }
  
  Path datePath = new Path(outputPath,dateFormat.format(date));
  
  System.out.println("Writing to " + datePath.toString() + " with range " + startId + " to " + endId);
  
  DataFileWriter<GenericRecord> dataWriter;
  OutputStream outputStream;
  
  Path dailyPath = outputPath;
  Path path = new Path(dailyPath,dateFormat.format(date));
  
  // delete directory if it already exists
  if (fs.exists(path))
  {
    fs.delete(path, true);
  }
  
  outputStream = fs.create(new Path(path, "part-00000.avro"));
  
  GenericDatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>();
  dataWriter = new DataFileWriter<GenericRecord>(writer);        
  dataWriter.create(EVENT_SCHEMA, outputStream);
  
  GenericRecord record = new GenericData.Record(EVENT_SCHEMA);
  // create 1000 random IDs
  for (int i=0; i<1000; i++)
  {
    long val;
    if (startId == endId)
    {
      val = startId;
    }
    else
    {
      val = (long)(startId + random.nextInt(endId-startId+1));
    }
    record.put("id", val);
    dataWriter.append(record);
  }
  
  dataWriter.close();
  outputStream.close();
}
 
Example 11
Source File: AvroRecordInputFormatTest.java    From flink with Apache License 2.0
public static void writeTestFile(File testFile) throws IOException {
	ArrayList<CharSequence> stringArray = new ArrayList<>();
	stringArray.add(TEST_ARRAY_STRING_1);
	stringArray.add(TEST_ARRAY_STRING_2);

	ArrayList<Boolean> booleanArray = new ArrayList<>();
	booleanArray.add(TEST_ARRAY_BOOLEAN_1);
	booleanArray.add(TEST_ARRAY_BOOLEAN_2);

	HashMap<CharSequence, Long> longMap = new HashMap<>();
	longMap.put(TEST_MAP_KEY1, TEST_MAP_VALUE1);
	longMap.put(TEST_MAP_KEY2, TEST_MAP_VALUE2);

	Address addr = new Address();
	addr.setNum(TEST_NUM);
	addr.setStreet(TEST_STREET);
	addr.setCity(TEST_CITY);
	addr.setState(TEST_STATE);
	addr.setZip(TEST_ZIP);

	User user1 = new User();

	user1.setName(TEST_NAME);
	user1.setFavoriteNumber(256);
	user1.setTypeDoubleTest(123.45d);
	user1.setTypeBoolTest(true);
	user1.setTypeArrayString(stringArray);
	user1.setTypeArrayBoolean(booleanArray);
	user1.setTypeEnum(TEST_ENUM_COLOR);
	user1.setTypeMap(longMap);
	user1.setTypeNested(addr);
	user1.setTypeBytes(ByteBuffer.allocate(10));
	user1.setTypeDate(LocalDate.parse("2014-03-01"));
	user1.setTypeTimeMillis(LocalTime.parse("12:12:12"));
	user1.setTypeTimeMicros(123456);
	user1.setTypeTimestampMillis(DateTime.parse("2014-03-01T12:12:12.321Z"));
	user1.setTypeTimestampMicros(123456L);
	// 20.00
	user1.setTypeDecimalBytes(ByteBuffer.wrap(BigDecimal.valueOf(2000, 2).unscaledValue().toByteArray()));
	// 20.00
	user1.setTypeDecimalFixed(new Fixed2(BigDecimal.valueOf(2000, 2).unscaledValue().toByteArray()));

	// Construct via builder
	User user2 = User.newBuilder()
			.setName("Charlie")
			.setFavoriteColor("blue")
			.setFavoriteNumber(null)
			.setTypeBoolTest(false)
			.setTypeDoubleTest(1.337d)
			.setTypeNullTest(null)
			.setTypeLongTest(1337L)
			.setTypeArrayString(new ArrayList<>())
			.setTypeArrayBoolean(new ArrayList<>())
			.setTypeNullableArray(null)
			.setTypeEnum(Colors.RED)
			.setTypeMap(new HashMap<>())
			.setTypeFixed(null)
			.setTypeUnion(null)
			.setTypeNested(
					Address.newBuilder().setNum(TEST_NUM).setStreet(TEST_STREET)
							.setCity(TEST_CITY).setState(TEST_STATE).setZip(TEST_ZIP)
							.build())
			.setTypeBytes(ByteBuffer.allocate(10))
			.setTypeDate(LocalDate.parse("2014-03-01"))
			.setTypeTimeMillis(LocalTime.parse("12:12:12"))
			.setTypeTimeMicros(123456)
			.setTypeTimestampMillis(DateTime.parse("2014-03-01T12:12:12.321Z"))
			.setTypeTimestampMicros(123456L)
			// 20.00
			.setTypeDecimalBytes(ByteBuffer.wrap(BigDecimal.valueOf(2000, 2).unscaledValue().toByteArray()))
			// 20.00
			.setTypeDecimalFixed(new Fixed2(BigDecimal.valueOf(2000, 2).unscaledValue().toByteArray()))
			.build();
	DatumWriter<User> userDatumWriter = new SpecificDatumWriter<>(User.class);
	DataFileWriter<User> dataFileWriter = new DataFileWriter<>(userDatumWriter);
	dataFileWriter.create(user1.getSchema(), testFile);
	dataFileWriter.append(user1);
	dataFileWriter.append(user2);
	dataFileWriter.close();
}
 
Example 12
Source File: AvroExternalJarProgram.java    From stratosphere with Apache License 2.0
public static void writeTestData(File testFile, int numRecords) throws IOException {
	
	DatumWriter<MyUser> userDatumWriter = new ReflectDatumWriter<MyUser>(MyUser.class);
	DataFileWriter<MyUser> dataFileWriter = new DataFileWriter<MyUser>(userDatumWriter);
	
	dataFileWriter.create(ReflectData.get().getSchema(MyUser.class), testFile);
	
	
	Generator generator = new Generator();
	
	for (int i = 0; i < numRecords; i++) {
		MyUser user = generator.nextUser();
		dataFileWriter.append(user);
	}
	
	dataFileWriter.close();
}
 
Example 13
Source File: TestConvertAvroToORC.java    From localization_nifi with Apache License 2.0
@Test
public void test_onTrigger_complex_record() throws Exception {

    Map<String, Double> mapData1 = new TreeMap<String, Double>() {{
        put("key1", 1.0);
        put("key2", 2.0);
    }};

    GenericData.Record record = TestNiFiOrcUtils.buildComplexAvroRecord(10, mapData1, "DEF", 3.0f, Arrays.asList(10, 20));

    DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
    DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    fileWriter.create(record.getSchema(), out);
    fileWriter.append(record);

    // Put another record in
    Map<String, Double> mapData2 = new TreeMap<String, Double>() {{
        put("key1", 3.0);
        put("key2", 4.0);
    }};

    record = TestNiFiOrcUtils.buildComplexAvroRecord(null, mapData2, "XYZ", 4L, Arrays.asList(100, 200));
    fileWriter.append(record);

    fileWriter.flush();
    fileWriter.close();
    out.close();

    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();

    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);

    // Write the flow file out to disk, since the ORC Reader needs a path
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS complex_record " +
            "(myInt INT, myMap MAP<STRING, DOUBLE>, myEnum STRING, myLongOrFloat UNIONTYPE<BIGINT, FLOAT>, myIntList ARRAY<INT>)"
            + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
    assertEquals("2", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream("target/test1.orc");
    fos.write(resultContents);
    fos.flush();
    fos.close();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    Object o = rows.next(null);
    assertNotNull(o);
    assertTrue(o instanceof OrcStruct);
    TypeInfo resultSchema = TestNiFiOrcUtils.buildComplexOrcSchema();
    StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);

    // Check some fields in the first row
    Object intFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myInt"));
    assertTrue(intFieldObject instanceof IntWritable);
    assertEquals(10, ((IntWritable) intFieldObject).get());

    // This is pretty awkward and messy. The map object is a Map (not a MapWritable) but the keys are writables (in this case Text)
    // and so are the values (DoubleWritables in this case).
    Object mapFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myMap"));
    assertTrue(mapFieldObject instanceof Map);
    Map map = (Map) mapFieldObject;
    Object mapValue = map.get(new Text("key1"));
    assertNotNull(mapValue);
    assertTrue(mapValue instanceof DoubleWritable);
    assertEquals(1.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);

    mapValue = map.get(new Text("key2"));
    assertNotNull(mapValue);
    assertTrue(mapValue instanceof DoubleWritable);
    assertEquals(2.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);
}
 
Example 14
Source File: AvroWithoutSchemaRegistryProducer.java    From snowflake-kafka-connector with Apache License 2.0
@Override
public void send(final Enums.TestCases testCase)
{
  System.out.println("loading table: " + testCase.getTableName() +
    " in format: " + testCase.getFormatName() + " to Kafka");
  try
  {
    Scanner scanner = getFileScanner(testCase);
    Schema schema = testCase.getTable().getSchema();
    while (scanner.hasNextLine())
    {
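      // Each input line becomes its own self-contained Avro container
      // (schema header plus a single record), sent as one message payload.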
      GenericData.Record record = new GenericData.Record(schema);
      GenericDatumWriter<GenericRecord> writer = new GenericDatumWriter<>(schema);
      ByteArrayOutputStream output = new ByteArrayOutputStream();
      DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<>(writer);
      fileWriter.create(schema, output);

      JsonNode data = Utils.MAPPER.readTree(scanner.nextLine());
      switch (testCase.getTable())
      {
        case ONE_G_TABLE:
          record.put("C_CUSTKEY", data.get("C_CUSTKEY").asLong());
          record.put("C_NAME", data.get("C_NAME").asText());
          record.put("C_ADDRESS", data.get("C_ADDRESS").asText());
          record.put("C_PHONE", data.get("C_PHONE").asText());
          record.put("C_ACCTBAL", data.get("C_ACCTBAL").asDouble());
          record.put("C_MKTSEGMENT", data.get("C_MKTSEGMENT").asText());
          record.put("C_COMMENT", data.get("C_COMMENT").asText());
          record.put("C_NATIONKEY", data.get("C_NATIONKEY").asLong());
          break;
        case THREE_HUNDRED_COLUMN_TABLE:
          for (int i = 0; i < 300; i++)
          {
            switch (i % 8)
            {
              case 0:
                record.put("C" + i, data.get("C" + i).asDouble());
                break;
              case 2:
                record.put("C" + i, data.get("C" + i).asInt());
                break;
              case 4:
                record.put("C" + i, data.get("C" + i).asLong());
                break;
              case 6:
                record.put("C" + i, data.get("C" + i).asBoolean());
                break;
              default:
                record.put("C" + i, data.get("C" + i).asText());
            }
          }
      }

      fileWriter.append(record);
      fileWriter.flush();
      fileWriter.close();
      send(Utils.TEST_TOPIC, output.toByteArray());
    }
    scanner.close();
    close();
  }
  catch (Exception e)
  {
    e.printStackTrace();
    System.exit(1);
  }
  System.out.println("finished loading");

}
 
Example 15
Source File: AvroTableFileAsMutationsTest.java    From DataflowTemplates with Apache License 2.0
@Test
public void testAvroToMutationsTransform() throws Exception {
  DdlToAvroSchemaConverter converter = new DdlToAvroSchemaConverter("spannertest", "booleans");
  Ddl ddl =
      Ddl.builder()
          .createTable("Users")
          .column("id")
          .int64()
          .notNull()
          .endColumn()
          .column("first_name")
          .string()
          .size(10)
          .endColumn()
          .column("last_name")
          .type(Type.string())
          .max()
          .endColumn()
          .primaryKey()
          .asc("id")
          .desc("last_name")
          .end()
          .endTable()
          .build();

  Collection<Schema> result = converter.convert(ddl);
  assertThat(result, hasSize(1));
  Schema usersSchema = result.iterator().next();

  GenericRecord user1 = new GenericData.Record(usersSchema);
  user1.put("id", 123L);
  user1.put("first_name", "John");
  user1.put("last_name", "Smith");
  GenericRecord user2 = new GenericData.Record(usersSchema);
  user2.put("id", 456L);
  user2.put("first_name", "Jane");
  user2.put("last_name", "Doe");

  File file = tmpFolder.newFile("users.avro");
  DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(usersSchema);
  DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(datumWriter);
  dataFileWriter.create(usersSchema, file);
  dataFileWriter.append(user1);
  dataFileWriter.append(user2);
  dataFileWriter.close();

  PCollectionView<Ddl> ddlView = p.apply("ddl", Create.of(ddl)).apply(View.asSingleton());

  PCollection<Mutation> mutations =
      p.apply("files/tables", Create.of(ImmutableMap.of(file.toPath().toString(), "Users")))
          .apply(new AvroTableFileAsMutations(ddlView));

  PAssert.that(mutations)
      .containsInAnyOrder(
          Mutation.newInsertOrUpdateBuilder("Users")
              .set("id")
              .to(123L)
              .set("first_name")
              .to("John")
              .set("last_name")
              .to("Smith")
              .build(),
          Mutation.newInsertOrUpdateBuilder("Users")
              .set("id")
              .to(456L)
              .set("first_name")
              .to("Jane")
              .set("last_name")
              .to("Doe")
              .build());
  p.run();
}
 
Example 16
Source File: TestConvertAvroToORC.java    From nifi with Apache License 2.0
@Test
public void test_onTrigger_nested_complex_record() throws Exception {

    Map<String, List<Double>> mapData1 = new TreeMap<String, List<Double>>() {{
        put("key1", Arrays.asList(1.0, 2.0));
        put("key2", Arrays.asList(3.0, 4.0));
    }};

    Map<String, String> arrayMap11 = new TreeMap<String, String>() {{
        put("key1", "v1");
        put("key2", "v2");
    }};
    Map<String, String> arrayMap12 = new TreeMap<String, String>() {{
        put("key3", "v3");
        put("key4", "v4");
    }};

    GenericData.Record record = TestNiFiOrcUtils.buildNestedComplexAvroRecord(mapData1, Arrays.asList(arrayMap11, arrayMap12));

    DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
    DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    fileWriter.create(record.getSchema(), out);
    fileWriter.append(record);

    // Put another record in
    Map<String, List<Double>> mapData2 = new TreeMap<String, List<Double>>() {{
        put("key1", Arrays.asList(-1.0, -2.0));
        put("key2", Arrays.asList(-3.0, -4.0));
    }};

    Map<String, String> arrayMap21 = new TreeMap<String, String>() {{
        put("key1", "v-1");
        put("key2", "v-2");
    }};
    Map<String, String> arrayMap22 = new TreeMap<String, String>() {{
        put("key3", "v-3");
        put("key4", "v-4");
    }};

    record = TestNiFiOrcUtils.buildNestedComplexAvroRecord(mapData2, Arrays.asList(arrayMap21, arrayMap22));
    fileWriter.append(record);

    fileWriter.flush();
    fileWriter.close();
    out.close();

    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();

    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);

    // Write the flow file out to disk, since the ORC Reader needs a path
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS nested_complex_record " +
            "(myMapOfArray MAP<STRING, ARRAY<DOUBLE>>, myArrayOfMap ARRAY<MAP<STRING, STRING>>)"
            + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
    assertEquals("2", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream("target/test1.orc");
    fos.write(resultContents);
    fos.flush();
    fos.close();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    Object o = rows.next(null);
    assertNotNull(o);
    assertTrue(o instanceof OrcStruct);
    TypeInfo resultSchema = TestNiFiOrcUtils.buildNestedComplexOrcSchema();
    StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);


    // check values
    Object myMapOfArray = inspector.getStructFieldData(o, inspector.getStructFieldRef("myMapOfArray"));
    assertTrue(myMapOfArray instanceof Map);
    Map map = (Map) myMapOfArray;
    Object mapValue = map.get(new Text("key1"));
    assertNotNull(mapValue);
    assertTrue(mapValue instanceof List);
    assertEquals(Arrays.asList(new DoubleWritable(1.0), new DoubleWritable(2.0)), mapValue);

    Object myArrayOfMap = inspector.getStructFieldData(o, inspector.getStructFieldRef("myArrayOfMap"));
    assertTrue(myArrayOfMap instanceof List);
    List list = (List) myArrayOfMap;
    Object el0 = list.get(0);
    assertNotNull(el0);
    assertTrue(el0 instanceof Map);
    assertEquals(new Text("v1"), ((Map) el0).get(new Text("key1")));
}
 
Example 17
Source File: TestConvertAvroToORC.java    From nifi with Apache License 2.0
@Test
public void test_onTrigger_complex_record() throws Exception {

    Map<String, Double> mapData1 = new TreeMap<String, Double>() {{
        put("key1", 1.0);
        put("key2", 2.0);
    }};

    GenericData.Record record = TestNiFiOrcUtils.buildComplexAvroRecord(10, mapData1, "DEF", 3.0f, Arrays.asList(10, 20));

    DatumWriter<GenericData.Record> writer = new GenericDatumWriter<>(record.getSchema());
    DataFileWriter<GenericData.Record> fileWriter = new DataFileWriter<>(writer);
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    fileWriter.create(record.getSchema(), out);
    fileWriter.append(record);

    // Put another record in
    Map<String, Double> mapData2 = new TreeMap<String, Double>() {{
        put("key1", 3.0);
        put("key2", 4.0);
    }};

    record = TestNiFiOrcUtils.buildComplexAvroRecord(null, mapData2, "XYZ", 4L, Arrays.asList(100, 200));
    fileWriter.append(record);

    fileWriter.flush();
    fileWriter.close();
    out.close();

    Map<String, String> attributes = new HashMap<String, String>() {{
        put(CoreAttributes.FILENAME.key(), "test");
    }};
    runner.enqueue(out.toByteArray(), attributes);
    runner.run();

    runner.assertAllFlowFilesTransferred(ConvertAvroToORC.REL_SUCCESS, 1);

    // Write the flow file out to disk, since the ORC Reader needs a path
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(ConvertAvroToORC.REL_SUCCESS).get(0);
    assertEquals("CREATE EXTERNAL TABLE IF NOT EXISTS complex_record " +
            "(myInt INT, myMap MAP<STRING, DOUBLE>, myEnum STRING, myLongOrFloat UNIONTYPE<BIGINT, FLOAT>, myIntList ARRAY<INT>)"
            + " STORED AS ORC", resultFlowFile.getAttribute(ConvertAvroToORC.HIVE_DDL_ATTRIBUTE));
    assertEquals("2", resultFlowFile.getAttribute(ConvertAvroToORC.RECORD_COUNT_ATTRIBUTE));
    assertEquals("test.orc", resultFlowFile.getAttribute(CoreAttributes.FILENAME.key()));
    byte[] resultContents = runner.getContentAsByteArray(resultFlowFile);
    FileOutputStream fos = new FileOutputStream("target/test1.orc");
    fos.write(resultContents);
    fos.flush();
    fos.close();

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Reader reader = OrcFile.createReader(new Path("target/test1.orc"), OrcFile.readerOptions(conf).filesystem(fs));
    RecordReader rows = reader.rows();
    Object o = rows.next(null);
    assertNotNull(o);
    assertTrue(o instanceof OrcStruct);
    TypeInfo resultSchema = TestNiFiOrcUtils.buildComplexOrcSchema();
    StructObjectInspector inspector = (StructObjectInspector) OrcStruct.createObjectInspector(resultSchema);

    // Check some fields in the first row
    Object intFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myInt"));
    assertTrue(intFieldObject instanceof IntWritable);
    assertEquals(10, ((IntWritable) intFieldObject).get());

    Object mapFieldObject = inspector.getStructFieldData(o, inspector.getStructFieldRef("myMap"));
    assertTrue(mapFieldObject instanceof Map);
    Map map = (Map) mapFieldObject;
    Object mapValue = map.get(new Text("key1"));
    assertNotNull(mapValue);
    assertTrue(mapValue instanceof DoubleWritable);
    assertEquals(1.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);

    mapValue = map.get(new Text("key2"));
    assertNotNull(mapValue);
    assertTrue(mapValue instanceof DoubleWritable);
    assertEquals(2.0, ((DoubleWritable) mapValue).get(), Double.MIN_VALUE);
}
 
Example 18
Source File: AvroExternalJarProgram.java    From flink with Apache License 2.0
public static void writeTestData(File testFile, int numRecords) throws IOException {

		DatumWriter<MyUser> userDatumWriter = new ReflectDatumWriter<MyUser>(MyUser.class);
		DataFileWriter<MyUser> dataFileWriter = new DataFileWriter<MyUser>(userDatumWriter);

		dataFileWriter.create(ReflectData.get().getSchema(MyUser.class), testFile);

		Generator generator = new Generator();

		for (int i = 0; i < numRecords; i++) {
			MyUser user = generator.nextUser();
			dataFileWriter.append(user);
		}

		dataFileWriter.close();
	}
 
Example 19
Source File: AvroUserTest.java    From yuzhouwan with Apache License 2.0
@Test
public void createUserTest() throws Exception {

    // 1. Creating Users
    User user1 = new User();
    user1.setName("Alyssa");
    user1.setFavoriteNumber(256);

    // Alternate constructor
    User user2 = new User("Ben", 7, "red");

    // Construct via builder
    User user3 = User.newBuilder().setName("Charlie").setFavoriteColor("blue").setFavoriteNumber(null).build();

    // 2. Serializing
    // Serialize user1, user2 and user3 to disk
    DatumWriter<User> userDatumWriter = new SpecificDatumWriter<>(User.class);
    DataFileWriter<User> dataFileWriter = new DataFileWriter<>(userDatumWriter);
    String avroDir = DirUtils.RESOURCES_PATH.concat("/avro");
    DirUtils.makeSureExist(avroDir, false);
    File file = new File(avroDir.concat("/users.avro"));
    dataFileWriter.create(user1.getSchema(), file);
    dataFileWriter.append(user1);
    dataFileWriter.append(user2);
    dataFileWriter.append(user3);
    // Appending more user objects per file would amortize the container overhead and improve throughput
    dataFileWriter.close();

    // 3. Deserializing
    // Deserialize Users from disk
    DatumReader<User> userDatumReader = new SpecificDatumReader<>(User.class);
    DataFileReader<User> dataFileReader = new DataFileReader<>(file, userDatumReader);
    User user = null;
    String userStr;
    int count = 0;
    while (dataFileReader.hasNext()) {
        // Reuse user object by passing it to next(). This saves us from
        // allocating and garbage collecting many objects for files with many items.
        user = dataFileReader.next(user);
        if ("{\"name\": \"Alyssa\", \"favorite_number\": 256, \"favorite_color\": null}".equals(userStr = user.toString()) ||
                "{\"name\": \"Ben\", \"favorite_number\": 7, \"favorite_color\": \"red\"}".equals(userStr) ||
                "{\"name\": \"Charlie\", \"favorite_number\": null, \"favorite_color\": \"blue\"}".equals(userStr))
            count++;
    }
    assertEquals(3, count);
    file.deleteOnExit();
}
 
Example 20
Source File: BackfillPhaseMapJob.java    From incubator-pinot with Apache License 2.0
private void createAvro(String dataFilePath) throws Exception {

      Path hdfsDataPath = new Path(dataFilePath);
      File dataPath = new File(currentDiskWorkDir, "data");
      if (dataPath.exists()) {
        dataPath.delete();
      }
      dataPath.mkdir();
      LOGGER.info("Creating temporary data dir {}", dataPath);

      final File avroPath = new File(currentDiskWorkDir, "avro");
      if (avroPath.exists()) {
        avroPath.delete();
      }
      avroPath.mkdir();
      LOGGER.info("Creating temporary avro dir {}", avroPath);

      String segmentName = hdfsDataPath.getName();
      final Path localFilePath = new Path(dataPath + "/" + segmentName);
      fs.copyToLocalFile(hdfsDataPath, localFilePath);
      LOGGER.info("Copying segment {} from {} to local {}", segmentName, hdfsDataPath, localFilePath);
      File segmentIndexDir = new File(localFilePath.toString());
      if (!segmentIndexDir.exists()) {
        throw new IllegalStateException("Failed to copy " + hdfsDataPath + " to " + localFilePath);
      }

      LOGGER.info("Initializing PinotSegmentRecordReader with segment index dir {}", segmentIndexDir);
      PinotSegmentRecordReader pinotSegmentRecordReader = new PinotSegmentRecordReader(segmentIndexDir);
      LOGGER.info("Schema {}", pinotSegmentRecordReader.getSchema());

      Schema avroSchema = ThirdeyeAvroUtils.constructAvroSchemaFromPinotSchema(pinotSegmentRecordReader.getSchema());
      GenericDatumWriter<GenericRecord> datum = new GenericDatumWriter<GenericRecord>(avroSchema);
      DataFileWriter<GenericRecord> recordWriter = new DataFileWriter<GenericRecord>(datum);
      File localAvroFile = new File(avroPath, segmentName + ThirdEyeConstants.AVRO_SUFFIX);
      recordWriter.create(avroSchema, localAvroFile);

      LOGGER.info("Converting pinot segment to avro at {}", localAvroFile);
      while (pinotSegmentRecordReader.hasNext()) {
        GenericRecord outputRecord = new Record(avroSchema);
        GenericRow row = pinotSegmentRecordReader.next();
        for (String fieldName : row.getFieldNames()) {
          outputRecord.put(fieldName, row.getValue(fieldName));
        }
        recordWriter.append(outputRecord);
      }
      LOGGER.info("Writing to avro file at {}", localAvroFile);
      recordWriter.close();
      if (!localAvroFile.exists()) {
        LOGGER.info("Failed to write avro file to {}", localAvroFile);
      }
      pinotSegmentRecordReader.close();

      LOGGER.info("Coping avro file from {} to hdfs at {}", localAvroFile, outputPath);
      fs.copyFromLocalFile(true, true, new Path(localAvroFile.toString()), new Path(outputPath));
      if (!fs.exists(new Path(outputPath))) {
        throw new IllegalStateException("Failed to copy avro file to hdfs at " + outputPath );
      }
      LOGGER.info("Successfully copied {} to {}", localAvroFile, outputPath);
    }