Java Code Examples for org.apache.avro.file.DataFileStream#next()

The following examples show how to use org.apache.avro.file.DataFileStream#next(). Each example is drawn from an open-source project; the source file and originating project are noted above the code.
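Before the individual examples, here is a minimal, self-contained sketch of the pattern they all share: wrap an InputStream in a DataFileStream backed by a GenericDatumReader, iterate with hasNext()/next(), and close the stream when done. The input file name users.avro is a hypothetical placeholder, and the sketch assumes Java 7+ for try-with-resources; the projects below manage the stream with try/finally instead.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.DatumReader;

public class DataFileStreamExample
{
  public static void main(String[] args) throws IOException
  {
    // No schema is passed to the reader: DataFileStream reads the writer's
    // schema from the file header and configures the reader with it.
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    try (InputStream in = new FileInputStream("users.avro"); // hypothetical input file
         DataFileStream<GenericRecord> stream = new DataFileStream<GenericRecord>(in, reader))
    {
      GenericRecord record = null;
      while (stream.hasNext())
      {
        // next(reuse) recycles a single record object instead of allocating one
        // per row, as Examples 6, 12, and 17 do; plain next() works as well.
        record = stream.next(record);
        System.out.println(record);
      }
    }
  }
}

Because the schema travels with the data, next() can deserialize records without the caller supplying a schema; several examples below also read it back explicitly via getSchema().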
Example 1
Source File: Examples.java    From datafu with Apache License 2.0
private Long loadMemberCount(Path path, String timestamp) throws IOException
{
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(path, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(path,timestamp + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      GenericRecord r = dataFileStream.next();
      Long count = (Long)((GenericRecord)r.get("value")).get("count");   
      Assert.assertNotNull(count);       
      System.out.println("found count: " + count);
      return count;
    }
    finally
    {
      dataFileStream.close();
    }
  }
  throw new RuntimeException("found no data");
}
 
Example 2
Source File: TestAvroStorage.java    From spork with Apache License 2.0
private void verifyResults(String outPath, String expectedOutpath, String expectedCodec) throws IOException {
    FileSystem fs = FileSystem.getLocal(new Configuration());

    /* read in expected results */
    Set<GenericData.Record> expected = getExpected(expectedOutpath);

    /* read in output results and compare */
    Path output = new Path(outPath);
    assertTrue("Output dir does not exists!", fs.exists(output)
            && fs.getFileStatus(output).isDir());

    Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
    assertTrue("Split field dirs not found!", paths != null);

    for (Path path : paths) {
      Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
      assertTrue("No files found for path: " + path.toUri().getPath(),
              files != null);
      for (Path filePath : files) {
        assertTrue("This shouldn't be a directory", fs.isFile(filePath));

        GenericDatumReader<GenericData.Record> reader = new GenericDatumReader<GenericData.Record>();

        DataFileStream<GenericData.Record> in = new DataFileStream<GenericData.Record>(
                                        fs.open(filePath), reader);
        assertEquals("codec", expectedCodec, in.getMetaString("avro.codec"));
        int count = 0;
        while (in.hasNext()) {
            GenericData.Record obj = in.next();
            assertTrue("Avro result object found that's not expected: Found "
                    + (obj != null ? obj.getSchema() : "null") + ", " + obj.toString()
                    + "\nExpected " + (expected != null ? expected.toString() : "null") + "\n"
                    , expected.contains(obj));
            count++;
        }
        in.close();
        assertEquals(expected.size(), count);
      }
    }
}
 
Example 3
Source File: PartitionCollapsingJoinTest.java    From datafu with Apache License 2.0
private HashMap<Long,ImpressionClick> loadOutputCounts(String timestamp) throws IOException
{
  HashMap<Long,ImpressionClick> counts = new HashMap<Long,ImpressionClick>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(_outputPath,timestamp + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      while (dataFileStream.hasNext())
      {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Integer impressions = (Integer)((GenericRecord)r.get("value")).get("impressions");    
        Integer clicks = (Integer)((GenericRecord)r.get("value")).get("clicks");         
        Assert.assertFalse(counts.containsKey(memberId));
        ImpressionClick data = new ImpressionClick();
        data.clicks = clicks;
        data.impressions = impressions;
        counts.put(memberId, data);
      }
    }
    finally
    {
      dataFileStream.close();
    }
  }
  return counts;
}
 
Example 4
Source File: PartitionCollapsingTests.java    From datafu with Apache License 2.0
private HashMap<Long,Long> loadOutputCounts(String timestamp) throws IOException
{
  HashMap<Long,Long> counts = new HashMap<Long,Long>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(_outputPath,timestamp + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      while (dataFileStream.hasNext())
      {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Long count = (Long)((GenericRecord)r.get("value")).get("count");        
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    }
    finally
    {
      dataFileStream.close();
    }
  }
  return counts;
}
 
Example 5
Source File: TestAvroJob.java    From datafu with Apache License 2.0
private HashMap<Long,Long> loadOutputCounts(String timestamp) throws IOException
{
  HashMap<Long,Long> counts = new HashMap<Long,Long>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(_outputPath,timestamp + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      while (dataFileStream.hasNext())
      {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)r.get("id");
        Long count = (Long)r.get("count");   
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    }
    finally
    {
      dataFileStream.close();
    }
  }
  return counts;
}
 
Example 6
Source File: TestHDFSCompressedDataStream.java    From mt-flume with Apache License 2.0
@Test
public void testGzipDurabilityWithSerializer() throws Exception {
  Context context = new Context();
  context.put("serializer", "AVRO_EVENT");

  HDFSCompressedDataStream writer = new HDFSCompressedDataStream();
  writer.configure(context);

  writer.open(fileURI, factory.getCodec(new Path(fileURI)),
      SequenceFile.CompressionType.BLOCK);

  String[] bodies = { "yarf!", "yarfing!" };
  writeBodies(writer, bodies);

  int found = 0;
  int expected = bodies.length;
  List<String> expectedBodies = Lists.newArrayList(bodies);

  GZIPInputStream cmpIn = new GZIPInputStream(new FileInputStream(file));
  DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  DataFileStream<GenericRecord> avroStream =
      new DataFileStream<GenericRecord>(cmpIn, reader);
  GenericRecord record = new GenericData.Record(avroStream.getSchema());
  while (avroStream.hasNext()) {
    avroStream.next(record);
    CharsetDecoder decoder = Charsets.UTF_8.newDecoder();
    String bodyStr = decoder.decode((ByteBuffer) record.get("body"))
        .toString();
    expectedBodies.remove(bodyStr);
    found++;
  }
  avroStream.close();
  cmpIn.close();

  Assert.assertTrue("Found = " + found + ", Expected = " + expected
      + ", Left = " + expectedBodies.size() + " " + expectedBodies,
      expectedBodies.size() == 0);
}
 
Example 7
Source File: Examples.java    From datafu with Apache License 2.0
private HashMap<Long,Integer> loadOutputCounts(Path path, String timestamp) throws IOException
{
  HashMap<Long,Integer> counts = new HashMap<Long,Integer>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(path, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(path,timestamp + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      while (dataFileStream.hasNext())
      {          
        GenericRecord r = dataFileStream.next();
        _log.info("found: " + r.toString());
        Long memberId = (Long)((GenericRecord)r.get("key")).get("member_id");
        Assert.assertNotNull(memberId);
        Integer count = (Integer)((GenericRecord)r.get("value")).get("count");   
        Assert.assertNotNull(count);     
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    }
    finally
    {
      dataFileStream.close();
    }
  }
  return counts;
}
 
Example 8
Source File: PartitionPreservingCollapsingIntegrationTests.java    From datafu with Apache License 2.0
private HashMap<Long,Long> loadOutputCounts(Path path, String timestamp) throws IOException
{
  HashMap<Long,Long> counts = new HashMap<Long,Long>();
  FileSystem fs = getFileSystem();
  Assert.assertTrue(fs.exists(new Path(path, timestamp)));
  for (FileStatus stat : fs.globStatus(new Path(path,timestamp + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      while (dataFileStream.hasNext())
      {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Long count = (Long)((GenericRecord)r.get("value")).get("count");        
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    }
    finally
    {
      dataFileStream.close();
    }
  }
  return counts;
}
 
Example 9
Source File: PartitionPreservingCollapsingIntegrationTests.java    From datafu with Apache License 2.0
private HashMap<Long,Long> loadIntermediateCounts(Path path, String timestamp) throws IOException
{
  HashMap<Long,Long> counts = new HashMap<Long,Long>();
  FileSystem fs = getFileSystem();
  String nestedPath = getNestedPathFromTimestamp(timestamp);
  Assert.assertTrue(fs.exists(new Path(_intermediatePath, nestedPath)));
  for (FileStatus stat : fs.globStatus(new Path(_intermediatePath,nestedPath + "/*.avro")))
  {
    _log.info(String.format("found: %s (%d bytes)",stat.getPath(),stat.getLen()));
    FSDataInputStream is = fs.open(stat.getPath());
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
    
    try
    {
      while (dataFileStream.hasNext())
      {
        GenericRecord r = dataFileStream.next();
        Long memberId = (Long)((GenericRecord)r.get("key")).get("id");
        Long count = (Long)((GenericRecord)r.get("value")).get("count");        
        Assert.assertFalse(counts.containsKey(memberId));
        counts.put(memberId, count);
      }
    }
    finally
    {
      dataFileStream.close();
    }
  }
  return counts;
}
 
Example 10
Source File: TestAvroStorage.java    From spork with Apache License 2.0
private Set<GenericData.Record> getExpected(String pathstr) throws IOException {

        Set<GenericData.Record> ret = new TreeSet<GenericData.Record>(
                new Comparator<GenericData.Record>() {
                    @Override
                    public int compare(Record o1, Record o2) {
                        return o1.toString().compareTo(o2.toString());
                    }}
                );
        FileSystem fs = FileSystem.getLocal(new Configuration());

        /* read in output results and compare */
        Path output = new Path(pathstr);
        assertTrue("Expected output does not exists!", fs.exists(output));

        Path[] paths = FileUtil.stat2Paths(fs.listStatus(output, hiddenPathFilter));
        assertTrue("Split field dirs not found!", paths != null);

        for (Path path : paths) {
            Path[] files = FileUtil.stat2Paths(fs.listStatus(path, hiddenPathFilter));
            assertTrue("No files found for path: " + path.toUri().getPath(), files != null);
            for (Path filePath : files) {
                assertTrue("This shouldn't be a directory", fs.isFile(filePath));

                GenericDatumReader<GenericData.Record> reader = new GenericDatumReader<GenericData.Record>();

                DataFileStream<GenericData.Record> in = new DataFileStream<GenericData.Record>(fs.open(filePath), reader);

                while (in.hasNext()) {
                    GenericData.Record obj = in.next();
                    ret.add(obj);
                }
                in.close();
            }
        }
        return ret;
    }
 
Example 11
Source File: TestWriteAvroResultWithSchema.java    From nifi with Apache License 2.0
@Override
protected GenericRecord readRecord(final InputStream in, final Schema schema) throws IOException {
    final DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(in, new GenericDatumReader<>());
    final Schema avroSchema = dataFileStream.getSchema();
    GenericData.setStringType(avroSchema, StringType.String);
    final GenericRecord avroRecord = dataFileStream.next();

    return avroRecord;
}
 
Example 12
Source File: TestHDFSEventSink.java    From mt-flume with Apache License 2.0
private void verifyOutputAvroFiles(FileSystem fs, Configuration conf, String dir, String prefix, List<String> bodies) throws IOException {
  int found = 0;
  int expected = bodies.size();
  for(String outputFile : getAllFiles(dir)) {
    String name = (new File(outputFile)).getName();
    if(name.startsWith(prefix)) {
      FSDataInputStream input = fs.open(new Path(outputFile));
      DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
      DataFileStream<GenericRecord> avroStream =
          new DataFileStream<GenericRecord>(input, reader);
      GenericRecord record = new GenericData.Record(avroStream.getSchema());
      while (avroStream.hasNext()) {
        avroStream.next(record);
        ByteBuffer body = (ByteBuffer) record.get("body");
        CharsetDecoder decoder = Charsets.UTF_8.newDecoder();
        String bodyStr = decoder.decode(body).toString();
        LOG.debug("Removing event: {}", bodyStr);
        bodies.remove(bodyStr);
        found++;
      }
      avroStream.close();
      input.close();
    }
  }
  Assert.assertTrue("Found = " + found + ", Expected = "  +
      expected + ", Left = " + bodies.size() + " " + bodies,
        bodies.size() == 0);
}
 
Example 13
Source File: TestExecuteSQLRecord.java    From nifi with Apache License 2.0
@Test
public void testWriteLOBsToAvro() throws Exception {
    final DBCPService dbcp = new DBCPServiceSimpleImpl("h2");
    final Map<String, String> dbcpProperties = new HashMap<>();

    runner = TestRunners.newTestRunner(ExecuteSQLRecord.class);
    runner.addControllerService("dbcp", dbcp, dbcpProperties);
    runner.enableControllerService(dbcp);
    runner.setProperty(AbstractExecuteSQL.DBCP_SERVICE, "dbcp");

    // remove previous test database, if any
    final File dbLocation = new File(DB_LOCATION);
    dbLocation.delete();

    // load test data to database
    final Connection con = ((DBCPService) runner.getControllerService("dbcp")).getConnection();
    Statement stmt = con.createStatement();

    try {
        stmt.execute("drop table TEST_NULL_INT");
    } catch (final SQLException sqle) {
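        // ignored: the table may not exist on a fresh database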
    }

    stmt.execute("create table TEST_NULL_INT (id integer not null, val1 integer, val2 integer, image blob(1K), words clob(1K), "
            + "natwords nclob(1K), constraint my_pk primary key (id))");
    stmt.execute("insert into TEST_NULL_INT (id, val1, val2, image, words, natwords) VALUES (0, NULL, 1, CAST (X'DEADBEEF' AS BLOB), "
            + "CAST ('Hello World' AS CLOB), CAST ('I am an NCLOB' AS NCLOB))");

    runner.setIncomingConnection(false);
    runner.setProperty(AbstractExecuteSQL.SQL_SELECT_QUERY, "select * from TEST_NULL_INT");
    AvroRecordSetWriter recordWriter = new AvroRecordSetWriter();
    runner.addControllerService("writer", recordWriter);
    runner.setProperty(recordWriter, SchemaAccessUtils.SCHEMA_ACCESS_STRATEGY, SchemaAccessUtils.INHERIT_RECORD_SCHEMA);
    runner.setProperty(ExecuteSQLRecord.RECORD_WRITER_FACTORY, "writer");
    runner.enableControllerService(recordWriter);
    runner.run();

    runner.assertAllFlowFilesTransferred(AbstractExecuteSQL.REL_SUCCESS, 1);
    MockFlowFile flowFile = runner.getFlowFilesForRelationship(AbstractExecuteSQL.REL_SUCCESS).get(0);
    flowFile.assertAttributeEquals(AbstractExecuteSQL.RESULT_ROW_COUNT, "1");

    ByteArrayInputStream bais = new ByteArrayInputStream(flowFile.toByteArray());
    final DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(bais, new GenericDatumReader<>());
    final Schema avroSchema = dataFileStream.getSchema();
    GenericData.setStringType(avroSchema, GenericData.StringType.String);
    final GenericRecord avroRecord = dataFileStream.next();

    Object imageObj = avroRecord.get("IMAGE");
    assertNotNull(imageObj);
    assertTrue(imageObj instanceof ByteBuffer);
    assertArrayEquals(new byte[]{(byte) 0xDE, (byte) 0xAD, (byte) 0xBE, (byte) 0xEF}, ((ByteBuffer) imageObj).array());

    Object wordsObj = avroRecord.get("WORDS");
    assertNotNull(wordsObj);
    assertTrue(wordsObj instanceof Utf8);
    assertEquals("Hello World", wordsObj.toString());

    Object natwordsObj = avroRecord.get("NATWORDS");
    assertNotNull(natwordsObj);
    assertTrue(natwordsObj instanceof Utf8);
    assertEquals("I am an NCLOB", natwordsObj.toString());
}
 
Example 14
Source File: BucketingSinkTest.java    From flink with Apache License 2.0
/**
 * This tests user-defined HDFS configuration.
 * @throws Exception
 */
@Test
public void testUserDefinedConfiguration() throws Exception {
	final String outPath = hdfsURI + "/string-non-rolling-with-config";
	final int numElements = 20;

	Map<String, String> properties = new HashMap<>();
	Schema keySchema = Schema.create(Schema.Type.INT);
	Schema valueSchema = Schema.create(Schema.Type.STRING);
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

	Configuration conf = new Configuration();
	conf.set("io.file.buffer.size", "40960");

	BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
		.setFSConfig(conf)
		.setWriter(new StreamWriterWithConfigCheck<Integer, String>(properties, "io.file.buffer.size", "40960"))
		.setBucketer(new BasePathBucketer<Tuple2<Integer, String>>())
		.setPartPrefix(PART_PREFIX)
		.setPendingPrefix("")
		.setPendingSuffix("");

	OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness =
		createTestSink(sink, 1, 0);

	testHarness.setProcessingTime(0L);

	testHarness.setup();
	testHarness.open();

	for (int i = 0; i < numElements; i++) {
		testHarness.processElement(new StreamRecord<>(Tuple2.of(
			i, "message #" + Integer.toString(i)
		)));
	}

	testHarness.close();

	GenericData.setStringType(valueSchema, GenericData.StringType.String);
	Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);

	FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));

	SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
	DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
	for (int i = 0; i < numElements; i++) {
		AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry =
			new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
		int key = wrappedEntry.getKey();
		Assert.assertEquals(i, key);
		String value = wrappedEntry.getValue();
		Assert.assertEquals("message #" + i, value);
	}

	dataFileStream.close();
	inStream.close();
}
 
Example 15
Source File: BucketingSinkTest.java    From flink with Apache License 2.0
/**
 * This tests {@link AvroKeyValueSinkWriter}
 * with non-rolling output and with compression.
 */
@Test
public void testNonRollingAvroKeyValueWithCompressionWriter() throws Exception {
	final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";

	final int numElements = 20;

	Map<String, String> properties = new HashMap<>();
	Schema keySchema = Schema.create(Schema.Type.INT);
	Schema valueSchema = Schema.create(Schema.Type.STRING);
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

	BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
		.setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties))
		.setBucketer(new BasePathBucketer<Tuple2<Integer, String>>())
		.setPartPrefix(PART_PREFIX)
		.setPendingPrefix("")
		.setPendingSuffix("");

	OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness =
		createTestSink(sink, 1, 0);

	testHarness.setProcessingTime(0L);

	testHarness.setup();
	testHarness.open();

	for (int i = 0; i < numElements; i++) {
		testHarness.processElement(new StreamRecord<>(Tuple2.of(
			i, "message #" + Integer.toString(i)
		)));
	}

	testHarness.close();

	GenericData.setStringType(valueSchema, GenericData.StringType.String);
	Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);

	FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));

	SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
	DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
	for (int i = 0; i < numElements; i++) {
		AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry =
			new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
		int key = wrappedEntry.getKey();
		Assert.assertEquals(i, key);
		String value = wrappedEntry.getValue();
		Assert.assertEquals("message #" + i, value);
	}

	dataFileStream.close();
	inStream.close();
}
 
Example 16
Source File: WholeFileTransformerProcessor.java    From datacollector with Apache License 2.0
/**
 * Convert Avro record to Parquet
 * @param sourceFileName the source Avro file name
 * @param fileReader the {@link org.apache.avro.file.DataFileStream} Avro file reader
 * @param tempParquetFile the {@link java.nio.file.Path} temporary parquet file path
 */
private void writeParquet(String sourceFileName, DataFileStream<GenericRecord> fileReader, Path tempParquetFile) throws StageException {
  long recordCount = 0;
  GenericRecord avroRecord;
  Schema schema = fileReader.getSchema();

  LOG.debug("Start reading input file : {}", sourceFileName);
  try {
    // initialize parquet writer
    Configuration jobConfiguration = new Configuration();
    String compressionCodecName = compressionElEval.eval(variables, jobConfig.avroParquetConfig.compressionCodec, String.class);
    jobConfiguration.set(AvroParquetConstants.COMPRESSION_CODEC_NAME, compressionCodecName);
    jobConfiguration.setInt(AvroParquetConstants.ROW_GROUP_SIZE, jobConfig.avroParquetConfig.rowGroupSize);
    jobConfiguration.setInt(AvroParquetConstants.PAGE_SIZE, jobConfig.avroParquetConfig.pageSize);
    jobConfiguration.setInt(AvroParquetConstants.DICTIONARY_PAGE_SIZE, jobConfig.avroParquetConfig.dictionaryPageSize);
    jobConfiguration.setInt(AvroParquetConstants.MAX_PADDING_SIZE, jobConfig.avroParquetConfig.maxPaddingSize);

    // Parquet writer
    ParquetWriter.Builder builder = AvroToParquetConverterUtil.initializeWriter(
        new org.apache.hadoop.fs.Path(tempParquetFile.toString()),
        schema,
        jobConfiguration
    );
    parquetWriter = builder.build();

    while (fileReader.hasNext()) {
      avroRecord = fileReader.next();
      parquetWriter.write(avroRecord);
      recordCount++;
    }
    parquetWriter.close();

  } catch (IOException ex) {
    throw new TransformerStageCheckedException(
        Errors.CONVERT_08,
        sourceFileName,
        recordCount,
        ex
    );
  }
  LOG.debug("Finished writing {} records to {}", recordCount, tempParquetFile.getFileName());
}
 
Example 17
Source File: TestPutHiveStreaming.java    From localization_nifi with Apache License 2.0
@Test
public void onTriggerMultipleRecords() throws Exception {
    runner.setProperty(PutHiveStreaming.METASTORE_URI, "thrift://localhost:9083");
    runner.setProperty(PutHiveStreaming.DB_NAME, "default");
    runner.setProperty(PutHiveStreaming.TABLE_NAME, "users");
    runner.setProperty(PutHiveStreaming.TXNS_PER_BATCH, "2");
    runner.setValidateExpressionUsage(false);
    Map<String, Object> user1 = new HashMap<String, Object>() {
        {
            put("name", "Joe");
            put("favorite_number", 146);
        }
    };
    Map<String, Object> user2 = new HashMap<String, Object>() {
        {
            put("name", "Mary");
            put("favorite_number", 42);
        }
    };
    Map<String, Object> user3 = new HashMap<String, Object>() {
        {
            put("name", "Matt");
            put("favorite_number", 3);
        }
    };
    runner.enqueue(createAvroRecord(Arrays.asList(user1, user2, user3)));
    runner.run();

    runner.assertTransferCount(PutHiveStreaming.REL_SUCCESS, 1);
    MockFlowFile resultFlowFile = runner.getFlowFilesForRelationship(PutHiveStreaming.REL_SUCCESS).get(0);
    assertNotNull(resultFlowFile);
    assertEquals("3", resultFlowFile.getAttribute(PutHiveStreaming.HIVE_STREAMING_RECORD_COUNT_ATTR));
    final DataFileStream<GenericRecord> reader = new DataFileStream<>(
            new ByteArrayInputStream(resultFlowFile.toByteArray()),
            new GenericDatumReader<GenericRecord>());

    Schema schema = reader.getSchema();

    // Verify that the schema is preserved
    assertTrue(schema.equals(new Schema.Parser().parse(new File("src/test/resources/user.avsc"))));

    // Verify the records are intact. We can't guarantee order so check the total number and non-null fields
    assertTrue(reader.hasNext());
    GenericRecord record = reader.next(null);
    assertNotNull(record.get("name"));
    assertNotNull(record.get("favorite_number"));
    assertNull(record.get("favorite_color"));
    assertNull(record.get("scale"));
    assertTrue(reader.hasNext());
    record = reader.next(record);
    assertTrue(reader.hasNext());
    reader.next(record);
    assertFalse(reader.hasNext());
}
 
Example 18
Source File: BucketingSinkTest.java    From Flink-CEPplus with Apache License 2.0
/**
 * This tests user-defined HDFS configuration.
 * @throws Exception
 */
@Test
public void testUserDefinedConfiguration() throws Exception {
	final String outPath = hdfsURI + "/string-non-rolling-with-config";
	final int numElements = 20;

	Map<String, String> properties = new HashMap<>();
	Schema keySchema = Schema.create(Schema.Type.INT);
	Schema valueSchema = Schema.create(Schema.Type.STRING);
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

	Configuration conf = new Configuration();
	conf.set("io.file.buffer.size", "40960");

	BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
		.setFSConfig(conf)
		.setWriter(new StreamWriterWithConfigCheck<Integer, String>(properties, "io.file.buffer.size", "40960"))
		.setBucketer(new BasePathBucketer<Tuple2<Integer, String>>())
		.setPartPrefix(PART_PREFIX)
		.setPendingPrefix("")
		.setPendingSuffix("");

	OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness =
		createTestSink(sink, 1, 0);

	testHarness.setProcessingTime(0L);

	testHarness.setup();
	testHarness.open();

	for (int i = 0; i < numElements; i++) {
		testHarness.processElement(new StreamRecord<>(Tuple2.of(
			i, "message #" + Integer.toString(i)
		)));
	}

	testHarness.close();

	GenericData.setStringType(valueSchema, GenericData.StringType.String);
	Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);

	FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));

	SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
	DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
	for (int i = 0; i < numElements; i++) {
		AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry =
			new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
		int key = wrappedEntry.getKey();
		Assert.assertEquals(i, key);
		String value = wrappedEntry.getValue();
		Assert.assertEquals("message #" + i, value);
	}

	dataFileStream.close();
	inStream.close();
}
 
Example 19
Source File: BucketingSinkTest.java    From Flink-CEPplus with Apache License 2.0
/**
 * This tests {@link AvroKeyValueSinkWriter}
 * with non-rolling output and with compression.
 */
@Test
public void testNonRollingAvroKeyValueWithCompressionWriter() throws Exception {
	final String outPath = hdfsURI + "/avro-kv-no-comp-non-rolling-out";

	final int numElements = 20;

	Map<String, String> properties = new HashMap<>();
	Schema keySchema = Schema.create(Schema.Type.INT);
	Schema valueSchema = Schema.create(Schema.Type.STRING);
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_KEY_SCHEMA, keySchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_OUTPUT_VALUE_SCHEMA, valueSchema.toString());
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS, String.valueOf(true));
	properties.put(AvroKeyValueSinkWriter.CONF_COMPRESS_CODEC, DataFileConstants.SNAPPY_CODEC);

	BucketingSink<Tuple2<Integer, String>> sink = new BucketingSink<Tuple2<Integer, String>>(outPath)
		.setWriter(new AvroKeyValueSinkWriter<Integer, String>(properties))
		.setBucketer(new BasePathBucketer<Tuple2<Integer, String>>())
		.setPartPrefix(PART_PREFIX)
		.setPendingPrefix("")
		.setPendingSuffix("");

	OneInputStreamOperatorTestHarness<Tuple2<Integer, String>, Object> testHarness =
		createTestSink(sink, 1, 0);

	testHarness.setProcessingTime(0L);

	testHarness.setup();
	testHarness.open();

	for (int i = 0; i < numElements; i++) {
		testHarness.processElement(new StreamRecord<>(Tuple2.of(
			i, "message #" + Integer.toString(i)
		)));
	}

	testHarness.close();

	GenericData.setStringType(valueSchema, GenericData.StringType.String);
	Schema elementSchema = AvroKeyValueSinkWriter.AvroKeyValue.getSchema(keySchema, valueSchema);

	FSDataInputStream inStream = dfs.open(new Path(outPath + "/" + PART_PREFIX + "-0-0"));

	SpecificDatumReader<GenericRecord> elementReader = new SpecificDatumReader<>(elementSchema);
	DataFileStream<GenericRecord> dataFileStream = new DataFileStream<>(inStream, elementReader);
	for (int i = 0; i < numElements; i++) {
		AvroKeyValueSinkWriter.AvroKeyValue<Integer, String> wrappedEntry =
			new AvroKeyValueSinkWriter.AvroKeyValue<>(dataFileStream.next());
		int key = wrappedEntry.getKey();
		Assert.assertEquals(i, key);
		String value = wrappedEntry.getValue();
		Assert.assertEquals("message #" + i, value);
	}

	dataFileStream.close();
	inStream.close();
}