Java Code Examples for org.apache.avro.file.DataFileReader#hasNext()
The following examples show how to use
org.apache.avro.file.DataFileReader#hasNext() .
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: AvroOutputFormatTest.java From flink with Apache License 2.0 | 6 votes |
@Test public void testGenericRecord() throws IOException { final Path outputPath = new Path(File.createTempFile("avro-output-file", "generic.avro").getAbsolutePath()); final AvroOutputFormat<GenericRecord> outputFormat = new AvroOutputFormat<>(outputPath, GenericRecord.class); Schema schema = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"user\", \"fields\": [{\"name\":\"user_name\", \"type\":\"string\"}, {\"name\":\"favorite_number\", \"type\":\"int\"}, {\"name\":\"favorite_color\", \"type\":\"string\"}]}"); outputFormat.setWriteMode(FileSystem.WriteMode.OVERWRITE); outputFormat.setSchema(schema); output(outputFormat, schema); GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>(schema); DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(new File(outputPath.getPath()), reader); while (dataFileReader.hasNext()) { GenericRecord record = dataFileReader.next(); assertEquals(record.get("user_name").toString(), "testUser"); assertEquals(record.get("favorite_number"), 1); assertEquals(record.get("favorite_color").toString(), "blue"); } //cleanup FileSystem fs = FileSystem.getLocalFileSystem(); fs.delete(outputPath, false); }
Example 2
Source File: AvroOutputFormatTest.java From Flink-CEPplus with Apache License 2.0 | 6 votes |
@Test public void testGenericRecord() throws IOException { final Path outputPath = new Path(File.createTempFile("avro-output-file", "generic.avro").getAbsolutePath()); final AvroOutputFormat<GenericRecord> outputFormat = new AvroOutputFormat<>(outputPath, GenericRecord.class); Schema schema = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"user\", \"fields\": [{\"name\":\"user_name\", \"type\":\"string\"}, {\"name\":\"favorite_number\", \"type\":\"int\"}, {\"name\":\"favorite_color\", \"type\":\"string\"}]}"); outputFormat.setWriteMode(FileSystem.WriteMode.OVERWRITE); outputFormat.setSchema(schema); output(outputFormat, schema); GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>(schema); DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(new File(outputPath.getPath()), reader); while (dataFileReader.hasNext()) { GenericRecord record = dataFileReader.next(); assertEquals(record.get("user_name").toString(), "testUser"); assertEquals(record.get("favorite_number"), 1); assertEquals(record.get("favorite_color").toString(), "blue"); } //cleanup FileSystem fs = FileSystem.getLocalFileSystem(); fs.delete(outputPath, false); }
Example 3
Source File: TestAvroEventSerializer.java From mt-flume with Apache License 2.0 | 6 votes |
public void validateAvroFile(File file) throws IOException { // read the events back using GenericRecord DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); DataFileReader<GenericRecord> fileReader = new DataFileReader<GenericRecord>(file, reader); GenericRecord record = new GenericData.Record(fileReader.getSchema()); int numEvents = 0; while (fileReader.hasNext()) { fileReader.next(record); String bodyStr = record.get("message").toString(); System.out.println(bodyStr); numEvents++; } fileReader.close(); Assert.assertEquals("Should have found a total of 3 events", 3, numEvents); }
Example 4
Source File: ReadActivityFile.java From big-data-lite with MIT License | 6 votes |
/** *Reads the avro file * @throws IOException */ private void readFile() throws IOException { // Deserialize Activities from disk File file = new File(filename); DatumReader<Activity> activityDatumReader = new SpecificDatumReader<Activity>(Activity.class); DataFileReader<Activity> dataFileReader = new DataFileReader<Activity>(file, activityDatumReader); Activity activity = null; int i = 0; while (dataFileReader.hasNext() && i < numrecs) { i++; activity = dataFileReader.next(activity); System.out.println(activity); } }
Example 5
Source File: TestFlumeEventAvroEventSerializer.java From mt-flume with Apache License 2.0 | 6 votes |
public void validateAvroFile(File file) throws IOException { // read the events back using GenericRecord DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); DataFileReader<GenericRecord> fileReader = new DataFileReader<GenericRecord>(file, reader); GenericRecord record = new GenericData.Record(fileReader.getSchema()); int numEvents = 0; while (fileReader.hasNext()) { fileReader.next(record); ByteBuffer body = (ByteBuffer) record.get("body"); CharsetDecoder decoder = Charsets.UTF_8.newDecoder(); String bodyStr = decoder.decode(body).toString(); System.out.println(bodyStr); numEvents++; } fileReader.close(); Assert.assertEquals("Should have found a total of 3 events", 3, numEvents); }
Example 6
Source File: Purge.java From Cubert with Apache License 2.0 | 6 votes |
private void loadMembersToPurge(String filename) throws IOException { // TODO: "memberId" column name should be configurable DataFileReader<GenericRecord> dataFileReader = createDataFileReader(filename, true); while (dataFileReader.hasNext()) { GenericRecord record = dataFileReader.next(); Integer memberId = (Integer) record.get("memberId"); if (memberId == null) { throw new NullPointerException("memberId is null"); } membersToPurge.add(((Number) record.get("memberId")).intValue()); } dataFileReader.close(); }
Example 7
Source File: AvroOutputFormatTest.java From flink with Apache License 2.0 | 6 votes |
@Test public void testGenericRecord() throws IOException { final Path outputPath = new Path(File.createTempFile("avro-output-file", "generic.avro").getAbsolutePath()); final AvroOutputFormat<GenericRecord> outputFormat = new AvroOutputFormat<>(outputPath, GenericRecord.class); Schema schema = new Schema.Parser().parse("{\"type\":\"record\", \"name\":\"user\", \"fields\": [{\"name\":\"user_name\", \"type\":\"string\"}, {\"name\":\"favorite_number\", \"type\":\"int\"}, {\"name\":\"favorite_color\", \"type\":\"string\"}]}"); outputFormat.setWriteMode(FileSystem.WriteMode.OVERWRITE); outputFormat.setSchema(schema); output(outputFormat, schema); GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>(schema); DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(new File(outputPath.getPath()), reader); while (dataFileReader.hasNext()) { GenericRecord record = dataFileReader.next(); assertEquals(record.get("user_name").toString(), "testUser"); assertEquals(record.get("favorite_number"), 1); assertEquals(record.get("favorite_color").toString(), "blue"); } //cleanup FileSystem fs = FileSystem.getLocalFileSystem(); fs.delete(outputPath, false); }
Example 8
Source File: TestMergeContent.java From nifi with Apache License 2.0 | 5 votes |
private Map<String, GenericRecord> getGenericRecordMap(byte[] data, Schema schema, String key) throws IOException { // create a reader for the merged contet DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(schema); SeekableByteArrayInput input = new SeekableByteArrayInput(data); DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(input, datumReader); // read all the records into a map to verify all the records are there Map<String,GenericRecord> records = new HashMap<>(); while (dataFileReader.hasNext()) { GenericRecord user = dataFileReader.next(); records.put(user.get(key).toString(), user); } return records; }
Example 9
Source File: AvroStringFieldEncryptorConverterTest.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Returns the first record in the Avro file at {@code path}, or null if the
 * file contains no records.
 *
 * @param path filesystem path of the Avro data file
 * @throws IOException if the file cannot be opened or read
 */
private GenericRecord getRecordFromFile(String path) throws IOException {
    DatumReader<GenericRecord> reader = new GenericDatumReader<>();
    // try-with-resources: the original leaked the open file handle.
    try (DataFileReader<GenericRecord> dataFileReader =
            new DataFileReader<GenericRecord>(new File(path), reader)) {
        if (dataFileReader.hasNext()) {
            return dataFileReader.next();
        }
        return null;
    }
}
Example 10
Source File: AvroStringFieldDecryptorConverterTest.java From incubator-gobblin with Apache License 2.0 | 5 votes |
/**
 * Returns the first record in the Avro file at {@code path}, or null if the
 * file contains no records.
 *
 * @param path filesystem path of the Avro data file
 * @throws IOException if the file cannot be opened or read
 */
private GenericRecord getRecordFromFile(String path) throws IOException {
    DatumReader<GenericRecord> reader = new GenericDatumReader<>();
    // try-with-resources: the original leaked the open file handle.
    try (DataFileReader<GenericRecord> dataFileReader =
            new DataFileReader<GenericRecord>(new File(path), reader)) {
        // The original used "while", but the body returned on the first
        // iteration, so this is really an "if".
        if (dataFileReader.hasNext()) {
            return dataFileReader.next();
        }
        return null;
    }
}
Example 11
Source File: Purge.java From Cubert with Apache License 2.0 | 5 votes |
private void purge(String src, String dst) throws IOException { DataFileReader<GenericRecord> dataFileReader = createDataFileReader(src, false); DataFileWriter<GenericRecord> writer = createDataFileWriter(dataFileReader); numRecords = 0; recordsPurged = 0; remainingRecords = 0; // Copy while (dataFileReader.hasNext()) { numRecords++; GenericRecord record = dataFileReader.next(); if (record == null) { continue; } Number column = (Number) record.get(columnName); if ((column == null) || (!membersToPurge.contains(column.intValue()))) { remainingRecords++; writer.append(record); } } recordsPurged = numRecords - remainingRecords; writer.close(); dataFileReader.close(); }
Example 12
Source File: SinkAvroTest.java From gcp-ingestion with Mozilla Public License 2.0 | 5 votes |
/**
 * Test for a single doctype being written out to the correct location.
 */
@Test
public void testSingleDocumentType() throws IOException, SchemaNotFoundException {
    String input = Resources.getResource("testdata/avro-message-single-doctype.ndjson").getPath();
    String schemas = Resources.getResource("avro/test-schema.tar.gz").getPath();
    String output = outputPath + "/${document_namespace:-NONE}.${document_type:-NONE}.${document_version:-0}";
    Sink.main(new String[] { "--inputFileFormat=json", "--inputType=file", "--input=" + input,
        "--outputType=avro", "--output=" + output, "--outputFileCompression=UNCOMPRESSED",
        "--schemasLocation=" + schemas, "--errorOutputFileCompression=UNCOMPRESSED",
        "--errorOutputType=stdout" });
    assertThat("output count", getPrefixFileCount(outputPath, "namespace_0"), Matchers.greaterThan(0L));

    AvroSchemaStore store = AvroSchemaStore.of(schemas, null);
    // Close the Files.walk stream: the original leaked its directory handles.
    List<Path> paths;
    try (java.util.stream.Stream<Path> walk = Files.walk(Paths.get(outputPath))) {
        paths = walk.filter(Files::isRegularFile).collect(Collectors.toList());
    }

    // Loop-invariant schema lookup hoisted out of the per-file loop.
    Schema schema = store.getSchema("namespace_0/foo/foo.1.avro.json");
    List<Integer> results = new ArrayList<>();
    for (Path path : paths) {
        DatumReader<GenericRecord> reader = new GenericDatumReader<>(schema);
        // try-with-resources: the original leaked the reader on assertion failure.
        try (DataFileReader<GenericRecord> fileReader = new DataFileReader<>(path.toFile(), reader)) {
            while (fileReader.hasNext()) {
                GenericRecord record = fileReader.next();
                results.add((Integer) record.get("test_int"));
            }
        }
    }
    results.sort(null);
    // JUnit convention: expected value first, actual second (was reversed).
    assertEquals(Arrays.asList(1, 2, 3), results);
}
Example 13
Source File: TestMergeContent.java From localization_nifi with Apache License 2.0 | 5 votes |
private Map<String, GenericRecord> getGenericRecordMap(byte[] data, Schema schema, String key) throws IOException { // create a reader for the merged contet DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(schema); SeekableByteArrayInput input = new SeekableByteArrayInput(data); DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(input, datumReader); // read all the records into a map to verify all the records are there Map<String,GenericRecord> records = new HashMap<>(); while (dataFileReader.hasNext()) { GenericRecord user = dataFileReader.next(); records.put(user.get(key).toString(), user); } return records; }
Example 14
Source File: TestApacheAvroEventSerializer.java From flume-plugins with MIT License | 4 votes |
@Test public void test() throws FileNotFoundException, IOException { // create the file, write some data OutputStream out = new FileOutputStream(testFile); String builderName = ApacheLogAvroEventSerializer.Builder.class.getName(); Context ctx = new Context(); ctx.put("syncInterval", "4096"); EventSerializer serializer = EventSerializerFactory.getInstance(builderName, ctx, out); serializer.afterCreate(); // must call this when a file is newly created List<Event> events = generateApacheEvents(); for (Event e : events) { serializer.write(e); } serializer.flush(); serializer.beforeClose(); out.flush(); out.close(); // now try to read the file back DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); DataFileReader<GenericRecord> fileReader = new DataFileReader<GenericRecord>(testFile, reader); GenericRecord record = new GenericData.Record(fileReader.getSchema()); int numEvents = 0; while (fileReader.hasNext()) { fileReader.next(record); String ip = record.get("ip").toString(); String uri = record.get("uri").toString(); Integer statuscode = (Integer) record.get("statuscode"); String original = record.get("original").toString(); String connectionstatus = record.get("connectionstatus").toString(); Assert.assertEquals("Ip should be 80.79.194.3", "80.79.194.3", ip); System.out.println("IP " + ip + " requested: " + uri + " with status code " + statuscode + " and connectionstatus: " + connectionstatus); System.out.println("Original logline: " + original); numEvents++; } fileReader.close(); Assert.assertEquals("Should have found a total of 3 events", 2, numEvents); FileUtils.forceDelete(testFile); }
Example 15
Source File: AvroUserTest.java From yuzhouwan with Apache License 2.0 | 4 votes |
@Test public void createUserTest() throws Exception { // 1. Creating Users User user1 = new User(); user1.setName("Alyssa"); user1.setFavoriteNumber(256); // Alternate constructor User user2 = new User("Ben", 7, "red"); // Construct via builder User user3 = User.newBuilder().setName("Charlie").setFavoriteColor("blue").setFavoriteNumber(null).build(); // 2. Serializing // Serialize user1, user2 and user3 to disk DatumWriter<User> userDatumWriter = new SpecificDatumWriter<>(User.class); DataFileWriter<User> dataFileWriter = new DataFileWriter<>(userDatumWriter); String avroDir = DirUtils.RESOURCES_PATH.concat("/avro"); DirUtils.makeSureExist(avroDir, false); File file = new File(avroDir.concat("/users.avro")); dataFileWriter.create(user1.getSchema(), file); dataFileWriter.append(user1); dataFileWriter.append(user2); dataFileWriter.append(user3); // There should have more user object, then will get more performance dataFileWriter.close(); // 3. Deserializing // Deserialize Users from disk DatumReader<User> userDatumReader = new SpecificDatumReader<>(User.class); DataFileReader<User> dataFileReader = new DataFileReader<>(file, userDatumReader); User user = null; String userStr; int count = 0; while (dataFileReader.hasNext()) { // Reuse user object by passing it to next(). This saves us from // allocating and garbage collecting many objects for files with many items. user = dataFileReader.next(user); if ("{\"name\": \"Alyssa\", \"favorite_number\": 256, \"favorite_color\": null}".equals(userStr = user.toString()) || "{\"name\": \"Ben\", \"favorite_number\": 7, \"favorite_color\": \"red\"}".equals(userStr) || "{\"name\": \"Charlie\", \"favorite_number\": null, \"favorite_color\": \"blue\"}".equals(userStr)) count++; } assertEquals(3, count); file.deleteOnExit(); }
Example 16
Source File: TestSyslogAvroEventSerializer.java From flume-plugins with MIT License | 4 votes |
@Test public void test() throws FileNotFoundException, IOException { // create the file, write some data OutputStream out = new FileOutputStream(testFile); String builderName = SyslogAvroEventSerializer.Builder.class.getName(); Context ctx = new Context(); ctx.put("syncInterval", "4096"); ctx.put("path", "src/test/resources/customerToHostsFile.txt"); EventSerializer serializer = EventSerializerFactory.getInstance(builderName, ctx, out); serializer.afterCreate(); // must call this when a file is newly created List<Event> events = generateSyslogEvents(); for (Event e : events) { serializer.write(e); } serializer.flush(); serializer.beforeClose(); out.flush(); out.close(); // now try to read the file back DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); DataFileReader<GenericRecord> fileReader = new DataFileReader<GenericRecord>(testFile, reader); GenericRecord record = new GenericData.Record(fileReader.getSchema()); int numEvents = 0; while (fileReader.hasNext()) { fileReader.next(record); long timestamp = (Long) record.get("timestamp"); String datetime = record.get("datetime").toString(); String hostname = record.get("hostname").toString(); Map<String, String> headers = (Map<String, String>) record.get("headers"); String message = record.get("message").toString(); System.out.println(hostname + " (" + headers + ")" + ": " + message); numEvents++; } fileReader.close(); Assert.assertEquals("Should have found a total of 6 events", 6, numEvents); FileUtils.forceDelete(testFile); }
Example 17
Source File: TestJavaAvroEventSerializer.java From flume-plugins with MIT License | 4 votes |
@Test public void test() throws FileNotFoundException, IOException { // create the file, write some data OutputStream out = new FileOutputStream(testFile); String builderName = JavaLogAvroEventSerializer.Builder.class.getName(); Context ctx = new Context(); ctx.put("syncInterval", "4096"); EventSerializer serializer = EventSerializerFactory.getInstance(builderName, ctx, out); serializer.afterCreate(); // must call this when a file is newly created List<Event> events = generateJavaEvents(); for (Event e : events) { serializer.write(e); } serializer.flush(); serializer.beforeClose(); out.flush(); out.close(); // now try to read the file back DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); DataFileReader<GenericRecord> fileReader = new DataFileReader<GenericRecord>(testFile, reader); GenericRecord record = new GenericData.Record(fileReader.getSchema()); int numEvents = 0; while (fileReader.hasNext()) { fileReader.next(record); long timestamp = (Long) record.get("timestamp"); String datetime = record.get("datetime").toString(); String classname = record.get("classname").toString(); String message = record.get("message").toString(); System.out.println(classname + ": " + message + " (at " + datetime + ")"); numEvents++; } fileReader.close(); Assert.assertEquals("Should have found a total of 4 events", 4, numEvents); FileUtils.forceDelete(testFile); }
Example 18
Source File: TestSyslogAvroEventSerializer.java From mt-flume with Apache License 2.0 | 4 votes |
@Test public void test() throws FileNotFoundException, IOException { // Snappy currently broken on Mac in OpenJDK 7 per FLUME-2012 Assume.assumeTrue(!"Mac OS X".equals(System.getProperty("os.name")) || !System.getProperty("java.version").startsWith("1.7.")); //Schema schema = new Schema.Parser().parse(schemaFile); // create the file, write some data OutputStream out = new FileOutputStream(testFile); String builderName = SyslogAvroEventSerializer.Builder.class.getName(); Context ctx = new Context(); ctx.put("syncInterval", "4096"); ctx.put("compressionCodec", "snappy"); EventSerializer serializer = EventSerializerFactory.getInstance(builderName, ctx, out); serializer.afterCreate(); // must call this when a file is newly created List<Event> events = generateSyslogEvents(); for (Event e : events) { serializer.write(e); } serializer.flush(); serializer.beforeClose(); out.flush(); out.close(); // now try to read the file back DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); DataFileReader<GenericRecord> fileReader = new DataFileReader<GenericRecord>(testFile, reader); GenericRecord record = new GenericData.Record(fileReader.getSchema()); int numEvents = 0; while (fileReader.hasNext()) { fileReader.next(record); int facility = (Integer) record.get("facility"); int severity = (Integer) record.get("severity"); long timestamp = (Long) record.get("timestamp"); String hostname = record.get("hostname").toString(); String message = record.get("message").toString(); Assert.assertEquals("Facility should be 1", 1, facility); System.out.println(timestamp + ": " + message); numEvents++; } fileReader.close(); Assert.assertEquals("Should have found a total of 3 events", 3, numEvents); FileUtils.forceDelete(testFile); }
Example 19
Source File: TestFlumeFailoverTarget.java From datacollector with Apache License 2.0 | 4 votes |
@Test public void testWriteAvroRecords() throws InterruptedException, StageException, IOException { DataGeneratorFormatConfig dataGeneratorFormatConfig = new DataGeneratorFormatConfig(); dataGeneratorFormatConfig.avroSchema = SdcAvroTestUtil.AVRO_SCHEMA1; dataGeneratorFormatConfig.avroSchemaSource = INLINE; dataGeneratorFormatConfig.includeSchema = true; dataGeneratorFormatConfig.avroCompression = AvroCompression.NULL; FlumeTarget flumeTarget = FlumeTestUtil.createFlumeTarget( FlumeTestUtil.createDefaultFlumeConfig(port, false), DataFormat.AVRO, dataGeneratorFormatConfig ); TargetRunner targetRunner = new TargetRunner.Builder(FlumeDTarget.class, flumeTarget).build(); targetRunner.runInit(); List<Record> records = SdcAvroTestUtil.getRecords1(); targetRunner.runWrite(records); targetRunner.runDestroy(); List<GenericRecord> genericRecords = new ArrayList<>(); DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); //Reader schema argument is optional Transaction transaction = ch.getTransaction(); transaction.begin(); Event event = ch.take(); while(event != null) { DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>( new SeekableByteArrayInput(event.getBody()), datumReader); while(dataFileReader.hasNext()) { genericRecords.add(dataFileReader.next()); } event = ch.take(); } transaction.commit(); transaction.close(); Assert.assertEquals(3, genericRecords.size()); SdcAvroTestUtil.compare1(genericRecords); }
Example 20
Source File: TestConvertAvroToParquet.java From nifi with Apache License 2.0 | 4 votes |
/**
 * Builds a one-record Avro file covering every supported type (minus enum)
 * and preloads {@code records} with its contents for the tests.
 */
@Before
public void setUp() throws Exception {
    processor = new ConvertAvroToParquet();
    runner = TestRunners.newTestRunner(processor);

    Schema schema = new Schema.Parser().parse(Resources.getResource("avro/all-minus-enum.avsc").openStream());

    DataFileWriter<Object> awriter = new DataFileWriter<Object>(new GenericDatumWriter<Object>());
    GenericData.Record nestedRecord = new GenericRecordBuilder(
        schema.getField("mynestedrecord").schema())
        .set("mynestedint", 1).build();

    GenericData.Record record = new GenericRecordBuilder(schema)
        .set("mynull", null)
        .set("myboolean", true)
        .set("myint", 1)
        .set("mylong", 2L)
        .set("myfloat", 3.1f)
        .set("mydouble", 4.1)
        .set("mybytes", ByteBuffer.wrap("hello".getBytes(Charsets.UTF_8)))
        .set("mystring", "hello")
        .set("mynestedrecord", nestedRecord)
        .set("myarray", new GenericData.Array<Integer>(Schema.createArray(Schema.create(Schema.Type.INT)), Arrays.asList(1, 2)))
        .set("mymap", ImmutableMap.of("a", 1, "b", 2))
        .set("myfixed", new GenericData.Fixed(Schema.createFixed("ignored", null, null, 1), new byte[] { (byte) 65 }))
        .build();

    awriter.create(schema, tmpAvro);
    awriter.append(record);
    awriter.flush();
    awriter.close();

    DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);
    // try-with-resources: the original never closed the reader (file-handle leak).
    try (DataFileReader<GenericRecord> dataFileReader =
            new DataFileReader<GenericRecord>(tmpAvro, datumReader)) {
        GenericRecord record1 = null;
        while (dataFileReader.hasNext()) {
            record1 = dataFileReader.next(record1);
            records.add(record1);
        }
    }
}