org.apache.avro.generic.GenericDatumReader Java Examples

The following examples show how to use org.apache.avro.generic.GenericDatumReader. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AvroEventDeserializer.java    From mt-flume with Apache License 2.0 6 votes vote down vote up
/**
 * Opens the Avro container on top of the resettable input stream and derives
 * the per-file state (schema, writer, encoder, schema fingerprint) from it.
 *
 * <p>The offset the stream is at on entry is remembered, the stream is rewound
 * to offset 0 before the reader is constructed, and the reader is then synced
 * to the remembered offset.
 *
 * @throws IOException if the container cannot be opened or repositioned
 * @throws NoSuchAlgorithmException if the fingerprint algorithm is unavailable
 */
private void initialize() throws IOException, NoSuchAlgorithmException {
  final SeekableResettableInputBridge input = new SeekableResettableInputBridge(ris);

  // Capture the current offset before rewinding so the reader can be synced back to it.
  final long resumeOffset = input.tell();
  input.seek(0L);

  final GenericDatumReader<GenericRecord> recordReader = new GenericDatumReader<GenericRecord>();
  fileReader = new DataFileReader<GenericRecord>(input, recordReader);
  fileReader.sync(resumeOffset);

  // Everything below is derived from the schema embedded in the file.
  schema = fileReader.getSchema();
  datumWriter = new GenericDatumWriter(schema);
  out = new ByteArrayOutputStream();
  // The previous encoder instance (possibly null) is handed back for reuse.
  encoder = EncoderFactory.get().binaryEncoder(out, encoder);

  schemaHash = SchemaNormalization.parsingFingerprint("CRC-64-AVRO", schema);
  schemaHashString = Hex.encodeHexString(schemaHash);
}
 
Example #2
Source File: AvroUtils.java    From incubator-gobblin with Apache License 2.0 6 votes vote down vote up
/**
 * Get the latest (or oldest) avro schema for a directory.
 *
 * <p>The candidate files come from {@code getDirectorySchemaHelper}; the first
 * entry is used when {@code latest} is true and the last entry otherwise
 * (presumably the helper returns files newest-first; verify against the helper).
 *
 * @param directory the input dir that contains avro files
 * @param fs the {@link FileSystem} for the given directory.
 * @param latest true to return latest schema, false to return oldest schema
 * @return the latest/oldest schema in the directory, or {@code null} when the
 *         directory contains no avro file
 * @throws IOException if the files cannot be listed or the schema cannot be read
 */
public static Schema getDirectorySchema(Path directory, FileSystem fs, boolean latest) throws IOException {
  Schema schema = null;
  try (Closer closer = Closer.create()) {
    List<FileStatus> files = getDirectorySchemaHelper(directory, fs);
    if (files == null || files.isEmpty()) {
      LOG.warn("There is no previous avro file in the directory: " + directory);
    } else {
      FileStatus file = latest ? files.get(0) : files.get(files.size() - 1);
      LOG.debug("Path to get the avro schema: " + file);
      // Register the raw input on its own so it is released even when the
      // DataFileReader constructor fails (e.g. on a corrupt header).
      FsInput fi = closer.register(new FsInput(file.getPath(), fs.getConf()));
      GenericDatumReader<GenericRecord> genReader = new GenericDatumReader<>();
      schema = closer.register(new DataFileReader<>(fi, genReader)).getSchema();
    }
  } catch (IOException ioe) {
    throw new IOException("Cannot get the schema for directory " + directory, ioe);
  }
  return schema;
}
 
Example #3
Source File: TestSelectHive3QL.java    From nifi with Apache License 2.0 6 votes vote down vote up
/**
 * Counts the records contained in an Avro data-file stream.
 *
 * @param in stream positioned at the start of an Avro container; it is closed
 *           together with the wrapping {@code DataFileStream}
 * @return number of records read from the stream
 * @throws IOException if the stream cannot be read
 */
private long getNumberOfRecordsFromStream(InputStream in) throws IOException {
    long total = 0;
    try (DataFileStream<GenericRecord> stream = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {
        // The same record instance is handed back to next() on every iteration so
        // that large files do not allocate one object per record.
        for (GenericRecord reusable = null; stream.hasNext(); total += 1) {
            reusable = stream.next(reusable);
        }
    }
    return total;
}
 
Example #4
Source File: AvroToRestJsonEntryConverterTest.java    From incubator-gobblin with Apache License 2.0 6 votes vote down vote up
/**
 * Converts the first record of {@code /converter/nested.avro} with
 * {@code AvroToRestJsonEntryConverter} and compares the result with
 * {@code expected}.
 *
 * <p>The Avro resource is copied to a temporary file because the reader is
 * opened on a {@link File}. The reader and the converter are closed even when
 * an assertion or the conversion fails.
 *
 * @param expected the entry the conversion must produce
 * @param actualWorkUnitState work unit state handed to the converter
 */
private void testConversion(RestEntry<JsonObject> expected, WorkUnitState actualWorkUnitState) throws DataConversionException, IOException, JSONException {
  Schema schema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/nested.avsc"));
  GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);

  File tmp = File.createTempFile(this.getClass().getSimpleName(), null);
  tmp.deleteOnExit();
  try {
    FileUtils.copyInputStreamToFile(getClass().getResourceAsStream("/converter/nested.avro"), tmp);
    // Resources are closed in reverse order: converter first, then the reader,
    // matching the order of the explicit close() calls this replaces.
    try (DataFileReader<GenericRecord> dataFileReader = new DataFileReader<GenericRecord>(tmp, datumReader);
        AvroToRestJsonEntryConverter converter = new AvroToRestJsonEntryConverter()) {
      GenericRecord avroRecord = dataFileReader.next();

      RestEntry<JsonObject> actual = converter.convertRecord(null, avroRecord, actualWorkUnitState).iterator().next();

      Assert.assertEquals(actual.getResourcePath(), expected.getResourcePath());
      JSONAssert.assertEquals(expected.getRestEntryVal().toString(), actual.getRestEntryVal().toString(), false);
    }
  } finally {
    tmp.delete();
  }
}
 
Example #5
Source File: TestSplitAvro.java    From nifi with Apache License 2.0 6 votes vote down vote up
/**
 * Runs SplitAvro with metadata transfer disabled and verifies that none of the
 * three metadata keys is present in any of the resulting splits.
 */
@Test
public void testRecordSplitDatafileOutputWithoutMetadata() throws IOException {
    final TestRunner runner = TestRunners.newTestRunner(new SplitAvro());
    runner.setProperty(SplitAvro.TRANSFER_METADATA, "false");

    runner.enqueue(users.toByteArray());
    runner.run();

    // 100 splits, the untouched original, and no failures.
    runner.assertTransferCount(SplitAvro.REL_SPLIT, 100);
    runner.assertTransferCount(SplitAvro.REL_ORIGINAL, 1);
    runner.assertTransferCount(SplitAvro.REL_FAILURE, 0);

    final MockFlowFile original = runner.getFlowFilesForRelationship(SplitAvro.REL_ORIGINAL).get(0);
    original.assertAttributeEquals(FRAGMENT_COUNT.key(), "100");

    final List<MockFlowFile> splits = runner.getFlowFilesForRelationship(SplitAvro.REL_SPLIT);
    checkDataFileSplitSize(splits, 1, false);

    for (final MockFlowFile split : splits) {
        try (final ByteArrayInputStream bytes = new ByteArrayInputStream(split.toByteArray());
             final DataFileStream<GenericRecord> stream = new DataFileStream<>(bytes, new GenericDatumReader<GenericRecord>())) {
            final List<String> metaKeys = stream.getMetaKeys();
            Assert.assertFalse(metaKeys.contains(META_KEY1));
            Assert.assertFalse(metaKeys.contains(META_KEY2));
            Assert.assertFalse(metaKeys.contains(META_KEY3));
        }
    }
}
 
Example #6
Source File: MapOutputValue.java    From incubator-pinot with Apache License 2.0 6 votes vote down vote up
/**
 * Decodes a {@code MapOutputValue} from its byte representation.
 *
 * <p>The layout read here is: a 4-byte length, that many bytes of schema
 * name, a 4-byte length, and that many bytes of Avro binary record data. The
 * record is decoded with the schema registered under the name in
 * {@code schemaMap}.
 *
 * @param bytes serialized value
 * @param schemaMap schemas keyed by schema name
 * @return the decoded value
 * @throws IOException if {@code bytes} is truncated or the record cannot be decoded
 */
public static MapOutputValue fromBytes(byte[] bytes, Map<String, Schema> schemaMap)
    throws IOException {
  DataInputStream dataInputStream = new DataInputStream(new ByteArrayInputStream(bytes));
  int length = dataInputStream.readInt();
  byte[] sourceNameBytes = new byte[length];
  // readFully (unlike read) guarantees the whole buffer is filled and raises
  // EOFException on truncated input instead of silently leaving zero bytes.
  dataInputStream.readFully(sourceNameBytes);
  // NOTE(review): decodes with the platform default charset; this must match
  // whatever the serializing side uses -- confirm against the writer.
  String schemaName = new String(sourceNameBytes);

  int recordDataLength = dataInputStream.readInt();

  byte[] recordBytes = new byte[recordDataLength];
  dataInputStream.readFully(recordBytes);
  Schema schema = schemaMap.get(schemaName);
  GenericRecord record = new GenericData.Record(schema);
  // The decoder held in the binaryDecoder field is passed back in for reuse.
  binaryDecoder = DecoderFactory.get().binaryDecoder(recordBytes, binaryDecoder);
  GenericDatumReader<GenericRecord> gdr = new GenericDatumReader<GenericRecord>(schema);
  gdr.read(record, binaryDecoder);
  return new MapOutputValue(schemaName, record);
}
 
Example #7
Source File: AvroDateRangeMetadata.java    From datafu with Apache License 2.0 6 votes vote down vote up
/**
 * Reads the date range from the metadata stored in an Avro file.
 *
 * <p>The first non-hidden entry listed under {@code path} is opened and its
 * {@code METADATA_DATE_START} / {@code METADATA_DATE_END} metadata strings are
 * parsed as epoch milliseconds.
 * 
 * @param fs file system to access path
 * @param path path to get date range for
 * @return date range
 * @throws IOException IOException
 */
public static DateRange getOutputFileDateRange(FileSystem fs, Path path) throws IOException
{
  path = fs.listStatus(path, PathUtils.nonHiddenPathFilter)[0].getPath();
  DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  FSDataInputStream dataInputStream = fs.open(path);
  try
  {
    // Constructing the DataFileStream reads the file header and may throw; the
    // outer finally makes sure the raw stream is still released in that case.
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(dataInputStream, reader);
    try
    {
      return new DateRange(new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_START))),
                           new Date(Long.parseLong(dataFileStream.getMetaString(METADATA_DATE_END))));
    }
    finally
    {
      dataFileStream.close();
    }
  }
  finally
  {
    dataInputStream.close();
  }
}
 
Example #8
Source File: Converter.java    From xml-avro with Apache License 2.0 6 votes vote down vote up
/**
 * Converts the first record of an Avro data file into an XML document and
 * saves it to {@code xmlFile}.
 *
 * @param avroFile Avro container to read; only its first record is converted
 * @param xmlFile destination of the generated XML document
 * @throws IOException if the Avro file cannot be read
 */
public static void avroToXml(File avroFile, File xmlFile) throws IOException {
    DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(protocol.getType("Element"));

    // The reader holds an open file handle; release it as soon as the single
    // record has been materialized.
    GenericRecord record;
    try (DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(avroFile, datumReader)) {
        record = dataFileReader.next();
    }

    Document doc;
    try {
        doc = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
    } catch (ParserConfigurationException e) {
        throw new RuntimeException(e);
    }

    Element el = unwrapElement(record, doc);
    doc.appendChild(el);

    saveDocument(doc, xmlFile);
}
 
Example #9
Source File: TestSplitAvro.java    From localization_nifi with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that every flow file is an Avro data file holding exactly
 * {@code expectedRecordsPerSplit} records with non-null {@code name} and
 * {@code favorite_number} fields, and optionally that the three metadata
 * entries carry their expected values.
 *
 * @param flowFiles split flow files to inspect
 * @param expectedRecordsPerSplit record count each split must contain
 * @param checkMetadata whether the metadata entries are verified as well
 * @throws IOException if a flow file cannot be read as an Avro data file
 */
private void checkDataFileSplitSize(List<MockFlowFile> flowFiles, int expectedRecordsPerSplit, boolean checkMetadata) throws IOException {
    for (final MockFlowFile split : flowFiles) {
        try (final ByteArrayInputStream bytes = new ByteArrayInputStream(split.toByteArray());
            final DataFileStream<GenericRecord> stream = new DataFileStream<>(bytes, new GenericDatumReader<GenericRecord>())) {

            int seen = 0;
            // The record instance is recycled across iterations.
            for (GenericRecord current = null; stream.hasNext(); seen++) {
                current = stream.next(current);
                Assert.assertNotNull(current.get("name"));
                Assert.assertNotNull(current.get("favorite_number"));
            }
            assertEquals(expectedRecordsPerSplit, seen);

            if (!checkMetadata) {
                continue;
            }
            assertEquals(META_VALUE1, stream.getMetaString(META_KEY1));
            assertEquals(META_VALUE2, stream.getMetaLong(META_KEY2));
            assertEquals(META_VALUE3, new String(stream.getMeta(META_KEY3), "UTF-8"));
        }
    }
}
 
Example #10
Source File: AvroMorphlineTest.java    From kite with Apache License 2.0 6 votes vote down vote up
/**
 * Feeds the sample Avro container through the given morphline three times and
 * checks, record by record, that the morphline output matches what a plain
 * Avro reader yields for the same file.
 *
 * @param morphlineConfigFile morphline configuration to load
 * @param fieldNames fields compared between expected and actual records
 */
private void runTweetContainer(String morphlineConfigFile, String[] fieldNames) throws Exception {
  File file = new File(RESOURCES_DIR + "/test-documents/sample-statuses-20120906-141433-medium.avro");
  morphline = createMorphline(morphlineConfigFile);    
  for (int j = 0; j < 3; j++) { // also test reuse of objects and low level avro buffers
    Record record = new Record();
    byte[] body = Files.toByteArray(file);    
    record.put(Fields.ATTACHMENT_BODY, body);
    collector.reset();
    startSession();
    Notifications.notifyBeginTransaction(morphline);
    assertTrue(morphline.process(record));
    assertEquals(1, collector.getNumStartEvents());
    assertEquals(2104, collector.getRecords().size());
    
    FileReader<GenericData.Record> reader =
        new DataFileReader<GenericData.Record>(file, new GenericDatumReader<GenericData.Record>());
    int i = 0;
    try {
      while (reader.hasNext()) {
        Record actual = collector.getRecords().get(i);
        GenericData.Record expected = reader.next();
        assertTweetEquals(expected, actual, fieldNames, i);
        i++;
      }
    } finally {
      // One reader is opened per pass; close it so file handles do not pile up.
      reader.close();
    }
    assertEquals(collector.getRecords().size(), i);
  }
}
 
Example #11
Source File: AvroScanner.java    From tajo with Apache License 2.0 6 votes vote down vote up
/**
 * Initializes the AvroScanner: resolves the projection targets, allocates the
 * output tuple, and opens an Avro data-file reader on the fragment's path.
 *
 * @throws IOException if the Avro file cannot be opened
 */
@Override
public void init() throws IOException {
  // Default to projecting every column of the schema.
  if (targets == null) {
    targets = schema.toArray();
  }
  prepareProjection(targets);
  outTuple = new VTuple(projectionMap.length);

  final Schema fileSchema = AvroUtil.getAvroSchema(meta, conf);
  avroFields = fileSchema.getFields();

  final DatumReader<GenericRecord> recordReader = new GenericDatumReader<>(fileSchema);
  final SeekableInput fragmentInput = new FsInput(fragment.getPath(), conf);
  dataFileReader = new DataFileReader<>(fragmentInput, recordReader);

  super.init();
}
 
Example #12
Source File: TestHelper.java    From incubator-gobblin with Apache License 2.0 6 votes vote down vote up
/**
 * Asserts that the Avro file holds exactly three records whose {@code name}
 * fields are, in order, "Alyssa", "Ben" and "Charlie".
 *
 * @param outputAvroFile Avro data file to verify
 * @param schema schema used to read the file
 * @throws IOException if the file cannot be read
 */
public static void assertGenericRecords(File outputAvroFile, Schema schema) throws IOException {
  final String[] expectedNames = {"Alyssa", "Ben", "Charlie"};
  final GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema);

  try (DataFileReader<GenericRecord> fileReader = new DataFileReader<>(outputAvroFile, datumReader)) {
    final Iterator<GenericRecord> records = fileReader.iterator();
    for (String expectedName : expectedNames) {
      final GenericRecord current = records.next();
      Assert.assertEquals(current.get("name").toString(), expectedName);
    }
    // Nothing may follow the three expected records.
    Assert.assertFalse(records.hasNext());
  }
}
 
Example #13
Source File: OracleGenericSchemaDecoder.java    From DBus with Apache License 2.0 6 votes vote down vote up
/**
 * Loads the schemas and schema hashes used by this decoder from
 * {@code OracleGenericSchemaProvider} and creates the generic-wrapper datum
 * reader and writer.
 *
 * <p>Any failure is logged and swallowed, in which case the fields assigned
 * after the failing statement keep their previous values.
 */
private void initDecoder() {
    try {
        genericSchema = OracleGenericSchemaProvider.getInstance().getSchema("generic_wrapper.avsc");

        fullPullSchema = OracleGenericSchemaProvider.getInstance().getSchema("DBUS.DB_FULL_PULL_REQUESTS.avsc");
        fullPullHash = OracleGenericSchemaProvider.getInstance().getSchemaHash("DBUS.DB_FULL_PULL_REQUESTS.avsc");

        syncEventSchema = OracleGenericSchemaProvider.getInstance().getSchema("DBUS.META_SYNC_EVENT.avsc");
        syncEventHash = OracleGenericSchemaProvider.getInstance().getSchemaHash("DBUS.META_SYNC_EVENT.avsc");

        heartbeatSchema = OracleGenericSchemaProvider.getInstance().getSchema("DBUS.DB_HEARTBEAT_MONITOR.avsc");
        heartbeatHash = OracleGenericSchemaProvider.getInstance().getSchemaHash("DBUS.DB_HEARTBEAT_MONITOR.avsc");

        datumReader = new GenericDatumReader<>(genericSchema);
        datumWriter = new GenericDatumWriter<>(genericSchema);
    } catch (Exception e) {
        // The logger call already records the throwable with its stack trace;
        // no separate dump to stderr is needed.
        logger.error("OracleGenericSchemaDecoder Initialization Error!", e);
    }
}
 
Example #14
Source File: AvroRecordWriterTest.java    From data-highway with Apache License 2.0 6 votes vote down vote up
/**
 * Writes a single record through the factory-created {@code RecordWriter} and
 * verifies that reading the produced bytes back as an Avro data file yields
 * exactly that record.
 */
@Test
public void typical() throws Exception {
  Schema schema = SchemaBuilder
      .builder()
      .record("record")
      .fields()
      .requiredLong("id")
      .requiredString("name")
      .endRecord();
  Record value = new GenericRecordBuilder(schema).set("id", 1L).set("name", "hello").build();
  ByteArrayOutputStream output = new ByteArrayOutputStream();

  Factory factory = new Factory(CodecFactory.nullCodec());
  RecordWriter writer = factory.create(schema, output);
  writer.write(value);
  writer.close();

  SeekableInput input = new SeekableByteArrayInput(output.toByteArray());
  DatumReader<Record> datumReader = new GenericDatumReader<>(schema);
  // try-with-resources closes the reader even when an assertion fails.
  try (DataFileReader<Record> dataFileReader = new DataFileReader<>(input, datumReader)) {
    assertThat(dataFileReader.next(), is(value));
    assertThat(dataFileReader.hasNext(), is(false));
  }
}
 
Example #15
Source File: KafkaAvroExtractor.java    From incubator-gobblin with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the extractor, resolving the optional schema registry from
 * {@code state} and building a datum reader when an extractor schema is
 * available. When no schema is found an error is logged and the reader is
 * left absent.
 *
 * @param state work unit state carrying the extractor configuration
 */
public KafkaAvroExtractor(WorkUnitState state) {
  super(state);

  // A registry is only instantiated when a registry class is configured.
  if (state.contains(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_CLASS)) {
    this.schemaRegistry = Optional.of(KafkaSchemaRegistry.<K, Schema> get(state.getProperties()));
  } else {
    this.schemaRegistry = Optional.<KafkaSchemaRegistry<K, Schema>> absent();
  }

  this.schema = getExtractorSchema();
  if (!this.schema.isPresent()) {
    log.error(String.format("Cannot find latest schema for topic %s. This topic will be skipped", this.topicName));
    this.reader = Optional.absent();
  } else {
    this.reader = Optional.of(new GenericDatumReader<Record>(this.schema.get()));
  }
}
 
Example #16
Source File: FixedFlowInputBoundedReader.java    From components with Apache License 2.0 5 votes vote down vote up
/**
 * Lazily rebuilds the Avro {@code Schema} from its string form and creates the
 * matching datum reader. Needed because Avro {@code Schema} objects are not
 * serializable; calling this when the schema is already set is a no-op.
 */
public void deserializeSchema() {
    if (schema != null) {
        return;
    }
    schema = new Schema.Parser().parse(schemaString);
    reader = new GenericDatumReader<>(schema);
}
 
Example #17
Source File: TestWriteAvroResultWithSchema.java    From nifi with Apache License 2.0 5 votes vote down vote up
/**
 * Reads {@code recordCount} records from an Avro data-file stream using the
 * schema embedded in the stream; the {@code schema} argument is not consulted.
 * The embedded schema is marked to use {@code StringType.String} before any
 * record is read. The stream is not closed here.
 */
@Override
protected List<GenericRecord> readRecords(final InputStream in, final Schema schema, final int recordCount) throws IOException {
    final DataFileStream<GenericRecord> stream = new DataFileStream<>(in, new GenericDatumReader<>());
    final Schema embeddedSchema = stream.getSchema();
    GenericData.setStringType(embeddedSchema, StringType.String);

    final List<GenericRecord> result = new ArrayList<>();
    while (result.size() < recordCount) {
        result.add(stream.next());
    }
    return result;
}
 
Example #18
Source File: TestDataModelUtil.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * {@code DataModelUtil.getDatumReaderForType} must hand out a plain
 * {@code GenericDatumReader} when asked for {@code GenericData.Record}.
 */
@Test
public void testGetDatumReaderForGenericType() {
  DatumReader reader =
      DataModelUtil.getDatumReaderForType(GenericData.Record.class, StandardEvent.getClassSchema());
  assertEquals(GenericDatumReader.class, reader.getClass());
}
 
Example #19
Source File: AvroUtils.java    From ml-ease with Apache License 2.0 5 votes vote down vote up
/**
 * Loads the schema from an Avro data file.
 *
 * <p>The file is opened only long enough to read the schema from its header;
 * both the Avro stream and the underlying file-system stream are closed before
 * returning.
 * 
 * @param conf The JobConf.
 * @param path The path to the data file.
 * @return The schema read from the data file's metadata.
 * @throws IOException if the file cannot be opened or is not a valid Avro data file
 */
public static Schema getSchemaFromFile(JobConf conf, Path path) throws IOException
{
  // NOTE(review): the file system is resolved from a fresh Configuration, not
  // from the conf argument -- confirm whether conf was meant to be used here.
  FileSystem fs = path.getFileSystem(new Configuration());
  DatumReader <GenericRecord> reader = new GenericDatumReader<GenericRecord>();
  FSDataInputStream dataInputStream = fs.open(path);
  try
  {
    // The DataFileStream constructor parses the header and may throw; the outer
    // finally still releases the raw stream in that case.
    DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(dataInputStream, reader);
    try
    {
      return dataFileStream.getSchema();
    }
    finally
    {
      dataFileStream.close();
    }
  }
  finally
  {
    dataInputStream.close();
  }
}
 
Example #20
Source File: ServerSinkSourceConfigurationTest.java    From divolte-collector with Apache License 2.0 5 votes vote down vote up
/**
 * Reads every record of an Avro file into memory and returns them as a stream.
 *
 * @param avroFile Avro data file to read
 * @return stream over an immutable snapshot of the file's records
 * @throws UncheckedIOException if the file cannot be read
 */
private static Stream<GenericRecord> listRecords(final Path avroFile) {
    final GenericDatumReader<GenericRecord> recordReader = new GenericDatumReader<>();
    logger.debug("Reading records from new Avro file: {}", avroFile);

    final ImmutableList<GenericRecord> snapshot;
    try (final FileReader<GenericRecord> avroReader = DataFileReader.openReader(avroFile.toFile(), recordReader)) {
        // Materialize everything before the reader is closed.
        snapshot = ImmutableList.copyOf(avroReader.iterator());
        logger.info("Read {} record(s) from new Avro file: {}", snapshot.size(), avroFile);
    } catch (final IOException e) {
        throw new UncheckedIOException("Error reading records from file: " + avroFile, e);
    }
    return snapshot.stream();
}
 
Example #21
Source File: AvroCodec.java    From schema-evolution-samples with Apache License 2.0 5 votes vote down vote up
/**
 * Picks the datum reader flavour matching {@code type}: specific for
 * {@code SpecificRecord} subtypes, generic for {@code GenericRecord} subtypes,
 * reflect for everything else. The reader schema is obtained from
 * {@code getReaderSchema(writer)}.
 *
 * @param type the target Java type
 * @param writer the schema the data was written with
 * @return a datum reader resolving {@code writer} against the reader schema
 */
private DatumReader getDatumReader(Class<?> type, Schema writer){
	if(SpecificRecord.class.isAssignableFrom(type)){
		return new SpecificDatumReader<>(writer,getReaderSchema(writer));
	}
	if(GenericRecord.class.isAssignableFrom(type)){
		return new GenericDatumReader<>(writer,getReaderSchema(writer));
	}
	return new ReflectDatumReader<>(writer,getReaderSchema(writer));
}
 
Example #22
Source File: TestExecuteSQL.java    From nifi with Apache License 2.0 5 votes vote down vote up
/**
 * Runs ExecuteSQL with BZIP2 compression against a freshly populated table and
 * verifies that the resulting Avro data file declares the bzip2 codec.
 */
@Test
public void testCompression() throws SQLException, CompressorException, IOException {
    // remove previous test database, if any
    final File dbLocation = new File(DB_LOCATION);
    dbLocation.delete();

    // load test data to database; connection and statement are released as
    // soon as the fixture is in place
    try (final Connection con = ((DBCPService) runner.getControllerService("dbcp")).getConnection();
         final Statement stmt = con.createStatement()) {

        try {
            stmt.execute("drop table TEST_NULL_INT");
        } catch (final SQLException ignored) {
            // expected when the table does not exist yet
        }

        stmt.execute("create table TEST_NULL_INT (id integer not null, val1 integer, val2 integer, constraint my_pk primary key (id))");

        stmt.execute("insert into TEST_NULL_INT (id, val1, val2) VALUES (0, NULL, 1)");
        stmt.execute("insert into TEST_NULL_INT (id, val1, val2) VALUES (1, 1, 1)");
    }

    runner.setIncomingConnection(false);
    runner.setProperty(ExecuteSQL.COMPRESSION_FORMAT, AvroUtil.CodecType.BZIP2.name());
    runner.setProperty(ExecuteSQL.SQL_SELECT_QUERY, "SELECT * FROM TEST_NULL_INT");
    runner.run();

    runner.assertAllFlowFilesTransferred(ExecuteSQL.REL_SUCCESS, 1);

    MockFlowFile flowFile = runner.getFlowFilesForRelationship(ExecuteSQL.REL_SUCCESS).get(0);

    try (DataFileStream<GenericRecord> dfs = new DataFileStream<>(new ByteArrayInputStream(flowFile.toByteArray()), new GenericDatumReader<GenericRecord>())) {
        assertEquals(AvroUtil.CodecType.BZIP2.name().toLowerCase(), dfs.getMetaString(DataFileConstants.CODEC).toLowerCase());
    }
}
 
Example #23
Source File: CustomAvroRecordPreparer.java    From pxf with Apache License 2.0 5 votes vote down vote up
/**
 * Parses the Avro schema from the file named by {@code schema_name} and
 * creates the record, datum writer, datum reader and encoder factory that
 * depend on it.
 *
 * @throws Exception if the schema file cannot be opened or parsed
 */
private void initAvro() throws Exception {
    // try-with-resources releases the file handle even when parsing fails.
    try (FileInputStream fis = new FileInputStream(schema_name)) {
        schema = new Schema.Parser().parse(fis);
    }
    datum = new GenericData.Record(schema);
    writer = new GenericDatumWriter<>(schema);
    reader = new GenericDatumReader<>(schema);
    fct_en = EncoderFactory.get();
}
 
Example #24
Source File: TestPutHive3Streaming.java    From nifi with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies that the flow file is an Avro data file whose schema equals
 * {@code src/test/resources/user.avsc} and whose records match
 * {@code expectedRecords} in order ({@code name} and {@code favorite_number}
 * are compared; {@code favorite_color} and {@code scale} must be null).
 *
 * @param expectedRecords expected field values, one map per record
 * @param resultFlowFile flow file holding the Avro output
 * @throws IOException if the flow file content cannot be read as Avro
 */
private void assertOutputAvroRecords(List<Map<String, Object>> expectedRecords, MockFlowFile resultFlowFile) throws IOException {
    assertEquals(String.valueOf(expectedRecords.size()), resultFlowFile.getAttribute(PutHive3Streaming.HIVE_STREAMING_RECORD_COUNT_ATTR));

    // try-with-resources closes the stream even when an assertion fails.
    try (final DataFileStream<GenericRecord> reader = new DataFileStream<>(
            new ByteArrayInputStream(resultFlowFile.toByteArray()),
            new GenericDatumReader<>())) {

        Schema schema = reader.getSchema();

        // Verify that the schema is preserved (expected value first, actual second)
        assertEquals(new Schema.Parser().parse(new File("src/test/resources/user.avsc")), schema);

        GenericRecord record = null;
        for (Map<String, Object> expectedRecord : expectedRecords) {
            assertTrue(reader.hasNext());
            record = reader.next(record);
            final String name = record.get("name").toString();
            final Integer favorite_number = (Integer) record.get("favorite_number");
            assertNotNull(name);
            assertNotNull(favorite_number);
            assertNull(record.get("favorite_color"));
            assertNull(record.get("scale"));

            assertEquals(expectedRecord.get("name"), name);
            assertEquals(expectedRecord.get("favorite_number"), favorite_number);
        }
        assertFalse(reader.hasNext());
    }
}
 
Example #25
Source File: AvroUtilsTest.java    From kite with Apache License 2.0 5 votes vote down vote up
/**
 * Decodes a single byte with an {@code int} schema through
 * {@code AvroUtils.readAvroEntity} and expects the value -1.
 */
@Test
public void testReadAvroEntity() throws Exception {
  final String intSchemaJson = "{ \"type\": \"int\" }";
  final Schema intSchema = parser.parse(new ByteArrayInputStream(intSchemaJson.getBytes()));

  // Avro ints are zig-zag encoded, so the single byte 0x01 stands for -1.
  final byte[] encoded = { (byte) 1 };
  final DatumReader<Integer> intReader = new GenericDatumReader<Integer>(intSchema);

  final Integer decoded = AvroUtils.readAvroEntity(encoded, intReader);
  assertEquals(-1, decoded.intValue());
}
 
Example #26
Source File: AvroDataFileParser.java    From datacollector with Apache License 2.0 5 votes vote down vote up
/**
 * Opens an Avro data file for parsing and, when a non-trivial offset is given,
 * restores the read position from it.
 *
 * <p>An offset is a string split on {@code OFFSET_SEPARATOR} into either
 * {@code sync, count} or {@code fileName, sync, count}; a null, empty or
 * {@code "0"} offset starts from the reader's initial sync position.
 *
 * @throws IOException if the file cannot be opened
 * @throws IllegalArgumentException if the offset has neither two nor three parts
 */
public AvroDataFileParser(ProtoConfigurableEntity.Context context, Schema schema, File file, String readerOffset, int maxObjectLength, boolean skipUnionIndexes)
  throws IOException {
  this.context = context;
  this.file = file;
  this.skipUnionIndexes = skipUnionIndexes;

  final DatumReader<GenericRecord> recordReader = new GenericDatumReader<>(schema, schema, GenericData.get());
  sin = new SeekableOverrunFileInputStream(new FileInputStream(file), maxObjectLength, true);
  dataFileReader = new DataFileReader<>(sin, recordReader);

  final boolean startFromBeginning =
      readerOffset == null || readerOffset.isEmpty() || "0".equals(readerOffset);
  if (startFromBeginning) {
    recordCount = 0;
    previousSync = dataFileReader.previousSync();
  } else {
    final String[] parts = readerOffset.split(OFFSET_SEPARATOR);
    if (parts.length != 2 && parts.length != 3) {
      throw new IllegalArgumentException(Utils.format("Invalid offset {}", readerOffset));
    }
    // With three parts the first one is the file name, so the sync position
    // and record count are always the last two entries.
    final int syncIndex = parts.length - 2;
    previousSync = Long.parseLong(parts[syncIndex]);
    recordCount = Long.parseLong(parts[syncIndex + 1]);
    seekToOffset();
  }
}
 
Example #27
Source File: AvroIOTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Writes a small collection with string, long and byte-array metadata through
 * {@code AvroIO.write} and checks that the three metadata entries can be read
 * back from the produced Avro file.
 */
@Test
@SuppressWarnings("unchecked")
@Category(NeedsRunner.class)
public void testMetadata() throws Exception {
  final List<GenericClass> values =
      ImmutableList.of(new GenericClass(3, "hi"), new GenericClass(5, "bar"));
  final File outputFile = tmpFolder.newFile("output.avro");
  final String outputPath = outputFile.getAbsolutePath();

  writePipeline
      .apply(Create.of(values))
      .apply(
          AvroIO.write(GenericClass.class)
              .to(outputPath)
              .withoutSharding()
              .withMetadata(
                  ImmutableMap.of(
                      "stringKey",
                      "stringValue",
                      "longKey",
                      100L,
                      "bytesKey",
                      "bytesValue".getBytes(Charsets.UTF_8))));
  writePipeline.run();

  // Only the file-level metadata is inspected; no records are read.
  try (DataFileStream avroStream =
      new DataFileStream(new FileInputStream(outputFile), new GenericDatumReader())) {
    assertEquals("stringValue", avroStream.getMetaString("stringKey"));
    assertEquals(100L, avroStream.getMetaLong("longKey"));
    final byte[] expectedBytes = "bytesValue".getBytes(Charsets.UTF_8);
    assertArrayEquals(expectedBytes, avroStream.getMeta("bytesKey"));
  }
}
 
Example #28
Source File: AvroCoder.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a coder for {@code type} that encodes and decodes with {@code schema}.
 *
 * <p>The schema is wrapped in a {@code SerializableSchemaSupplier}, the
 * type/schema pair is run through {@code AvroDeterminismChecker}, and
 * thread-local holders are set up for the decoder, encoder, datum reader and
 * datum writer.
 *
 * @param type the Java type handled by this coder
 * @param schema the Avro schema used for reading and writing
 */
protected AvroCoder(Class<T> type, Schema schema) {
  this.type = type;
  this.schemaSupplier = new SerializableSchemaSupplier(schema);
  typeDescriptor = TypeDescriptor.of(type);
  // Result of the determinism check for this type/schema pair.
  nonDeterministicReasons = new AvroDeterminismChecker().check(TypeDescriptor.of(type), schema);

  // Decoder and Encoder start off null for each thread. They are allocated and potentially
  // reused inside encode/decode.
  this.decoder = new EmptyOnDeserializationThreadLocal<>();
  this.encoder = new EmptyOnDeserializationThreadLocal<>();

  // The ReflectData instance is created on first use and then memoized.
  this.reflectData = Suppliers.memoize(new SerializableReflectDataSupplier(getType()));

  // Reader and writer are allocated once per thread per Coder
  this.reader =
      new EmptyOnDeserializationThreadLocal<DatumReader<T>>() {
        // NOTE(review): the enclosing coder is captured in an explicit field;
        // presumably related to how this thread-local is (de)serialized -- confirm.
        private final AvroCoder<T> myCoder = AvroCoder.this;

        @Override
        public DatumReader<T> initialValue() {
          // Generic reader when the coder's type is exactly GenericRecord,
          // reflect-based reader otherwise.
          return myCoder.getType().equals(GenericRecord.class)
              ? new GenericDatumReader<>(myCoder.getSchema())
              : new ReflectDatumReader<>(
                  myCoder.getSchema(), myCoder.getSchema(), myCoder.reflectData.get());
        }
      };

  this.writer =
      new EmptyOnDeserializationThreadLocal<DatumWriter<T>>() {
        private final AvroCoder<T> myCoder = AvroCoder.this;

        @Override
        public DatumWriter<T> initialValue() {
          // Same selection rule as for the reader above.
          return myCoder.getType().equals(GenericRecord.class)
              ? new GenericDatumWriter<>(myCoder.getSchema())
              : new ReflectDatumWriter<>(myCoder.getSchema(), myCoder.reflectData.get());
        }
      };
}
 
Example #29
Source File: AvroFieldsPickConverterTest.java    From incubator-gobblin with Apache License 2.0 5 votes vote down vote up
/**
 * Picks a set of top-level and nested fields (including one inside a union)
 * with {@code AvroFieldsPickConverter} and checks both the converted schema
 * and every converted record against the expected resources.
 */
@Test
public void testFieldsPickWithNestedRecord() throws Exception {
  final Schema sourceSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/pickfields_nested_with_union.avsc"));

  final WorkUnitState state = new WorkUnitState();
  state.setProp(ConfigurationKeys.CONVERTER_AVRO_FIELD_PICK_FIELDS, "name,favorite_number,nested1.nested1_string,nested1.nested2_union.nested2_string");

  try (AvroFieldsPickConverter converter = new AvroFieldsPickConverter()) {
    final Schema pickedSchema = converter.convertSchema(sourceSchema, state);
    final Schema wantedSchema = new Schema.Parser().parse(getClass().getResourceAsStream("/converter/converted_pickfields_nested_with_union.avsc"));
    JSONAssert.assertEquals(wantedSchema.toString(), pickedSchema.toString(), false);

    try (DataFileReader<GenericRecord> sourceRecords = new DataFileReader<GenericRecord>(
            new File(getClass().getResource("/converter/pickfields_nested_with_union.avro").toURI()),
                new GenericDatumReader<GenericRecord>(sourceSchema));
        DataFileReader<GenericRecord> wantedRecords = new DataFileReader<GenericRecord>(
            new File(getClass().getResource("/converter/converted_pickfields_nested_with_union.avro").toURI()),
                new GenericDatumReader<GenericRecord>(wantedSchema));) {

      // Walk both files in lock-step, driven by the expected file.
      for (GenericRecord wanted : wantedRecords) {
        final GenericRecord source = sourceRecords.next();
        final GenericRecord converted = converter.convertRecord(pickedSchema, source, state).iterator().next();
        Assert.assertEquals(converted, wanted);
      }
      // The source file must not hold more records than the expected file.
      Assert.assertTrue(!sourceRecords.hasNext());
    }
  }
}
 
Example #30
Source File: KinesisInputRuntime.java    From components with Apache License 2.0 5 votes vote down vote up
/**
 * Decodes the element's bytes as an Avro binary record and emits it.
 *
 * <p>The schema and datum reader are built from {@code schemaStr} the first
 * time this runs (while {@code schema} is still null); the binary decoder is
 * reused across calls.
 *
 * @param c context providing the input element and receiving the decoded record
 * @throws IOException if the bytes cannot be decoded
 */
@DoFn.ProcessElement
public void processElement(ProcessContext c) throws IOException {
    final boolean needsInit = (schema == null);
    if (needsInit) {
        schema = new Schema.Parser().parse(schemaStr);
        datumReader = new GenericDatumReader<GenericRecord>(schema);
    }

    decoder = DecoderFactory.get().binaryDecoder(c.element().getDataAsBytes(), decoder);

    final GenericRecord decoded = datumReader.read(null, decoder);
    c.output(decoded);
}