org.apache.beam.sdk.coders.AvroCoder Java Examples

The following examples show how to use org.apache.beam.sdk.coders.AvroCoder. The examples are taken from open source projects; the source file, originating project, and license are noted above each example.
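Before the project examples, here is a minimal, self-contained sketch of the recurring pattern: build a coder for a user-defined type with AvroCoder.of and attach it to a PCollection (via Create.withCoder or setCoder). The MyRecord POJO and the pipeline wiring below are hypothetical illustrations, not code from any of the referenced projects.

import org.apache.avro.reflect.Nullable;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;

public class AvroCoderSketch {

  // AvroCoder encodes this type via Avro reflection, so it needs a no-arg
  // constructor and @Nullable (from org.apache.avro.reflect) on nullable fields.
  static class MyRecord {
    String name;
    @Nullable Integer count;

    MyRecord() {}

    MyRecord(String name, Integer count) {
      this.name = name;
      this.count = count;
    }
  }

  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Attach the coder explicitly, as most of the examples below do.
    PCollection<MyRecord> records =
        pipeline.apply(
            Create.of(new MyRecord("a", 1), new MyRecord("b", null))
                .withCoder(AvroCoder.of(MyRecord.class)));

    pipeline.run().waitUntilFinish();
  }
}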
Example #1
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testSchemaStringIsInterned() throws Exception {
  List<Bird> birds = createRandomRecords(100);
  String filename =
      generateTestFile(
          "tmp.avro",
          birds,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);
  Metadata fileMetadata = FileSystems.matchSingleFileSpec(filename);
  String schema = AvroSource.readMetadataFromFile(fileMetadata.resourceId()).getSchemaString();
  // Add "" to the schema to make sure it is not interned.
  AvroSource<GenericRecord> sourceA = AvroSource.from(filename).withSchema("" + schema);
  AvroSource<GenericRecord> sourceB = AvroSource.from(filename).withSchema("" + schema);
  assertSame(sourceA.getReaderSchemaString(), sourceB.getReaderSchemaString());

  // Ensure that deserialization still goes through interning
  AvroSource<GenericRecord> sourceC = SerializableUtils.clone(sourceB);
  assertSame(sourceA.getReaderSchemaString(), sourceC.getReaderSchemaString());
}
 
Example #2
Source File: WriteToGCSParquet.java    From DataflowTemplates with Apache License 2.0
@Override
public WriteFilesResult<Void> expand(PCollection<KV<String, String>> kafkaRecords) {
  return kafkaRecords
      /*
       * Converting KV<String, String> records to GenericRecord using DoFn and {@link
       * KeyValueToGenericRecordFn} class.
       */
      .apply("Create GenericRecord(s)", ParDo.of(new KeyValueToGenericRecordFn()))
      .setCoder(AvroCoder.of(GenericRecord.class, KeyValueToGenericRecordFn.SCHEMA))
      /*
       * Writing as parquet file using {@link FileIO} and {@link ParquetIO}.
       *
       * The {@link WindowedFilenamePolicy} class specifies the file path for writing the file.
       * The {@link withNumShards} option specifies the number of shards passed by the user.
       */
      .apply(
          "Writing as Parquet",
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(KeyValueToGenericRecordFn.SCHEMA))
              .to(outputDirectory())
              .withPrefix(outputFilenamePrefix())
              .withSuffix(
                  WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.PARQUET))
              .withNumShards(numShards()));
}
 
Example #3
Source File: ConfluentSchemaRegistryDeserializerProviderTest.java    From beam with Apache License 2.0
@Test
public void testGetCoder() {
  String schemaRegistryUrl = "mock://my-scope-name";
  String subject = "mytopic";
  SchemaRegistryClient mockRegistryClient = mockSchemaRegistryClient(schemaRegistryUrl, subject);
  CoderRegistry coderRegistry = CoderRegistry.createDefault();

  AvroCoder coderV0 =
      (AvroCoder)
          mockDeserializerProvider(schemaRegistryUrl, subject, null).getCoder(coderRegistry);
  assertEquals(AVRO_SCHEMA, coderV0.getSchema());

  try {
    Integer version = mockRegistryClient.register(subject, AVRO_SCHEMA_V1);
    AvroCoder coderV1 =
        (AvroCoder)
            mockDeserializerProvider(schemaRegistryUrl, subject, version).getCoder(coderRegistry);
    assertEquals(AVRO_SCHEMA_V1, coderV1.getSchema());
  } catch (IOException | RestClientException e) {
    throw new RuntimeException("Unable to register schema for subject: " + subject, e);
  }
}
 
Example #4
Source File: HadoopFormatIOReadTest.java    From beam with Apache License 2.0
/**
 * This test verifies that the method {@link
 * HadoopInputFormatBoundedSource.HadoopInputFormatReader#getCurrentSource() getCurrentSource()}
 * returns the correct source object.
 */
@Test
public void testGetCurrentSourceFunction() throws Exception {
  SerializableSplit split = new SerializableSplit();
  BoundedSource<KV<Text, Employee>> source =
      new HadoopInputFormatBoundedSource<>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null, // No key translation required.
          null, // No value translation required.
          split);
  BoundedReader<KV<Text, Employee>> hifReader = source.createReader(p.getOptions());
  BoundedSource<KV<Text, Employee>> hifSource = hifReader.getCurrentSource();
  assertEquals(hifSource, source);
}
 
Example #5
Source File: PubsubIOTest.java    From beam with Apache License 2.0
@Test
public void testAvroGenericRecords() {
  AvroCoder<GenericRecord> coder = AvroCoder.of(GenericRecord.class, SCHEMA);
  List<GenericRecord> inputs =
      ImmutableList.of(
          new AvroGeneratedUser("Bob", 256, null),
          new AvroGeneratedUser("Alice", 128, null),
          new AvroGeneratedUser("Ted", null, "white"));
  setupTestClient(inputs, coder);
  PCollection<GenericRecord> read =
      readPipeline.apply(
          PubsubIO.readAvroGenericRecords(SCHEMA)
              .fromSubscription(SUBSCRIPTION.getPath())
              .withClock(CLOCK)
              .withClientFactory(clientFactory));
  PAssert.that(read).containsInAnyOrder(inputs);
  readPipeline.run();
}
 
Example #6
Source File: HadoopFormatIOReadTest.java    From beam with Apache License 2.0
/**
 * This test validates the behavior of HadoopInputFormatSource if {@link
 * InputFormat#createRecordReader(InputSplit, TaskAttemptContext) createRecordReader(InputSplit,
 * TaskAttemptContext)} of the InputFormat returns null.
 */
@Test
public void testReadWithNullCreateRecordReader() throws Exception {
  InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
  thrown.expect(IOException.class);
  thrown.expectMessage(
      String.format("Null RecordReader object returned by %s", mockInputFormat.getClass()));
  Mockito.when(
          mockInputFormat.createRecordReader(
              Mockito.any(InputSplit.class), Mockito.any(TaskAttemptContext.class)))
      .thenReturn(null);
  HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
      new HadoopInputFormatBoundedSource<>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null, // No key translation required.
          null, // No value translation required.
          new SerializableSplit());
  boundedSource.setInputFormatObj(mockInputFormat);
  SourceTestUtils.readFromSource(boundedSource, p.getOptions());
}
 
Example #7
Source File: HadoopFormatIOReadTest.java    From beam with Apache License 2.0
/**
 * This test validates that records emitted in a PCollection are immutable if the InputFormat's {@link
 * org.apache.hadoop.mapreduce.RecordReader RecordReader} returns different objects (i.e.
 * different locations in memory).
 */
@Test
public void testImmutablityOfOutputOfReadIfRecordReaderObjectsAreImmutable() throws Exception {
  List<BoundedSource<KV<Text, Employee>>> boundedSourceList =
      getBoundedSourceList(
          EmployeeInputFormat.class,
          Text.class,
          Employee.class,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class));
  List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
  for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
    List<KV<Text, Employee>> elems = SourceTestUtils.readFromSource(source, p.getOptions());
    bundleRecords.addAll(elems);
  }
  List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
  assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
}
 
Example #8
Source File: CsvToAvro.java    From java-docs-samples with Apache License 2.0
public static void runCsvToAvro(SampleOptions options)
    throws IOException, IllegalArgumentException {
  FileSystems.setDefaultPipelineOptions(options);

  // Get Avro Schema
  String schemaJson = getSchema(options.getAvroSchema());
  Schema schema = new Schema.Parser().parse(schemaJson);

  // Check schema field types before starting the Dataflow job
  checkFieldTypes(schema);

  // Create the Pipeline object with the options we defined above.
  Pipeline pipeline = Pipeline.create(options);

  // Convert CSV to Avro
  pipeline.apply("Read CSV files", TextIO.read().from(options.getInputFile()))
      .apply("Convert CSV to Avro formatted data",
          ParDo.of(new ConvertCsvToAvro(schemaJson, options.getCsvDelimiter())))
      .setCoder(AvroCoder.of(GenericRecord.class, schema))
      .apply("Write Avro formatted data", AvroIO.writeGenericRecords(schemaJson)
          .to(options.getOutput()).withCodec(CodecFactory.snappyCodec()).withSuffix(".avro"));

  // Run the pipeline.
  pipeline.run().waitUntilFinish();
}
 
Example #9
Source File: CsvConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests that {@link CsvConverters.StringToGenericRecordFn} throws an exception if incorrect header
 * information is provided (e.g. a CSV file containing headers is passed but hasHeaders is set to
 * false).
 */
@Test(expected = RuntimeException.class)
public void testIncorrectHeaderInformation() {
  Schema schema = SchemaUtils.getAvroSchema(TEST_AVRO_SCHEMA_PATH);

  pipeline
      .apply(
          "TestIncorrectHeaderInformation",
          CsvConverters.ReadCsv.newBuilder()
              .setInputFileSpec(HEADER_CSV_FILE_PATH)
              .setHasHeaders(false)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setCsvFormat("Default")
              .setDelimiter(",")
              .build())
      .get(CSV_LINES)
      .apply(
          "ConvertStringToGenericRecord",
          ParDo.of(new CsvConverters.StringToGenericRecordFn(TEST_AVRO_SCHEMA_PATH, ",")))
      .setCoder(AvroCoder.of(GenericRecord.class, schema));

  pipeline.run();
}
 
Example #10
Source File: CsvConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests that {@link CsvConverters.StringToGenericRecordFn} throws an exception if the number of CSV
 * headers is less than the number of fields in the Avro schema.
 */
@Test(expected = RuntimeException.class)
public void testIncorrectFieldSize() {
  Schema schema = SchemaUtils.getAvroSchema(TEST_AVRO_SCHEMA_TWO_PATH);

  pipeline
      .apply(
          "TestIncorrectFieldSize",
          CsvConverters.ReadCsv.newBuilder()
              .setInputFileSpec(HEADER_CSV_FILE_PATH)
              .setHasHeaders(true)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setCsvFormat("Default")
              .setDelimiter(",")
              .build())
      .get(CSV_LINES)
      .apply(
          "ConvertStringToGenericRecord",
          ParDo.of(new CsvConverters.StringToGenericRecordFn(TEST_AVRO_SCHEMA_TWO_PATH, ",")))
      .setCoder(AvroCoder.of(GenericRecord.class, schema));

  pipeline.run();
}
 
Example #11
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testCreateFromMetadata() throws Exception {
  List<Bird> expected = createRandomRecords(DEFAULT_RECORD_COUNT);
  String codec = DataFileConstants.NULL_CODEC;
  String filename =
      generateTestFile(
          codec, expected, SyncBehavior.SYNC_DEFAULT, 0, AvroCoder.of(Bird.class), codec);
  Metadata fileMeta = FileSystems.matchSingleFileSpec(filename);

  AvroSource<GenericRecord> source = AvroSource.from(fileMeta);
  AvroSource<Bird> sourceWithSchema = source.withSchema(Bird.class);
  AvroSource<Bird> sourceWithSchemaWithMinBundleSize = sourceWithSchema.withMinBundleSize(1234);

  assertEquals(FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, source.getMode());
  assertEquals(FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, sourceWithSchema.getMode());
  assertEquals(
      FileBasedSource.Mode.SINGLE_FILE_OR_SUBRANGE, sourceWithSchemaWithMinBundleSize.getMode());
}
 
Example #12
Source File: HadoopFormatIOReadTest.java    From beam with Apache License 2.0
/**
 * Tests reading when the InputFormat implements {@link org.apache.hadoop.conf.Configurable
 * Configurable}.
 */
@Test
public void testReadingWithConfigurableInputFormat() throws Exception {
  List<BoundedSource<KV<Text, Employee>>> boundedSourceList =
      getBoundedSourceList(
          ConfigurableEmployeeInputFormat.class,
          Text.class,
          Employee.class,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class));
  for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
    // Cast to HadoopInputFormatBoundedSource to access getInputFormat().
    HadoopInputFormatBoundedSource<Text, Employee> hifSource =
        (HadoopInputFormatBoundedSource<Text, Employee>) source;
    hifSource.createInputFormatInstance();
    ConfigurableEmployeeInputFormat inputFormatObj =
        (ConfigurableEmployeeInputFormat) hifSource.getInputFormat();
    assertTrue(inputFormatObj.isConfSet);
  }
}
 
Example #13
Source File: HadoopFormatIOReadTest.java    From beam with Apache License 2.0
/**
 * This test validates behavior of {@link HadoopInputFormatBoundedSource} if RecordReader object
 * creation fails.
 */
@Test
public void testReadIfCreateRecordReaderFails() throws Exception {
  thrown.expect(Exception.class);
  thrown.expectMessage("Exception in creating RecordReader");
  InputFormat<Text, Employee> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
  Mockito.when(
          mockInputFormat.createRecordReader(
              Mockito.any(InputSplit.class), Mockito.any(TaskAttemptContext.class)))
      .thenThrow(new IOException("Exception in creating RecordReader"));
  HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
      new HadoopInputFormatBoundedSource<>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null, // No key translation required.
          null, // No value translation required.
          new SerializableSplit());
  boundedSource.setInputFormatObj(mockInputFormat);
  SourceTestUtils.readFromSource(boundedSource, p.getOptions());
}
 
Example #14
Source File: ParquetIOTest.java    From beam with Apache License 2.0
@Test
public void testWriteAndReadUsingReflectDataSchemaWithDataModel() {
  Schema testRecordSchema = ReflectData.get().getSchema(TestRecord.class);

  List<GenericRecord> records = generateGenericRecords(1000);
  mainPipeline
      .apply(Create.of(records).withCoder(AvroCoder.of(testRecordSchema)))
      .apply(
          FileIO.<GenericRecord>write()
              .via(ParquetIO.sink(testRecordSchema))
              .to(temporaryFolder.getRoot().getAbsolutePath()));
  mainPipeline.run().waitUntilFinish();

  PCollection<GenericRecord> readBack =
      readPipeline.apply(
          ParquetIO.read(testRecordSchema)
              .withAvroDataModel(GenericData.get())
              .from(temporaryFolder.getRoot().getAbsolutePath() + "/*"));

  PAssert.that(readBack).containsInAnyOrder(records);
  readPipeline.run().waitUntilFinish();
}
 
Example #15
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testCreationWithSchema() throws Exception {
  List<Bird> expected = createRandomRecords(100);
  String filename =
      generateTestFile(
          "tmp.avro",
          expected,
          SyncBehavior.SYNC_DEFAULT,
          0,
          AvroCoder.of(Bird.class),
          DataFileConstants.NULL_CODEC);

  // Create a source with a schema object
  Schema schema = ReflectData.get().getSchema(Bird.class);
  AvroSource<GenericRecord> source = AvroSource.from(filename).withSchema(schema);
  List<GenericRecord> records = SourceTestUtils.readFromSource(source, null);
  assertEqualsWithGeneric(expected, records);

  // Create a source with a JSON schema
  String schemaString = ReflectData.get().getSchema(Bird.class).toString();
  source = AvroSource.from(filename).withSchema(schemaString);
  records = SourceTestUtils.readFromSource(source, null);
  assertEqualsWithGeneric(expected, records);
}
 
Example #16
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testMultipleFiles() throws Exception {
  String baseName = "tmp-";
  List<Bird> expected = new ArrayList<>();
  for (int i = 0; i < 10; i++) {
    List<Bird> contents = createRandomRecords(DEFAULT_RECORD_COUNT / 10);
    expected.addAll(contents);
    generateTestFile(
        baseName + i,
        contents,
        SyncBehavior.SYNC_DEFAULT,
        0,
        AvroCoder.of(Bird.class),
        DataFileConstants.NULL_CODEC);
  }

  AvroSource<Bird> source =
      AvroSource.from(new File(tmpFolder.getRoot().toString(), baseName + "*").toString())
          .withSchema(Bird.class);
  List<Bird> actual = SourceTestUtils.readFromSource(source, null);
  assertThat(actual, containsInAnyOrder(expected.toArray()));
}
 
Example #17
Source File: PubsubIOTest.java    From beam with Apache License 2.0
@Test
public void testAvroSpecificRecord() {
  AvroCoder<AvroGeneratedUser> coder = AvroCoder.of(AvroGeneratedUser.class);
  List<AvroGeneratedUser> inputs =
      ImmutableList.of(
          new AvroGeneratedUser("Bob", 256, null),
          new AvroGeneratedUser("Alice", 128, null),
          new AvroGeneratedUser("Ted", null, "white"));
  setupTestClient(inputs, coder);
  PCollection<AvroGeneratedUser> read =
      readPipeline.apply(
          PubsubIO.readAvrosWithBeamSchema(AvroGeneratedUser.class)
              .fromSubscription(SUBSCRIPTION.getPath())
              .withClock(CLOCK)
              .withClientFactory(clientFactory));
  PAssert.that(read).containsInAnyOrder(inputs);
  readPipeline.run();
}
 
Example #18
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testGetCurrentFromUnstartedReader() throws Exception {
  List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_DEFAULT,
          1000,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  try (BlockBasedSource.BlockBasedReader<FixedRecord> reader =
      (BlockBasedSource.BlockBasedReader<FixedRecord>) source.createReader(null)) {
    assertEquals(null, reader.getCurrentBlock());

    expectedException.expect(NoSuchElementException.class);
    expectedException.expectMessage("No block has been successfully read from");
    reader.getCurrent();
  }
}
 
Example #19
Source File: AvroSourceTest.java    From beam with Apache License 2.0
@Test
public void testGetProgressFromUnstartedReader() throws Exception {
  List<FixedRecord> records = createFixedRecords(DEFAULT_RECORD_COUNT);
  String filename =
      generateTestFile(
          "tmp.avro",
          records,
          SyncBehavior.SYNC_DEFAULT,
          1000,
          AvroCoder.of(FixedRecord.class),
          DataFileConstants.NULL_CODEC);
  File file = new File(filename);

  AvroSource<FixedRecord> source = AvroSource.from(filename).withSchema(FixedRecord.class);
  try (BoundedSource.BoundedReader<FixedRecord> reader = source.createReader(null)) {
    assertEquals(Double.valueOf(0.0), reader.getFractionConsumed());
  }

  List<? extends BoundedSource<FixedRecord>> splits = source.split(file.length() / 3, null);
  for (BoundedSource<FixedRecord> subSource : splits) {
    try (BoundedSource.BoundedReader<FixedRecord> reader = subSource.createReader(null)) {
      assertEquals(Double.valueOf(0.0), reader.getFractionConsumed());
    }
  }
}
 
Example #20
Source File: SnowflakeIOReadTest.java    From beam with Apache License 2.0
@Test
public void testConfigContainsFromQueryAndFromTable() {
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage("fromTable() and fromQuery() are not allowed together");

  pipeline.apply(
      SnowflakeIO.<GenericRecord>read(snowflakeService)
          .withDataSourceConfiguration(dataSourceConfiguration)
          .fromQuery("")
          .fromTable(FAKE_TABLE)
          .withStagingBucketName(options.getStagingBucketName())
          .withStorageIntegrationName(options.getStorageIntegrationName())
          .withCsvMapper(getCsvMapper())
          .withCoder(AvroCoder.of(AvroGeneratedUser.getClassSchema())));

  pipeline.run();
}
 
Example #21
Source File: HadoopFormatIOReadTest.java    From beam with Apache License 2.0
/**
 * This test validates behavior of {@link
 * HadoopInputFormatBoundedSource#computeSplitsIfNecessary() computeSplits()} when Hadoop
 * InputFormat's {@link InputFormat#getSplits(JobContext)} returns an empty list.
 */
@Test
public void testComputeSplitsIfGetSplitsReturnsEmptyList() throws Exception {
  InputFormat<?, ?> mockInputFormat = Mockito.mock(EmployeeInputFormat.class);
  SerializableSplit mockInputSplit = Mockito.mock(SerializableSplit.class);
  Mockito.when(mockInputFormat.getSplits(Mockito.any(JobContext.class)))
      .thenReturn(new ArrayList<>());
  HadoopInputFormatBoundedSource<Text, Employee> hifSource =
      new HadoopInputFormatBoundedSource<>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null, // No key translation required.
          null, // No value translation required.
          mockInputSplit);
  thrown.expect(IOException.class);
  thrown.expectMessage("Error in computing splits, getSplits() returns a empty list");
  hifSource.setInputFormatObj(mockInputFormat);
  hifSource.computeSplitsIfNecessary();
}
 
Example #22
Source File: PubsubIO.java    From beam with Apache License 2.0
/**
 * Returns a {@link PTransform} that continuously reads binary encoded Avro messages of the
 * specified type.
 *
 * <p>Beam will infer a Beam schema from the Avro schema. This allows the output to be used by SQL
 * and by the schema-transform library.
 */
@Experimental(Kind.SCHEMAS)
public static <T> Read<T> readAvrosWithBeamSchema(Class<T> clazz) {
  if (clazz.equals(GenericRecord.class)) {
    throw new IllegalArgumentException("For GenericRecord, please call readAvroGenericRecords");
  }
  org.apache.avro.Schema avroSchema = ReflectData.get().getSchema(clazz);
  AvroCoder<T> coder = AvroCoder.of(clazz);
  Schema schema = AvroUtils.getSchema(clazz, null);
  return Read.newBuilder(parsePayloadUsingCoder(coder))
      .setCoder(
          SchemaCoder.of(
              schema,
              TypeDescriptor.of(clazz),
              AvroUtils.getToRowFunction(clazz, avroSchema),
              AvroUtils.getFromRowFunction(clazz)))
      .build();
}
 
Example #23
Source File: LazyAvroCoder.java    From components with Apache License 2.0
@Override
public void encode(Object value, OutputStream outputStream) throws IOException {
    if (converter == null) {
        converter = ConvertToIndexedRecord.getConverter((T) value);
    }
    IndexedRecord ir = converter.convertToAvro((T) value);
    if (internalAvroCoder == null) {
        Schema s = converter.getSchema();
        avroSchemaHolder.put(s);
        @SuppressWarnings("unchecked")
        AvroCoder<IndexedRecord> tCoder = (AvroCoder<IndexedRecord>) (AvroCoder<? extends IndexedRecord>) AvroCoder
                .of(ir.getSchema());
        internalAvroCoder = tCoder;
    }
    LOG.debug("Internal AvroCoder's schema is {}", internalAvroCoder.getSchema());
    LOG.debug("Encode value is {}", value);
    internalAvroCoder.encode(convertToAvro(value), outputStream);
}
 
Example #24
Source File: ApproximateDistinctTest.java    From beam with Apache License 2.0
@Test
public void customObject() {
  final int cardinality = 500;
  final int p = 15;
  final double expectedErr = 1.04 / Math.sqrt(p);

  Schema schema =
      SchemaBuilder.record("User")
          .fields()
          .requiredString("Pseudo")
          .requiredInt("Age")
          .endRecord();
  List<GenericRecord> users = new ArrayList<>();
  for (int i = 1; i <= cardinality; i++) {
    GenericData.Record newRecord = new GenericData.Record(schema);
    newRecord.put("Pseudo", "User" + i);
    newRecord.put("Age", i);
    users.add(newRecord);
  }
  PCollection<Long> results =
      tp.apply("Create stream", Create.of(users).withCoder(AvroCoder.of(schema)))
          .apply(
              "Test custom object",
              ApproximateDistinct.<GenericRecord>globally().withPrecision(p));

  PAssert.that("Verify Accuracy for custom object", results)
      .satisfies(new VerifyAccuracy(cardinality, expectedErr));

  tp.run();
}
 
Example #25
Source File: HadoopFormatIOReadTest.java    From beam with Apache License 2.0
/** This test validates that a reader and its parent source read the same records. */
@Test
public void testReaderAndParentSourceReadsSameData() throws Exception {
  InputSplit mockInputSplit = Mockito.mock(NewObjectsEmployeeInputSplit.class);
  HadoopInputFormatBoundedSource<Text, Employee> boundedSource =
      new HadoopInputFormatBoundedSource<>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null, // No key translation required.
          null, // No value translation required.
          new SerializableSplit(mockInputSplit));
  BoundedReader<KV<Text, Employee>> reader = boundedSource.createReader(p.getOptions());
  SourceTestUtils.assertUnstartedReaderReadsSameAsItsSource(reader, p.getOptions());
}
 
Example #26
Source File: AvroIO.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<FileIO.ReadableFile> input) {
  checkNotNull(getSchema(), "schema");
  PCollection<T> read =
      input.apply(
          "Read all via FileBasedSource",
          new ReadAllViaFileBasedSource<>(
              getDesiredBundleSizeBytes(),
              new CreateSourceFn<>(
                  getRecordClass(), getSchema().toString(), getDatumReaderFactory()),
              AvroCoder.of(getRecordClass(), getSchema())));
  return getInferBeamSchema() ? setBeamSchema(read, getRecordClass(), getSchema()) : read;
}
 
Example #27
Source File: NexmarkUtils.java    From beam with Apache License 2.0
/** Set up the pipeline with coders and some other options. */
public static void setupPipeline(CoderStrategy coderStrategy, Pipeline p) {
  CoderRegistry registry = p.getCoderRegistry();
  switch (coderStrategy) {
    case HAND:
      registry.registerCoderForClass(Auction.class, Auction.CODER);
      registry.registerCoderForClass(AuctionBid.class, AuctionBid.CODER);
      registry.registerCoderForClass(AuctionCount.class, AuctionCount.CODER);
      registry.registerCoderForClass(AuctionPrice.class, AuctionPrice.CODER);
      registry.registerCoderForClass(Bid.class, Bid.CODER);
      registry.registerCoderForClass(CategoryPrice.class, CategoryPrice.CODER);
      registry.registerCoderForClass(Event.class, Event.CODER);
      registry.registerCoderForClass(IdNameReserve.class, IdNameReserve.CODER);
      registry.registerCoderForClass(NameCityStateId.class, NameCityStateId.CODER);
      registry.registerCoderForClass(Person.class, Person.CODER);
      registry.registerCoderForClass(SellerPrice.class, SellerPrice.CODER);
      registry.registerCoderForClass(Done.class, Done.CODER);
      registry.registerCoderForClass(BidsPerSession.class, BidsPerSession.CODER);
      break;
    case AVRO:
      registry.registerCoderProvider(AvroCoder.getCoderProvider());
      break;
    case JAVA:
      registry.registerCoderProvider(SerializableCoder.getCoderProvider());
      break;
  }
}
 
Example #28
Source File: AvroHdfsFileSource.java    From components with Apache License 2.0
@Override
protected KV<AvroKey, NullWritable> nextPair() throws IOException, InterruptedException {
    // Not only is the AvroKey reused by the file format, but the underlying GenericRecord is as well.
    KV<AvroKey, NullWritable> kv = super.nextPair();
    GenericRecord gr = (GenericRecord) kv.getKey().datum();
    gr = CoderUtils.clone(AvroCoder.of(gr.getSchema()), gr);
    return KV.of(new AvroKey(gr), kv.getValue());
}
 
Example #29
Source File: GenericRecordToRowTest.java    From beam with Apache License 2.0
@Test
public void testConvertsGenericRecordToRow() {
  String schemaString =
      "{\"namespace\": \"example.avro\",\n"
          + " \"type\": \"record\",\n"
          + " \"name\": \"User\",\n"
          + " \"fields\": [\n"
          + "     {\"name\": \"name\", \"type\": \"string\"},\n"
          + "     {\"name\": \"favorite_number\", \"type\": \"int\"},\n"
          + "     {\"name\": \"favorite_color\", \"type\": \"string\"},\n"
          + "     {\"name\": \"price\", \"type\": \"double\"}\n"
          + " ]\n"
          + "}";
  Schema schema = (new Schema.Parser()).parse(schemaString);

  GenericRecord before = new GenericData.Record(schema);
  before.put("name", "Bob");
  before.put("favorite_number", 256);
  before.put("favorite_color", "red");
  before.put("price", 2.4);

  AvroCoder<GenericRecord> coder = AvroCoder.of(schema);

  PCollection<Row> rows =
      pipeline
          .apply("create PCollection<GenericRecord>", Create.of(before).withCoder(coder))
          .apply(
              "convert", GenericRecordReadConverter.builder().beamSchema(payloadSchema).build());

  PAssert.that(rows)
      .containsInAnyOrder(
          Row.withSchema(payloadSchema).addValues("Bob", 256, "red", 2.4).build());
  pipeline.run();
}
 
Example #30
Source File: WriteToGCSAvro.java    From DataflowTemplates with Apache License 2.0
@Override
public PDone expand(PCollection<KV<String, String>> kafkaRecords) {
  return kafkaRecords
      /*
       * Converting KV<String, String> records to GenericRecord using DoFn and {@link
       * KeyValueToGenericRecordFn} class.
       */
      .apply("Create GenericRecord(s)", ParDo.of(new KeyValueToGenericRecordFn()))
      .setCoder(AvroCoder.of(GenericRecord.class, KeyValueToGenericRecordFn.SCHEMA))
      /*
       * Writing as avro file using {@link AvroIO}.
       *
       * The {@link WindowedFilenamePolicy} class specifies the file path for writing the file.
       * The {@link withNumShards} option specifies the number of shards passed by the user.
       * The {@link withTempDirectory} option sets the base directory used to generate temporary files.
       */
      .apply(
          "Writing as Avro",
          AvroIO.writeGenericRecords(KeyValueToGenericRecordFn.SCHEMA)
              .to(
                  new WindowedFilenamePolicy(
                      outputDirectory(),
                      outputFilenamePrefix(),
                      WriteToGCSUtility.SHARD_TEMPLATE,
                      WriteToGCSUtility.FILE_SUFFIX_MAP.get(WriteToGCSUtility.FileFormat.AVRO)))
              .withTempDirectory(
                  FileBasedSink.convertToFileResourceIfPossible(tempLocation())
                      .getCurrentDirectory())
              .withWindowedWrites()
              .withNumShards(numShards()));
}