org.apache.beam.sdk.transforms.Values Java Examples

The following examples show how to use org.apache.beam.sdk.transforms.Values. Each example is taken from an open-source project; the project and source file are noted above the code.
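
Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of what Values does: applied to a PCollection<KV<K, V>>, Values.create() drops the keys and yields a PCollection<V>. The class name and element values here are illustrative only, and running the pipeline assumes a runner such as the DirectRunner is available on the classpath.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Values;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class ValuesSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // A small keyed collection; the keys are about to be discarded.
    PCollection<KV<String, Integer>> pairs =
        p.apply(Create.of(KV.of("a", 1), KV.of("b", 2), KV.of("c", 3)));

    // Values.create() keeps only the value of each KV pair, giving a PCollection<Integer>.
    PCollection<Integer> values = pairs.apply(Values.create());

    p.run().waitUntilFinish();
  }
}
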
Example #1
Source File: CreateStreamTest.java    From beam with Apache License 2.0
@Test
public void testElementsAtAlmostPositiveInfinity() throws IOException {
  Instant endOfGlobalWindow = GlobalWindow.INSTANCE.maxTimestamp();
  CreateStream<String> source =
      CreateStream.of(StringUtf8Coder.of(), batchDuration())
          .nextBatch(
              TimestampedValue.of("foo", endOfGlobalWindow),
              TimestampedValue.of("bar", endOfGlobalWindow))
          .advanceNextBatchWatermarkToInfinity();

  FixedWindows windows = FixedWindows.of(Duration.standardHours(6));
  PCollection<String> windowedValues =
      p.apply(source)
          .apply(Window.into(windows))
          .apply(WithKeys.of(1))
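          // Group by the dummy key, then Values.create() drops the key and Flatten restores the elements.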
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(windowedValues)
      .inWindow(windows.assignWindow(GlobalWindow.INSTANCE.maxTimestamp()))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
 
Example #2
Source File: BigQueryMerger.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollection<Void> expand(PCollection<MergeInfo> input) {
  final MergeStatementBuilder mergeBuilder = new MergeStatementBuilder(mergeConfiguration);
  return input
      .apply(
          MapElements.into(
              TypeDescriptors.kvs(
                  TypeDescriptors.strings(), TypeDescriptor.of(MergeInfo.class)))
              .via(mergeInfo -> KV.of(mergeInfo.getReplicaTable(), mergeInfo)))
      .apply(new TriggerPerKeyOnFixedIntervals<String, MergeInfo>(windowDuration))
      .apply(Values.create())
      .apply(MapElements.into(TypeDescriptors.strings()).via(mergeInfo -> {
        return mergeBuilder.buildMergeStatement(
            mergeInfo.getReplicaTable(),
            mergeInfo.getStagingTable(),
            mergeInfo.getAllPkFields(),
            mergeInfo.getAllFields());
      }))
      .apply(ParDo.of(new BigQueryStatementIssuingFn(this.testBigQueryClient)))
      .apply(
          MapElements.into(TypeDescriptors.voids())
              .via(
                  whatever ->
                      (Void) null)); // TODO(pabloem) Remove this line and find a return type
}
 
Example #3
Source File: ApproximateDistinctTest.java    From beam with Apache License 2.0
@Test
public void perKey() {
  final int cardinality = 1000;
  final int p = 15;
  final double expectedErr = 1.04 / Math.sqrt(p);

  List<Integer> stream = new ArrayList<>();
  for (int i = 1; i <= cardinality; i++) {
    stream.addAll(Collections.nCopies(2, i));
  }
  Collections.shuffle(stream);

  PCollection<Long> results =
      tp.apply("per key stream", Create.of(stream))
          .apply("create keys", WithKeys.of(1))
          .apply(
              "per key cardinality",
              ApproximateDistinct.<Integer, Integer>perKey().withPrecision(p))
          .apply("extract values", Values.create());

  PAssert.that("Verify Accuracy for cardinality per key", results)
      .satisfies(new VerifyAccuracy(cardinality, expectedErr));

  tp.run();
}
 
Example #4
Source File: TestExpansionService.java    From beam with Apache License 2.0
@Override
public PTransform<PCollection<GenericRecord>, PCollection<String>> buildExternal(
    StringConfiguration configuration) {
  return new PTransform<PCollection<GenericRecord>, PCollection<String>>() {
    @Override
    public PCollection<String> expand(PCollection<GenericRecord> input) {
      return input
          .apply(
              FileIO.<GenericRecord>write()
                  .via(ParquetIO.sink(schema))
                  .to(configuration.data))
          .getPerDestinationOutputFilenames()
          .apply(Values.create());
    }
  };
}
 
Example #5
Source File: FileIO.java    From beam with Apache License 2.0
@Override
public PCollection<MatchResult.Metadata> expand(PCollection<String> input) {
  PCollection<MatchResult.Metadata> res;
  if (getConfiguration().getWatchInterval() == null) {
    res =
        input.apply(
            "Match filepatterns",
            ParDo.of(new MatchFn(getConfiguration().getEmptyMatchTreatment())));
  } else {
    res =
        input
            .apply(
                "Continuously match filepatterns",
                Watch.growthOf(
                        Contextful.of(new MatchPollFn(), Requirements.empty()),
                        new ExtractFilenameFn())
                    .withPollInterval(getConfiguration().getWatchInterval())
                    .withTerminationPerInput(getConfiguration().getWatchTerminationCondition()))
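            // Watch emits KV<filepattern, MatchResult.Metadata>; Values.create() keeps only the Metadata.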
            .apply(Values.create());
  }
  return res.apply(Reshuffle.viaRandomKey());
}
 
Example #6
Source File: GatherAllPanes.java    From beam with Apache License 2.0
@Override
public PCollection<Iterable<ValueInSingleWindow<T>>> expand(PCollection<T> input) {
  WindowFn<?, ?> originalWindowFn = input.getWindowingStrategy().getWindowFn();

  return input
      .apply(Reify.windows())
      .apply(
          WithKeys.<Integer, ValueInSingleWindow<T>>of(0)
              .withKeyType(new TypeDescriptor<Integer>() {}))
      .apply(
          Window.into(
                  new IdentityWindowFn<KV<Integer, ValueInSingleWindow<T>>>(
                      originalWindowFn.windowCoder()))
              .triggering(Never.ever())
              .withAllowedLateness(input.getWindowingStrategy().getAllowedLateness())
              .discardingFiredPanes())
      // all values have the same key so they all appear as a single output element
      .apply(GroupByKey.create())
      .apply(Values.create())
      .setWindowingStrategyInternal(input.getWindowingStrategy());
}
 
Example #7
Source File: WriteFiles.java    From beam with Apache License 2.0
@Override
public PCollection<List<ResultT>> expand(PCollection<ResultT> input) {
  if (getWindowedWrites()) {
    // Reshuffle the results to make them stable against retries.
    // Use a single void key to maximize size of bundles for finalization.
    return input
        .apply("Add void key", WithKeys.of((Void) null))
        .apply("Reshuffle", Reshuffle.of())
        .apply("Drop key", Values.create())
        .apply("Gather bundles", ParDo.of(new GatherBundlesPerWindowFn<>()))
        .setCoder(ListCoder.of(resultCoder))
        // Reshuffle one more time to stabilize the contents of the bundle lists to finalize.
        .apply(Reshuffle.viaRandomKey());
  } else {
    // Pass results via a side input rather than reshuffle, because we need to get an empty
    // iterable to finalize if there are no results.
    return input
        .getPipeline()
        .apply(Reify.viewInGlobalWindow(input.apply(View.asList()), ListCoder.of(resultCoder)));
  }
}
 
Example #8
Source File: TestStreamTest.java    From beam with Apache License 2.0
@Test
@Category({NeedsRunner.class, UsesTestStream.class})
public void testElementsAtAlmostPositiveInfinity() {
  Instant endOfGlobalWindow = GlobalWindow.INSTANCE.maxTimestamp();
  TestStream<String> stream =
      TestStream.create(StringUtf8Coder.of())
          .addElements(
              TimestampedValue.of("foo", endOfGlobalWindow),
              TimestampedValue.of("bar", endOfGlobalWindow))
          .advanceWatermarkToInfinity();

  FixedWindows windows = FixedWindows.of(Duration.standardHours(6));
  PCollection<String> windowedValues =
      p.apply(stream)
          .apply(Window.into(windows))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create())
          .apply(Flatten.iterables());

  PAssert.that(windowedValues)
      .inWindow(windows.assignWindow(endOfGlobalWindow))
      .containsInAnyOrder("foo", "bar");
  p.run();
}
 
Example #9
Source File: BigQueryToTableIT.java    From beam with Apache License 2.0
private void runBigQueryToTablePipeline(BigQueryToTableOptions options) {
  Pipeline p = Pipeline.create(options);
  BigQueryIO.Read bigQueryRead = BigQueryIO.read().fromQuery(options.getQuery());
  if (options.getUsingStandardSql()) {
    bigQueryRead = bigQueryRead.usingStandardSql();
  }
  PCollection<TableRow> input = p.apply(bigQueryRead);
  if (options.getReshuffle()) {
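    // Attach a single void key, reshuffle for stability, then drop the key again with Values.create().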
    input =
        input
            .apply(WithKeys.<Void, TableRow>of((Void) null))
            .setCoder(KvCoder.of(VoidCoder.of(), TableRowJsonCoder.of()))
            .apply(Reshuffle.<Void, TableRow>of())
            .apply(Values.<TableRow>create());
  }
  input.apply(
      BigQueryIO.writeTableRows()
          .to(options.getOutput())
          .withSchema(options.getOutputSchema())
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));

  p.run().waitUntilFinish();
}
 
Example #10
Source File: HadoopFormatIOElasticTest.java    From beam with Apache License 2.0
/**
 * Test to read data from an embedded Elasticsearch instance and verify whether data is read
 * successfully.
 */
@Test
public void testHifIOWithElastic() {
  // The expected hashcode was computed once at insertion time and is hardcoded here.
  String expectedHashCode = "a62a85f5f081e3840baf1028d4d6c6bc";
  Configuration conf = getConfiguration();
  PCollection<KV<Text, LinkedMapWritable>> esData =
      pipeline.apply(HadoopFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
  PCollection<Long> count = esData.apply(Count.globally());
  // Verify that the count of objects fetched using HIFInputFormat IO is correct.
  PAssert.thatSingleton(count).isEqualTo((long) TEST_DATA_ROW_COUNT);
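  // Values.create() drops the Text keys, keeping only the LinkedMapWritable documents.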
  PCollection<LinkedMapWritable> values = esData.apply(Values.create());
  PCollection<String> textValues = values.apply(transformFunc);
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
 
Example #11
Source File: HadoopFormatIOCassandraIT.java    From beam with Apache License 2.0
/** This test reads data from the Cassandra instance and verifies if data is read successfully. */
@Test
public void testHIFReadForCassandra() {
  // The expected hashcode was computed once at insertion time and is hardcoded here.
  String expectedHashCode = "1a30ad400afe4ebf5fde75f5d2d95408";
  Long expectedRecordsCount = 1000L;
  Configuration conf = getConfiguration(options);
  PCollection<KV<Long, String>> cassandraData =
      pipeline.apply(
          HadoopFormatIO.<Long, String>read()
              .withConfiguration(conf)
              .withValueTranslation(myValueTranslate));
  PAssert.thatSingleton(cassandraData.apply("Count", Count.globally()))
      .isEqualTo(expectedRecordsCount);
  PCollection<String> textValues = cassandraData.apply(Values.create());
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
 
Example #12
Source File: HadoopFormatIOElasticIT.java    From beam with Apache License 2.0
/**
 * This test reads data from the Elasticsearch instance and verifies whether data is read
 * successfully.
 */
@Test
public void testHifIOWithElastic() throws SecurityException {
  // The expected hashcode was computed once at insertion time and is hardcoded here.
  final long expectedRowCount = 1000L;
  String expectedHashCode = "42e254c8689050ed0a617ff5e80ea392";
  Configuration conf = getConfiguration(options);
  PCollection<KV<Text, LinkedMapWritable>> esData =
      pipeline.apply(HadoopFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
  // Verify that the count of objects fetched using HIFInputFormat IO is correct.
  PCollection<Long> count = esData.apply(Count.globally());
  PAssert.thatSingleton(count).isEqualTo(expectedRowCount);
  PCollection<LinkedMapWritable> values = esData.apply(Values.create());
  PCollection<String> textValues = values.apply(transformFunc);
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
 
Example #13
Source File: ParquetIOTest.java    From beam with Apache License 2.0
@Test
public void testWriteAndReadFiles() {
  List<GenericRecord> records = generateGenericRecords(1000);

  PCollection<GenericRecord> writeThenRead =
      mainPipeline
          .apply(Create.of(records).withCoder(AvroCoder.of(SCHEMA)))
          .apply(
              FileIO.<GenericRecord>write()
                  .via(ParquetIO.sink(SCHEMA))
                  .to(temporaryFolder.getRoot().getAbsolutePath()))
          .getPerDestinationOutputFilenames()
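          // The write emits KV<destination, filename>; Values.create() keeps just the filenames to read back.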
          .apply(Values.create())
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(ParquetIO.readFiles(SCHEMA));

  PAssert.that(writeThenRead).containsInAnyOrder(records);

  mainPipeline.run().waitUntilFinish();
}
 
Example #14
Source File: KafkaIOTest.java    From beam with Apache License 2.0
@Test
public void testUnboundedSourceWithSingleTopic() {
  // Same as testUnboundedSource, but with a single topic.

  int numElements = 1000;
  String topic = "my_topic";

  KafkaIO.Read<Integer, Long> reader =
      KafkaIO.<Integer, Long>read()
          .withBootstrapServers("none")
          .withTopic("my_topic")
          .withConsumerFactoryFn(
              new ConsumerFactoryFn(
                  ImmutableList.of(topic), 10, numElements, OffsetResetStrategy.EARLIEST))
          .withMaxNumRecords(numElements)
          .withKeyDeserializer(IntegerDeserializer.class)
          .withValueDeserializer(LongDeserializer.class);

  PCollection<Long> input = p.apply(reader.withoutMetadata()).apply(Values.create());

  addCountingAsserts(input, numElements);
  p.run();
}
 
Example #15
Source File: KafkaIOTest.java    From beam with Apache License 2.0
@Test
public void testUnboundedSourceWithExplicitPartitions() {
  int numElements = 1000;

  List<String> topics = ImmutableList.of("test");

  KafkaIO.Read<byte[], Long> reader =
      KafkaIO.<byte[], Long>read()
          .withBootstrapServers("none")
          .withTopicPartitions(ImmutableList.of(new TopicPartition("test", 5)))
          .withConsumerFactoryFn(
              new ConsumerFactoryFn(
                  topics, 10, numElements, OffsetResetStrategy.EARLIEST)) // 10 partitions
          .withKeyDeserializer(ByteArrayDeserializer.class)
          .withValueDeserializer(LongDeserializer.class)
          .withMaxNumRecords(numElements / 10);

  PCollection<Long> input = p.apply(reader.withoutMetadata()).apply(Values.create());

  // assert that every element is a multiple of 5.
  PAssert.that(input).satisfies(new AssertMultipleOf(5));

  PAssert.thatSingleton(input.apply(Count.globally())).isEqualTo(numElements / 10L);

  p.run();
}
 
Example #16
Source File: KafkaIOTest.java    From beam with Apache License 2.0
@Test
public void testUnboundedSourceTimestamps() {

  int numElements = 1000;

  PCollection<Long> input =
      p.apply(mkKafkaReadTransform(numElements, new ValueAsTimestampFn()).withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, numElements);

  PCollection<Long> diffs =
      input
          .apply("TimestampDiff", ParDo.of(new ElementValueDiff()))
          .apply("DistinctTimestamps", Distinct.create());
  // This assert also confirms that diffs only has one unique value.
  PAssert.thatSingleton(diffs).isEqualTo(0L);

  p.run();
}
 
Example #17
Source File: KafkaIOTest.java    From beam with Apache License 2.0
@Test
public void testUnboundedSourceLogAppendTimestamps() {
  // LogAppendTime (server side timestamp) for records is set based on record index
  // in MockConsumer above. Ensure that those exact timestamps are set by the source.
  int numElements = 1000;

  PCollection<Long> input =
      p.apply(mkKafkaReadTransform(numElements, null).withLogAppendTime().withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, numElements);

  PCollection<Long> diffs =
      input
          .apply(
              MapElements.into(TypeDescriptors.longs())
                  .via(t -> LOG_APPEND_START_TIME.plus(Duration.standardSeconds(t)).getMillis()))
          .apply("TimestampDiff", ParDo.of(new ElementValueDiff()))
          .apply("DistinctTimestamps", Distinct.create());

  // This assert also confirms that diffs only has one unique value.
  PAssert.thatSingleton(diffs).isEqualTo(0L);

  p.run();
}
 
Example #18
Source File: KafkaIOTest.java    From beam with Apache License 2.0
@Test
public void testUnboundedSourceStartReadTime() {

  assumeTrue(new ConsumerSpEL().hasOffsetsForTimes());

  int numElements = 1000;
  // In this MockConsumer, each element's timestamp equals its offset and there are 20
  // partitions, so this startTime reads half of the elements.
  int startTime = numElements / 20 / 2;
  int maxNumRecords = numElements / 2;

  PCollection<Long> input =
      p.apply(
              mkKafkaReadTransform(numElements, maxNumRecords, new ValueAsTimestampFn())
                  .withStartReadTime(new Instant(startTime))
                  .withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, maxNumRecords, maxNumRecords, maxNumRecords, numElements - 1);
  p.run();
}
 
Example #19
Source File: KafkaIOTest.java    From beam with Apache License 2.0
@Test
public void testUnboundedSourceStartReadTimeException() {

  assumeTrue(new ConsumerSpEL().hasOffsetsForTimes());

  noMessagesException.expect(RuntimeException.class);

  int numElements = 1000;
  // In this MockConsumer, each element's timestamp equals its offset and there are 20
  // partitions, so this startTime reads no elements.
  int startTime = numElements / 20;

  p.apply(
          mkKafkaReadTransform(numElements, numElements, new ValueAsTimestampFn())
              .withStartReadTime(new Instant(startTime))
              .withoutMetadata())
      .apply(Values.create());

  p.run();
}
 
Example #20
Source File: HadoopFormatIOCassandraTest.java    From beam with Apache License 2.0
/**
 * Test to read data from an embedded Cassandra instance and verify whether data is read
 * successfully.
 */
@Test
public void testHIFReadForCassandra() {
  // The expected hashcode was computed once at insertion time and is hardcoded here.
  String expectedHashCode = "1b9780833cce000138b9afa25ba63486";
  Configuration conf = getConfiguration();
  PCollection<KV<Long, String>> cassandraData =
      p.apply(
          HadoopFormatIO.<Long, String>read()
              .withConfiguration(conf)
              .withValueTranslation(myValueTranslate));
  // Verify the count of data retrieved from Cassandra matches expected count.
  PAssert.thatSingleton(cassandraData.apply("Count", Count.globally()))
      .isEqualTo(TEST_DATA_ROW_COUNT);
  PCollection<String> textValues = cassandraData.apply(Values.create());
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  p.run().waitUntilFinish();
}
 
Example #21
Source File: HadoopFormatIOCassandraIT.java    From beam with Apache License 2.0
/**
 * This test reads data from the Cassandra instance based on a query and verifies if data is read
 * successfully.
 */
@Test
public void testHIFReadForCassandraQuery() {
  String expectedHashCode = "7bead6d6385c5f4dd0524720cd320b49";
  Long expectedNumRows = 1L;
  Configuration conf = getConfiguration(options);
  conf.set(
      "cassandra.input.cql",
      "select * from "
          + CASSANDRA_KEYSPACE
          + "."
          + CASSANDRA_TABLE
          + " where token(y_id) > ? and token(y_id) <= ? "
          + "and field0 = 'user48:field0:431531'");
  PCollection<KV<Long, String>> cassandraData =
      pipeline.apply(
          HadoopFormatIO.<Long, String>read()
              .withConfiguration(conf)
              .withValueTranslation(myValueTranslate));
  PAssert.thatSingleton(cassandraData.apply("Count", Count.globally()))
      .isEqualTo(expectedNumRows);
  PCollection<String> textValues = cassandraData.apply(Values.create());
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
 
Example #22
Source File: KafkaIOTest.java    From beam with Apache License 2.0
@Test
public void testUnboundedSourceSplits() throws Exception {

  int numElements = 1000;
  int numSplits = 10;

  // Coders must be specified explicitly here due to the way the transform
  // is used in the test.
  UnboundedSource<KafkaRecord<Integer, Long>, ?> initial =
      mkKafkaReadTransform(numElements, null)
          .withKeyDeserializerAndCoder(IntegerDeserializer.class, BigEndianIntegerCoder.of())
          .withValueDeserializerAndCoder(LongDeserializer.class, BigEndianLongCoder.of())
          .makeSource();

  List<? extends UnboundedSource<KafkaRecord<Integer, Long>, ?>> splits =
      initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit))
                .apply("Remove Metadata " + i, ParDo.of(new RemoveKafkaMetadata<>()))
                .apply("collection " + i, Values.create()));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
 
Example #23
Source File: KafkaIOTest.java    From beam with Apache License 2.0
@Test
public void testUnboundedSourceCustomTimestamps() {
  // The custom timestamp is set to customTimestampStartMillis + value.
  // Tests basic functionality of custom timestamps.

  final int numElements = 1000;
  final long customTimestampStartMillis = 80000L;

  PCollection<Long> input =
      p.apply(
              mkKafkaReadTransform(numElements, null)
                  .withTimestampPolicyFactory(
                      (tp, prevWatermark) ->
                          new CustomTimestampPolicyWithLimitedDelay<Integer, Long>(
                              (record ->
                                  new Instant(
                                      TimeUnit.SECONDS.toMillis(record.getKV().getValue())
                                          + customTimestampStartMillis)),
                              Duration.ZERO,
                              prevWatermark))
                  .withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, numElements);

  PCollection<Long> diffs =
      input
          .apply(
              MapElements.into(TypeDescriptors.longs())
                  .via(t -> TimeUnit.SECONDS.toMillis(t) + customTimestampStartMillis))
          .apply("TimestampDiff", ParDo.of(new ElementValueDiff()))
          .apply("DistinctTimestamps", Distinct.create());

  // This assert also confirms that diffs only has one unique value.
  PAssert.thatSingleton(diffs).isEqualTo(0L);

  p.run();
}
 
Example #24
Source File: KafkaIOTest.java    From beam with Apache License 2.0
@Test
public void testUnboundedSource() {
  int numElements = 1000;

  PCollection<Long> input =
      p.apply(mkKafkaReadTransform(numElements, new ValueAsTimestampFn()).withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, numElements);
  p.run();
}
 
Example #25
Source File: KafkaIOTest.java    From beam with Apache License 2.0
@Test
public void testValuesSink() throws Exception {
  // Similar to testSink(), but uses the values() interface.

  int numElements = 1000;

  try (MockProducerWrapper producerWrapper = new MockProducerWrapper()) {

    ProducerSendCompletionThread completionThread =
        new ProducerSendCompletionThread(producerWrapper.mockProducer).start();

    String topic = "test";

    p.apply(mkKafkaReadTransform(numElements, new ValueAsTimestampFn()).withoutMetadata())
        .apply(Values.create()) // there are no keys
        .apply(
            KafkaIO.<Integer, Long>write()
                .withBootstrapServers("none")
                .withTopic(topic)
                .withValueSerializer(LongSerializer.class)
                .withProducerFactoryFn(new ProducerFactoryFn(producerWrapper.producerKey))
                .values());

    p.run();

    completionThread.shutdown();

    verifyProducerRecords(producerWrapper.mockProducer, topic, numElements, true, false);
  }
}
 
Example #26
Source File: HadoopFormatIOElasticIT.java    From beam with Apache License 2.0
/**
 * This test reads data from the Elasticsearch instance based on a query and verifies if data is
 * read successfully.
 */
@Test
public void testHifIOWithElasticQuery() {
  String expectedHashCode = "d7a7e4e42c2ca7b83ef7c1ad1ebce000";
  Long expectedRecordsCount = 1L;
  Configuration conf = getConfiguration(options);
  String query =
      "{"
          + "  \"query\": {"
          + "  \"match\" : {"
          + "    \"Title\" : {"
          + "      \"query\" : \"Title9\","
          + "      \"type\" : \"boolean\""
          + "    }"
          + "  }"
          + "  }"
          + "}";
  conf.set(ConfigurationOptions.ES_QUERY, query);
  PCollection<KV<Text, LinkedMapWritable>> esData =
      pipeline.apply(HadoopFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
  PCollection<Long> count = esData.apply(Count.globally());
  // Verify that the count of objects fetched using HIFInputFormat IO is correct.
  PAssert.thatSingleton(count).isEqualTo(expectedRecordsCount);
  PCollection<LinkedMapWritable> values = esData.apply(Values.create());
  PCollection<String> textValues = values.apply(transformFunc);
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
 
Example #27
Source File: HadoopFormatIOCassandraTest.java    From beam with Apache License 2.0
/**
 * Test to read data from an embedded Cassandra instance based on a query and verify whether data is
 * read successfully.
 */
@Test
public void testHIFReadForCassandraQuery() {
  Long expectedCount = 1L;
  String expectedChecksum = "f11caabc7a9fc170e22b41218749166c";
  Configuration conf = getConfiguration();
  conf.set(
      "cassandra.input.cql",
      "select * from "
          + CASSANDRA_KEYSPACE
          + "."
          + CASSANDRA_TABLE
          + " where token(id) > ? and token(id) <= ? and scientist='Faraday1' allow filtering");
  PCollection<KV<Long, String>> cassandraData =
      p.apply(
          HadoopFormatIO.<Long, String>read()
              .withConfiguration(conf)
              .withValueTranslation(myValueTranslate));
  // Verify the count of data retrieved from Cassandra matches expected count.
  PAssert.thatSingleton(cassandraData.apply("Count", Count.globally())).isEqualTo(expectedCount);
  PCollection<String> textValues = cassandraData.apply(Values.create());
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedChecksum);
  p.run().waitUntilFinish();
}
 
Example #28
Source File: SplittableParDoNaiveBounded.java    From beam with Apache License 2.0
@Override
public PCollectionTuple expand(PCollection<KV<byte[], KV<InputT, RestrictionT>>> input) {
  return input
      .apply("Drop key", Values.create())
      .apply("Reshuffle", Reshuffle.of())
      .apply(
          "NaiveProcess",
          ParDo.of(
                  new NaiveProcessFn<
                      InputT, OutputT, RestrictionT, PositionT, WatermarkEstimatorStateT>(
                      original.getFn()))
              .withSideInputs(original.getSideInputs())
              .withOutputTags(original.getMainOutputTag(), original.getAdditionalOutputTags()));
}
 
Example #29
Source File: KafkaIOTest.java    From beam with Apache License 2.0
@Test
public void testUnreachableKafkaBrokers() {
  // Expect an exception when the Kafka brokers are not reachable on the workers.
  // We specify partitions explicitly so that splitting does not involve server interaction.
  // Set the request timeout to 10ms so that the test does not take long.

  thrown.expect(Exception.class);
  thrown.expectMessage("Reader-0: Timeout while initializing partition 'test-0'");

  int numElements = 1000;
  PCollection<Long> input =
      p.apply(
              KafkaIO.<Integer, Long>read()
                  .withBootstrapServers("8.8.8.8:9092") // Google public DNS ip.
                  .withTopicPartitions(ImmutableList.of(new TopicPartition("test", 0)))
                  .withKeyDeserializer(IntegerDeserializer.class)
                  .withValueDeserializer(LongDeserializer.class)
                  .withConsumerConfigUpdates(
                      ImmutableMap.of(
                          ConsumerConfig.HEARTBEAT_INTERVAL_MS_CONFIG,
                          5,
                          ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG,
                          8,
                          ConsumerConfig.FETCH_MAX_WAIT_MS_CONFIG,
                          8,
                          "default.api.timeout.ms",
                          10))
                  .withMaxNumRecords(10)
                  .withoutMetadata())
          .apply(Values.create());

  addCountingAsserts(input, numElements);
  p.run();
}