Java Code Examples for org.apache.beam.sdk.transforms.ParDo

The following examples show how to use org.apache.beam.sdk.transforms.ParDo. They are extracted from open source projects; the source project, source file, and license are listed above each example.
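
Before the examples, here is a minimal, self-contained sketch of the pattern they all share: ParDo.of wraps a DoFn, and the DoFn's @ProcessElement method is invoked once per input element. The sketch is illustrative only; the class name and input values are not taken from any of the projects below.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;

public class MinimalParDoExample {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    PCollection<String> words = pipeline.apply(Create.of("hello", "beam"));

    // ParDo.of wraps a DoFn; its @ProcessElement method runs once per element.
    PCollection<String> upperCased =
        words.apply(
            "UpperCase",
            ParDo.of(
                new DoFn<String, String>() {
                  @ProcessElement
                  public void processElement(@Element String word, OutputReceiver<String> out) {
                    out.output(word.toUpperCase());
                  }
                }));

    pipeline.run().waitUntilFinish();
  }
}
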
Example 1
Source Project: beam    Source File: BatchStatefulParDoOverrides.java    License: Apache License 2.0
@Override
public PCollection<OutputT> expand(PCollection<KV<K, InputT>> input) {
  DoFn<KV<K, InputT>, OutputT> fn = originalParDo.getFn();
  verifyFnIsStateful(fn);
  DataflowRunner.verifyDoFnSupportedBatch(fn);
  DataflowRunner.verifyStateSupportForWindowingStrategy(input.getWindowingStrategy());

  if (isFnApi) {
    return input.apply(Reshuffle.of()).apply(originalParDo);
  }

  PTransform<
          PCollection<? extends KV<K, Iterable<KV<Instant, WindowedValue<KV<K, InputT>>>>>>,
          PCollection<OutputT>>
      statefulParDo =
          ParDo.of(new BatchStatefulDoFn<>(fn)).withSideInputs(originalParDo.getSideInputs());

  return input.apply(new GbkBeforeStatefulParDo<>()).apply(statefulParDo);
}
 
Example 2
Source Project: beam    Source File: BeamSideInputJoinRelTest.java    License: Apache License 2.0
@Test
public void testInnerJoin_boundedTableOnTheLeftSide() throws Exception {
  String sql =
      "SELECT o1.order_id, o1.sum_site_id, o2.buyer FROM "
          + " ORDER_DETAILS1 o2 "
          + " JOIN "
          + "(select order_id, sum(site_id) as sum_site_id FROM ORDER_DETAILS "
          + "          GROUP BY order_id, TUMBLE(order_time, INTERVAL '1' HOUR)) o1 "
          + " on "
          + " o1.order_id=o2.order_id";

  PCollection<Row> rows = compilePipeline(sql, pipeline);
  PAssert.that(rows.apply(ParDo.of(new TestUtils.BeamSqlRow2StringDoFn())))
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(
                  Schema.FieldType.INT32, "order_id",
                  Schema.FieldType.INT32, "sum_site_id",
                  Schema.FieldType.STRING, "buyer")
              .addRows(1, 3, "james", 2, 5, "bond")
              .getStringRows());
  pipeline.run();
}
 
Example 3
Source Project: beam    Source File: ReadSourceStreamingTest.java    License: Apache License 2.0
private static void runProgram(String resultPath) {

    Pipeline p = FlinkTestPipeline.createForStreaming();

    p.apply(GenerateSequence.from(0).to(10))
        .apply(
            ParDo.of(
                new DoFn<Long, String>() {
                  @ProcessElement
                  public void processElement(ProcessContext c) throws Exception {
                    c.output(c.element().toString());
                  }
                }))
        .apply(TextIO.write().to(resultPath));

    p.run();
  }
 
Example 4
Source Project: beam    Source File: SpannerWriteIT.java    License: Apache License 2.0
@Test
public void testWrite() throws Exception {
  int numRecords = 100;
  p.apply(GenerateSequence.from(0).to(numRecords))
      .apply(ParDo.of(new GenerateMutations(options.getTable())))
      .apply(
          SpannerIO.write()
              .withProjectId(project)
              .withInstanceId(options.getInstanceId())
              .withDatabaseId(databaseName));

  PipelineResult result = p.run();
  result.waitUntilFinish();
  assertThat(result.getState(), is(PipelineResult.State.DONE));
  assertThat(countNumberOfRecords(), equalTo((long) numRecords));
}
 
Example 5
@Test
public void testSdkParDoWithoutSideInput() throws Exception {
  Pipeline p = Pipeline.create();
  PCollection<String> pc = p.apply(Create.of("a", "b", "c"));
  pc.apply(ParDo.of(new TestDoFn(null)));
  RunnerApi.Pipeline pipeline = PipelineTranslation.toProto(p);

  Node predecessor = createParDoNode("predecessor");
  Node mainInput = InstructionOutputNode.create(new InstructionOutput(), "fakeId");
  Node sideInputParDo = createParDoNode("noSideInput");

  MutableNetwork<Node, Edge> network = createEmptyNetwork();
  network.addNode(predecessor);
  network.addNode(mainInput);
  network.addNode(sideInputParDo);
  network.addEdge(predecessor, mainInput, DefaultEdge.create());
  network.addEdge(mainInput, sideInputParDo, DefaultEdge.create());

  Network<Node, Edge> inputNetwork = ImmutableNetwork.copyOf(network);
  network = InsertFetchAndFilterStreamingSideInputNodes.with(pipeline).forNetwork(network);

  assertThatNetworksAreIdentical(inputNetwork, network);
}
 
Example 6
Source Project: beam    Source File: GenerateSequenceTest.java    License: Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testUnboundedInputTimestamps() {
  long numElements = 1000;

  PCollection<Long> input =
      p.apply(GenerateSequence.from(0).to(numElements).withTimestampFn(new ValueAsTimestampFn()));
  addCountingAsserts(input, 0, numElements);

  PCollection<Long> diffs =
      input
          .apply("TimestampDiff", ParDo.of(new ElementValueDiff()))
          .apply("DistinctTimestamps", Distinct.create());
  // This assert also confirms that diffs only has one unique value.
  PAssert.thatSingleton(diffs).isEqualTo(0L);

  p.run();
}
 
Example 7
Source Project: beam    Source File: KinesisIOIT.java    License: Apache License 2.0
/** Write test dataset into Kinesis stream. */
private void runWrite() {
  pipelineWrite
      .apply("Generate Sequence", GenerateSequence.from(0).to((long) numberOfRows))
      .apply("Prepare TestRows", ParDo.of(new TestRow.DeterministicallyConstructTestRowFn()))
      .apply("Prepare Kinesis input records", ParDo.of(new ConvertToBytes()))
      .apply(
          "Write to Kinesis",
          KinesisIO.write()
              .withStreamName(options.getAwsKinesisStream())
              .withPartitioner(new RandomPartitioner())
              .withAWSClientsProvider(
                  options.getAwsAccessKey(),
                  options.getAwsSecretKey(),
                  Regions.fromName(options.getAwsKinesisRegion())));

  pipelineWrite.run().waitUntilFinish();
}
 
Example 8
Source Project: beam    Source File: RabbitMqIO.java    License: Apache License 2.0
@Override
public PCollection<?> expand(PCollection<RabbitMqMessage> input) {
  checkArgument(
      exchange() != null || queue() != null, "Either exchange or queue has to be specified");
  if (exchange() != null) {
    checkArgument(queue() == null, "Queue can't be set in the same time as exchange");
  }
  if (queue() != null) {
    checkArgument(exchange() == null, "Exchange can't be set in the same time as queue");
  }
  if (queueDeclare()) {
    checkArgument(queue() != null, "Queue is required for the queue declare");
  }
  if (exchangeDeclare()) {
    checkArgument(exchange() != null, "Exchange is required for the exchange declare");
  }
  return input.apply(ParDo.of(new WriteFn(this)));
}
 
Example 9
Source Project: beam    Source File: BeamUncollectRel.java    License: Apache License 2.0
@Override
public PCollection<Row> expand(PCollectionList<Row> pinput) {
  checkArgument(
      pinput.size() == 1,
      "Wrong number of inputs for %s: %s",
      BeamUncollectRel.class.getSimpleName(),
      pinput);
  PCollection<Row> upstream = pinput.get(0);

  // Each row of the input contains a single array of things to be emitted; Calcite knows
  // what the row looks like
  Schema outputSchema = CalciteUtils.toSchema(getRowType());

  PCollection<Row> uncollected =
      upstream.apply(ParDo.of(new UncollectDoFn(outputSchema))).setRowSchema(outputSchema);

  return uncollected;
}
 
Example 10
Source Project: DataflowTemplates    Source File: AvroToBigtable.java    License: Apache License 2.0
public static PipelineResult run(Options options) {
  Pipeline pipeline = Pipeline.create(options);

  BigtableIO.Write write =
      BigtableIO.write()
          .withProjectId(options.getBigtableProjectId())
          .withInstanceId(options.getBigtableInstanceId())
          .withTableId(options.getBigtableTableId());

  pipeline
      .apply("Read from Avro", AvroIO.read(BigtableRow.class).from(options.getInputFilePattern()))
      .apply(
          "Transform to Bigtable",
          ParDo.of(
              AvroToBigtableFn.createWithSplitLargeRows(
                  options.getSplitLargeRows(), MAX_MUTATIONS_PER_ROW)))
      .apply("Write to Bigtable", write);

  return pipeline.run();
}
 
Example 11
Source Project: beam    Source File: StatefulTeamScoreTest.java    License: Apache License 2.0
/**
 * Tests that {@link UpdateTeamScoreFn} {@link org.apache.beam.sdk.transforms.DoFn} outputs
 * correctly for one team.
 */
@Test
public void testScoreUpdatesOneTeam() {

  TestStream<KV<String, GameActionInfo>> createEvents =
      TestStream.create(KvCoder.of(StringUtf8Coder.of(), AvroCoder.of(GameActionInfo.class)))
          .advanceWatermarkTo(baseTime)
          .addElements(
              event(TestUser.RED_TWO, 99, Duration.standardSeconds(10)),
              event(TestUser.RED_ONE, 1, Duration.standardSeconds(20)),
              event(TestUser.RED_ONE, 0, Duration.standardSeconds(30)),
              event(TestUser.RED_TWO, 100, Duration.standardSeconds(40)),
              event(TestUser.RED_TWO, 201, Duration.standardSeconds(50)))
          .advanceWatermarkToInfinity();

  PCollection<KV<String, Integer>> teamScores =
      p.apply(createEvents).apply(ParDo.of(new UpdateTeamScoreFn(100)));

  String redTeam = TestUser.RED_ONE.getTeam();

  PAssert.that(teamScores)
      .inWindow(GlobalWindow.INSTANCE)
      .containsInAnyOrder(KV.of(redTeam, 100), KV.of(redTeam, 200), KV.of(redTeam, 401));

  p.run().waitUntilFinish();
}
 
Example 12
Source Project: beam    Source File: Task.java    License: Apache License 2.0
static PCollectionTuple applyTransform(
    PCollection<Integer> numbers, TupleTag<Integer> numBelow100Tag,
    TupleTag<Integer> numAbove100Tag) {

  return numbers.apply(ParDo.of(new DoFn<Integer, Integer>() {

    @ProcessElement
    public void processElement(@Element Integer number, MultiOutputReceiver out) {
      if (number <= 100) {
        out.get(numBelow100Tag).output(number);
      } else {
        out.get(numAbove100Tag).output(number);
      }
    }

  }).withOutputTags(numBelow100Tag, TupleTagList.of(numAbove100Tag)));
}
 
Example 13
Source Project: beam    Source File: BeamSideInputLookupJoinRelTest.java    License: Apache License 2.0
@Test
public void testLookupTableRightOuterJoinWithBoundedTable() throws Exception {
  String sql =
      "SELECT o1.order_id, o2.site_name FROM "
          + " SITE_LKP o2 "
          + " RIGHT OUTER JOIN "
          + " ORDER_DETAILS1 o1 "
          + " on "
          + " o1.site_id=o2.site_id ";
  PCollection<Row> rows = compilePipeline(sql, pipeline);
  PAssert.that(rows.apply(ParDo.of(new TestUtils.BeamSqlRow2StringDoFn())))
      .containsInAnyOrder(
          TestUtils.RowsBuilder.ofNullable(
                  Schema.FieldType.INT32,
                  "order_id",
                  nullable,
                  Schema.FieldType.STRING,
                  "site_name",
                  nullable)
              .addRows(1, "SITE1")
              .addRows(2, null)
              .addRows(3, null)
              .getStringRows());
  pipeline.run();
}
 
Example 14
Source Project: beam    Source File: DirectRunnerTest.java    License: Apache License 2.0
/**
 * Tests that a {@link DoFn} that mutates an input with a bad equals() still fails in the {@link
 * DirectRunner}.
 */
@Test
public void testMutatingInputCoderDoFnError() throws Exception {
  Pipeline pipeline = getPipeline();

  pipeline
      .apply(Create.of(new byte[] {0x1, 0x2, 0x3}, new byte[] {0x4, 0x5, 0x6}))
      .apply(
          ParDo.of(
              new DoFn<byte[], Integer>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  byte[] inputArray = c.element();
                  inputArray[0] = 0xa;
                  c.output(13);
                }
              }));

  thrown.expect(IllegalMutationException.class);
  thrown.expectMessage("Input");
  thrown.expectMessage("must not be mutated");
  pipeline.run();
}
 
Example 15
Source Project: beam    Source File: Log.java    License: Apache License 2.0
@Override
public PCollection<T> expand(PCollection<T> input) {
  return input.apply(
      ParDo.of(
          new DoFn<T, T>() {

            @ProcessElement
            public void processElement(
                @Element T element, OutputReceiver<T> out, BoundedWindow window) {

              String message = prefix + element.toString();

              if (!(window instanceof GlobalWindow)) {
                message = message + "  Window:" + window.toString();
              }

              LOG.info(message);

              out.output(element);
            }
          }));
}
 
Example 16
Source Project: DataflowTemplates    Source File: BulkCompressorTest.java    License: Apache License 2.0
/** Tests that the {@link BulkCompressor.Compressor} performs compression properly. */
@Test
public void testCompressFile() throws Exception {
  // Setup test
  final Compression compression = Compression.GZIP;

  final ValueProvider<String> outputDirectoryProvider =
      pipeline.newProvider(tempFolderCompressedPath.toString());

  final ValueProvider<Compression> compressionProvider = StaticValueProvider.of(compression);

  final Metadata metadata = FileSystems.matchSingleFileSpec(textFile.toString());

  // Execute the compressor
  PCollection<String> lines = pipeline
      .apply("Create File Input", Create.of(metadata))
      .apply("Compress", ParDo.of(new Compressor(outputDirectoryProvider, compressionProvider)))
      .apply("Read the Files", TextIO.readAll().withCompression(Compression.AUTO));

  // Test the result
  PAssert.that(lines).containsInAnyOrder(FILE_CONTENT);
  pipeline.run();
}
 
Example 17
Source Project: beam    Source File: DirectRunnerTest.java    License: Apache License 2.0
/**
 * Tests that a {@link DoFn} that mutates an output with a good equals() fails in the {@link
 * DirectRunner}.
 */
@Test
public void testMutatingOutputThenTerminateDoFnError() throws Exception {
  Pipeline pipeline = getPipeline();

  pipeline
      .apply(Create.of(42))
      .apply(
          ParDo.of(
              new DoFn<Integer, List<Integer>>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  List<Integer> outputList = Arrays.asList(1, 2, 3, 4);
                  c.output(outputList);
                  outputList.set(0, 37);
                }
              }));

  thrown.expect(IllegalMutationException.class);
  thrown.expectMessage("output");
  thrown.expectMessage("must not be mutated");
  pipeline.run();
}
 
Example 18
Source Project: feast    Source File: BigQueryDeadletterSink.java    License: Apache License 2.0
@Override
public PDone expand(PCollection<FailedElement> input) {
  TimePartitioning partition = new TimePartitioning().setType("DAY");
  partition.setField(TIMESTAMP_COLUMN);
  input
      .apply("FailedElementToTableRow", ParDo.of(new FailedElementToTableRowFn()))
      .apply(
          "WriteFailedElementsToBigQuery",
          BigQueryIO.writeTableRows()
              .to(getTableSpec())
              .withJsonSchema(getJsonSchema())
              .withTimePartitioning(partition)
              .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(WriteDisposition.WRITE_APPEND));
  return PDone.in(input.getPipeline());
}
 
Example 19
Source Project: DataflowTemplates    Source File: BigQueryMerger.java    License: Apache License 2.0
@Override
public PCollection<KV<K, V>> expand(PCollection<KV<K, V>> input) {
  return input
      .apply(
          Window.<KV<K, V>>into(new GlobalWindows())
              .discardingFiredPanes()
              .triggering(
                  Repeatedly.forever(
                      AfterProcessingTime.pastFirstElementInPane()
                          .plusDelayOf(Duration.ZERO)
                          .alignedTo(intervalDuration, org.joda.time.Instant.now()))))
      .apply(GroupByKey.create())
      .apply(
          ParDo.of(
              new DoFn<KV<K, Iterable<V>>, KV<K, V>>() {
                @ProcessElement
                public void process(ProcessContext c) {
                  LOG.debug(
                      "TS: {} | Element: {} | Pane: {}", c.timestamp(), c.element(), c.pane());
                  Iterator<V> it = c.element().getValue().iterator();
                  if (it.hasNext()) {
                    c.output(KV.of(c.element().getKey(), it.next()));
                  }
                }
              }));
}
 
Example 20
Source Project: beam    Source File: StarterPipeline.java    License: Apache License 2.0
public static void main(String[] args) {
  Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).withValidation().create());

  p.apply(Create.of("Hello", "World"))
      .apply(
          MapElements.via(
              new SimpleFunction<String, String>() {
                @Override
                public String apply(String input) {
                  return input.toUpperCase();
                }
              }))
      .apply(
          ParDo.of(
              new DoFn<String, Void>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  LOG.info(c.element());
                }
              }));

  p.run();
}
 
Example 21
Source Project: beam    Source File: ParDoTest.java    License: Apache License 2.0
@Test
public void testSideInputAsMap() {
  PCollectionView<Map<String, Integer>> sideInputView =
      pipeline
          .apply("Create sideInput", Create.of(KV.of("key1", 1), KV.of("key2", 2)))
          .apply(View.asMap());
  PCollection<Integer> input =
      pipeline
          .apply("Create input", Create.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
          .apply(
              ParDo.of(
                      new DoFn<Integer, Integer>() {
                        @ProcessElement
                        public void processElement(ProcessContext c) {
                          Map<String, Integer> sideInputValue = c.sideInput(sideInputView);
                          if (!sideInputValue.containsKey("key" + c.element())) {
                            c.output(c.element());
                          }
                        }
                      })
                  .withSideInputs(sideInputView));
  PAssert.that(input).containsInAnyOrder(3, 4, 5, 6, 7, 8, 9, 10);
  pipeline.run();
}
 
Example 22
Source Project: beam    Source File: NexmarkLauncher.java    License: Apache License 2.0
/** Return source of events from Kafka. */
private PCollection<Event> sourceEventsFromKafka(Pipeline p, final Instant now) {
  checkArgument((options.getBootstrapServers() != null), "Missing --bootstrapServers");
  NexmarkUtils.console("Reading events from Kafka Topic %s", options.getKafkaTopic());

  KafkaIO.Read<Long, byte[]> read =
      KafkaIO.<Long, byte[]>read()
          .withBootstrapServers(options.getBootstrapServers())
          .withTopic(options.getKafkaTopic())
          .withKeyDeserializer(LongDeserializer.class)
          .withValueDeserializer(ByteArrayDeserializer.class)
          .withStartReadTime(now)
          .withMaxNumRecords(
              options.getNumEvents() != null ? options.getNumEvents() : Long.MAX_VALUE);

  return p.apply(queryName + ".ReadKafkaEvents", read.withoutMetadata())
      .apply(queryName + ".KafkaToEvents", ParDo.of(BYTEARRAY_TO_EVENT));
}
 
Example 23
Source Project: beam    Source File: TFRecordIOTest.java    License: Apache License 2.0
private void runTestWrite(String[] elems, String... base64) throws IOException {
  File tmpFile =
      Files.createTempFile(tempFolder.getRoot().toPath(), "file", ".tfrecords").toFile();
  String filename = tmpFile.getPath();

  PCollection<byte[]> input =
      writePipeline
          .apply(Create.of(Arrays.asList(elems)))
          .apply(ParDo.of(new StringToByteArray()));

  TFRecordIO.Write write = TFRecordIO.write().to(filename).withoutSharding();
  input.apply(write);

  writePipeline.run();

  FileInputStream fis = new FileInputStream(tmpFile);
  String written = BaseEncoding.base64().encode(ByteStreams.toByteArray(fis));
  // Bytes written may vary depending on the order of elems.
  assertThat(written, is(in(base64)));
}
 
Example 24
Source Project: hazelcast-jet-demos    Source File: MyBeamJob.java    License: Apache License 2.0
public static Pipeline build(PipelineOptions pipelineOptions) {

  Pipeline pipeline = Pipeline.create(pipelineOptions);

  pipeline
      .apply("unbounded-source", Read.from(new MyUnboundedSource("beam-input")))
      .apply("reformat-and-timestamp", ParDo.of(new MyEnrichAndReformatFn()))
      .apply(
          "window",
          Window.<String>into(FixedWindows.of(ONE_SECOND))
              .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane()))
              .discardingFiredPanes()
              .withAllowedLateness(ONE_SECOND))
      .apply(
          "sink",
          FileIO.<String>write()
              .via(TextIO.sink())
              .to(".")
              .withPrefix("beam-output")
              .withNumShards(1));

  return pipeline;
}
 
Example 25
Source Project: deployment-examples    Source File: UserScoreTest.java    License: MIT License
/** Test that bad input data is dropped appropriately. */
@Test
@Category(ValidatesRunner.class)
public void testUserScoresBadInput() throws Exception {

  PCollection<String> input = p.apply(Create.of(GAME_EVENTS2).withCoder(StringUtf8Coder.of()));

  PCollection<KV<String, Integer>> extract =
      input
          .apply(ParDo.of(new ParseEventFn()))
          .apply(
              MapElements.into(
                      TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
                  .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())));

  PAssert.that(extract).empty();

  p.run().waitUntilFinish();
}
 
Example 26
Source Project: beam    Source File: UserScore.java    License: Apache License 2.0
/** Run a batch pipeline. */
// [START DocInclude_USMain]
public static void main(String[] args) throws Exception {
  // Begin constructing a pipeline configured by commandline flags.
  Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
  Pipeline pipeline = Pipeline.create(options);

  // Read events from a text file and parse them.
  pipeline
      .apply(TextIO.read().from(options.getInput()))
      .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
      // Extract and sum username/score pairs from the event data.
      .apply("ExtractUserScore", new ExtractAndSumScore("user"))
      .apply(
          "WriteUserScoreSums", new WriteToText<>(options.getOutput(), configureOutput(), false));

  // Run the batch pipeline.
  pipeline.run().waitUntilFinish();
}
 
Example 27
Source Project: beam    Source File: NexmarkLauncher.java    License: Apache License 2.0
/** Return source of events from Pubsub. */
private PCollection<Event> sourceEventsFromPubsub(Pipeline p) {
  NexmarkUtils.console("Reading events from Pubsub %s", pubsubSubscription);

  PubsubIO.Read<PubsubMessage> io =
      PubsubIO.readMessagesWithAttributes()
          .fromSubscription(pubsubSubscription)
          .withIdAttribute(NexmarkUtils.PUBSUB_ID);
  if (!configuration.usePubsubPublishTime) {
    io = io.withTimestampAttribute(NexmarkUtils.PUBSUB_TIMESTAMP);
  }

  return p.apply(queryName + ".ReadPubsubEvents", io)
      .apply(queryName + ".PubsubMessageToEvent", ParDo.of(new PubsubMessageEventDoFn()));
}
 
Example 28
Source Project: beam    Source File: KafkaIOTest.java    License: Apache License 2.0
@Test
public void testRecordsSink() throws Exception {
  // Simply read from kafka source and write to kafka sink using ProducerRecord transform. Then
  // verify the records are correctly published to mock kafka producer.

  int numElements = 1000;

  try (MockProducerWrapper producerWrapper = new MockProducerWrapper()) {

    ProducerSendCompletionThread completionThread =
        new ProducerSendCompletionThread(producerWrapper.mockProducer).start();

    String topic = "test";

    p.apply(mkKafkaReadTransform(numElements, new ValueAsTimestampFn()).withoutMetadata())
        .apply(ParDo.of(new KV2ProducerRecord(topic)))
        .setCoder(ProducerRecordCoder.of(VarIntCoder.of(), VarLongCoder.of()))
        .apply(
            KafkaIO.<Integer, Long>writeRecords()
                .withBootstrapServers("none")
                .withTopic(topic)
                .withKeySerializer(IntegerSerializer.class)
                .withValueSerializer(LongSerializer.class)
                .withInputTimestamp()
                .withProducerFactoryFn(new ProducerFactoryFn(producerWrapper.producerKey)));

    p.run();

    completionThread.shutdown();

    verifyProducerRecords(producerWrapper.mockProducer, topic, numElements, false, true);
  }
}
 
Example 29
Source Project: beam    Source File: SnowflakeIO.java    License: Apache License 2.0
@Override
public PCollection<T> expand(PBegin input) {
  checkArguments();

  String tmpDirName = makeTmpDirName();
  String stagingBucketDir = String.format("%s/%s/", getStagingBucketName(), tmpDirName);

  PCollection<Void> emptyCollection = input.apply(Create.of((Void) null));

  PCollection<T> output =
      emptyCollection
          .apply(
              ParDo.of(
                  new CopyIntoStageFn(
                      getDataSourceProviderFn(),
                      getQuery(),
                      getTable(),
                      getStorageIntegrationName(),
                      stagingBucketDir,
                      getSnowflakeService())))
          .apply(Reshuffle.viaRandomKey())
          .apply(FileIO.matchAll())
          .apply(FileIO.readMatches())
          .apply(readFiles())
          .apply(ParDo.of(new MapCsvToStringArrayFn()))
          .apply(ParDo.of(new MapStringArrayToUserDataFn<>(getCsvMapper())));

  output.setCoder(getCoder());

  emptyCollection
      .apply(Wait.on(output))
      .apply(ParDo.of(new CleanTmpFilesFromGcsFn(stagingBucketDir)));
  return output;
}
 
Example 30
Source Project: beam    Source File: RequiresStableInputParDoOverrides.java    License: Apache License 2.0
/**
 * Returns a {@link PTransformOverrideFactory} that inserts a {@link Reshuffle.ViaRandomKey}
 * before a {@link ParDo.SingleOutput} that uses the {@link RequiresStableInput} annotation.
 */
static <InputT, OutputT>
    PTransformOverrideFactory<
            PCollection<InputT>, PCollection<OutputT>, ParDo.SingleOutput<InputT, OutputT>>
        singleOutputOverrideFactory() {
  return new SingleOutputOverrideFactory<>();
}