org.apache.beam.sdk.values.PCollectionTuple Java Examples

The following examples show how to use org.apache.beam.sdk.values.PCollectionTuple. Each example is drawn from an open-source project; the source file, project, and license are noted above each snippet.
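Before the project examples, here is a minimal, self-contained sketch of the typical PCollectionTuple pattern: a multi-output ParDo produces a PCollectionTuple, and each tagged PCollection is retrieved from it with get(). The class, tag, and value names below are illustrative only and are not taken from any of the projects listed further down.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

public class PCollectionTupleSketch {

  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Tags are declared as anonymous subclasses (note the trailing {}) so their
    // generic type information is preserved and Beam can infer coders.
    final TupleTag<Integer> evenTag = new TupleTag<Integer>("even") {};
    final TupleTag<Integer> oddTag = new TupleTag<Integer>("odd") {};

    PCollection<Integer> numbers = pipeline.apply(Create.of(1, 2, 3, 4, 5));

    // A multi-output ParDo returns a PCollectionTuple keyed by the tags above.
    PCollectionTuple split =
        numbers.apply(
            ParDo.of(
                    new DoFn<Integer, Integer>() {
                      @ProcessElement
                      public void processElement(ProcessContext c) {
                        if (c.element() % 2 == 0) {
                          c.output(c.element()); // main output (evenTag)
                        } else {
                          c.output(oddTag, c.element()); // additional tagged output
                        }
                      }
                    })
                .withOutputTags(evenTag, TupleTagList.of(oddTag)));

    // Individual PCollections are retrieved from the tuple by tag.
    PCollection<Integer> evens = split.get(evenTag);
    PCollection<Integer> odds = split.get(oddTag);

    pipeline.run().waitUntilFinish();
  }
}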
Example #1
Source File: FileIndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Splits the index summaries into two branches according to the given ratio, enriches
 * one branch with Cloud Natural Language entities, and merges the branches back together.
 *
 * @param indexes the content index summaries to process
 * @param ratio the split ratio passed to {@link SplitAB}
 * @return the merged collection of index summaries
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
		PCollection<ContentIndexSummary> indexes, Float ratio) {
	
	PCollectionTuple splitAB = indexes
		.apply(ParDo.of(new SplitAB(ratio))
			.withOutputTags(PipelineTags.BranchA,  
				TupleTagList.of(PipelineTags.BranchB))); 
	
	PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
	PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);
	
	PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
		ParDo.of(new EnrichWithCNLPEntities()));
	
	//Merge all collections with WebResource table records
	PCollectionList<ContentIndexSummary> contentIndexSummariesList = 
		PCollectionList.of(branchACol).and(enrichedBCol);
	PCollection<ContentIndexSummary> allIndexSummaries = 
		contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());

	return allIndexSummaries;
}
 
Example #2
Source File: BigQueryConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PBegin begin) {
  return begin
      .apply(
          "AvroToEntity",
          BigQueryIO.read(
                  AvroToEntity.newBuilder()
                      .setEntityKind(entityKind())
                      .setUniqueNameColumn(uniqueNameColumn())
                      .setNamespace(namespace())
                      .build())
              .fromQuery(query())
              .withoutValidation()
              .withTemplateCompatibility()
              .usingStandardSql())
      .apply(
          "CheckNoKey",
          CheckNoKey.newBuilder()
              .setFailureTag(failureTag())
              .setSuccessTag(successTag())
              .build());
}
 
Example #3
Source File: BatchStatefulParDoOverrides.java    From beam with Apache License 2.0
@Override
public PCollectionTuple expand(PCollection<KV<K, InputT>> input) {
  DoFn<KV<K, InputT>, OutputT> fn = originalParDo.getFn();
  verifyFnIsStateful(fn);
  DataflowRunner.verifyDoFnSupportedBatch(fn);
  DataflowRunner.verifyStateSupportForWindowingStrategy(input.getWindowingStrategy());

  if (isFnApi) {
    return input.apply(Reshuffle.of()).apply(originalParDo);
  }

  PTransform<
          PCollection<? extends KV<K, Iterable<KV<Instant, WindowedValue<KV<K, InputT>>>>>>,
          PCollectionTuple>
      statefulParDo =
          ParDo.of(new BatchStatefulDoFn<>(fn))
              .withSideInputs(originalParDo.getSideInputs())
              .withOutputTags(
                  originalParDo.getMainOutputTag(), originalParDo.getAdditionalOutputTags());

  return input.apply(new GbkBeforeStatefulParDo<>()).apply(statefulParDo);
}
 
Example #4
Source File: BeamSqlDslArrayTest.java    From beam with Apache License 2.0
@Test
public void testUnnestLiteral() {
  PCollection<Row> input =
      pipeline.apply(
          "boundedInput1",
          Create.empty(TypeDescriptor.of(Row.class)).withRowSchema(INPUT_SCHEMA));

  // Because we have a multi-part FROM, the DSL considers it multi-input
  TupleTag<Row> mainTag = new TupleTag<Row>("main") {};
  PCollectionTuple inputTuple = PCollectionTuple.of(mainTag, input);

  Schema resultType = Schema.builder().addStringField("f_string").build();

  PCollection<Row> result =
      inputTuple.apply(
          "sqlQuery", SqlTransform.query("SELECT * FROM UNNEST (ARRAY ['a', 'b', 'c'])"));

  PAssert.that(result)
      .containsInAnyOrder(
          Row.withSchema(resultType).addValues("a").build(),
          Row.withSchema(resultType).addValues("b").build(),
          Row.withSchema(resultType).addValues("c").build());

  pipeline.run();
}
 
Example #5
Source File: DatastoreConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PCollection<Entity> entity) {
  TupleTag<Entity> goodTag = new TupleTag<>();

  // Because DatastoreIO writes to Datastore non-transactionally, writing the same entity more
  // than once in the same commit is not supported (error: "A non-transactional commit may not
  // contain multiple mutations affecting the same entity"). Messages with the same key are
  // therefore not written to Datastore and are instead routed to an error PCollection for
  // further handling downstream.
  PCollectionTuple entities =
      entity.apply(
          "CheckSameKey",
          CheckSameKey.newBuilder().setErrorTag(errorTag()).setGoodTag(goodTag).build());
  entities
      .get(goodTag)
      .apply("WriteToDatastore", DatastoreIO.v1().write().withProjectId(projectId()));
  return entities;
}
 
Example #6
Source File: BigQueryConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PCollection<FailsafeElement<T, String>> failsafeElements) {
  return failsafeElements.apply(
      "JsonToTableRow",
      ParDo.of(
              new DoFn<FailsafeElement<T, String>, TableRow>() {
                @ProcessElement
                public void processElement(ProcessContext context) {
                  FailsafeElement<T, String> element = context.element();
                  String json = element.getPayload();

                  try {
                    TableRow row = convertJsonToTableRow(json);
                    context.output(row);
                  } catch (Exception e) {
                    context.output(
                        failureTag(),
                        FailsafeElement.of(element)
                            .setErrorMessage(e.getMessage())
                            .setStacktrace(Throwables.getStackTraceAsString(e)));
                  }
                }
              })
          .withOutputTags(successTag(), TupleTagList.of(failureTag())));
}
 
Example #7
Source File: SnsIOTest.java    From beam with Apache License 2.0
@Test
public void testCustomCoder() throws Exception {
  final PublishRequest request1 = createSampleMessage("my_first_message");

  final TupleTag<PublishResult> results = new TupleTag<>();
  final AmazonSNS amazonSnsSuccess = getAmazonSnsMockSuccess();
  final MockCoder mockCoder = new MockCoder();

  final PCollectionTuple snsWrites =
      p.apply(Create.of(request1))
          .apply(
              SnsIO.write()
                  .withTopicName(topicName)
                  .withAWSClientsProvider(new Provider(amazonSnsSuccess))
                  .withResultOutputTag(results)
                  .withCoder(mockCoder));

  final PCollection<Long> publishedResultsSize =
      snsWrites
          .get(results)
          .apply(MapElements.into(TypeDescriptors.strings()).via(result -> result.getMessageId()))
          .apply(Count.globally());
  PAssert.that(publishedResultsSize).containsInAnyOrder(ImmutableList.of(1L));
  p.run().waitUntilFinish();
  assertThat(mockCoder.captured).isNotNull();
}
 
Example #8
Source File: PTransformTranslationTest.java    From beam with Apache License 2.0
private static AppliedPTransform<?, ?, ?> multiMultiParDo(Pipeline pipeline) {
  PCollectionView<String> view = pipeline.apply(Create.of("foo")).apply(View.asSingleton());
  PCollection<Long> input = pipeline.apply(GenerateSequence.from(0));
  ParDo.MultiOutput<Long, KV<Long, String>> parDo =
      ParDo.of(new TestDoFn())
          .withSideInputs(view)
          .withOutputTags(
              new TupleTag<KV<Long, String>>() {},
              TupleTagList.of(new TupleTag<KV<String, Long>>() {}));
  PCollectionTuple output = input.apply(parDo);

  Map<TupleTag<?>, PValue> inputs = new HashMap<>();
  inputs.putAll(parDo.getAdditionalInputs());
  inputs.putAll(input.expand());

  return AppliedPTransform
      .<PCollection<Long>, PCollectionTuple, ParDo.MultiOutput<Long, KV<Long, String>>>of(
          "MultiParDoInAndOut", inputs, output.expand(), parDo, pipeline);
}
 
Example #9
Source File: SnsIOTest.java    From beam with Apache License 2.0
@Test
public void testDataWritesToSNS() {
  final PublishRequest request1 = createSampleMessage("my_first_message");
  final PublishRequest request2 = createSampleMessage("my_second_message");

  final TupleTag<PublishResult> results = new TupleTag<>();
  final AmazonSNS amazonSnsSuccess = getAmazonSnsMockSuccess();

  final PCollectionTuple snsWrites =
      p.apply(Create.of(request1, request2))
          .apply(
              SnsIO.write()
                  .withTopicName(topicName)
                  .withRetryConfiguration(
                      SnsIO.RetryConfiguration.create(
                          5, org.joda.time.Duration.standardMinutes(1)))
                  .withAWSClientsProvider(new Provider(amazonSnsSuccess))
                  .withResultOutputTag(results));

  final PCollection<Long> publishedResultsSize = snsWrites.get(results).apply(Count.globally());
  PAssert.that(publishedResultsSize).containsInAnyOrder(ImmutableList.of(2L));
  p.run().waitUntilFinish();
}
 
Example #10
Source File: BeamSqlDslArrayTest.java    From beam with Apache License 2.0
@Test
public void testUnnestNamedLiteral() {
  PCollection<Row> input =
      pipeline.apply(
          "boundedInput1",
          Create.empty(TypeDescriptor.of(Row.class)).withRowSchema(INPUT_SCHEMA));

  // Because we have a multi-part FROM, the DSL considers it multi-input
  TupleTag<Row> mainTag = new TupleTag<Row>("main") {};
  PCollectionTuple inputTuple = PCollectionTuple.of(mainTag, input);

  Schema resultType = Schema.builder().addStringField("f_string").build();

  PCollection<Row> result =
      inputTuple.apply(
          "sqlQuery",
          SqlTransform.query("SELECT * FROM UNNEST (ARRAY ['a', 'b', 'c']) AS t(f_string)"));

  PAssert.that(result)
      .containsInAnyOrder(
          Row.withSchema(resultType).addValues("a").build(),
          Row.withSchema(resultType).addValues("b").build(),
          Row.withSchema(resultType).addValues("c").build());

  pipeline.run();
}
 
Example #11
Source File: ParDoTest.java    From beam with Apache License 2.0
@Test
public void testTaggedOutputUnregisteredExplicitCoder() throws Exception {
  pipeline.enableAbandonedNodeEnforcement(false);

  PCollection<Integer> input = pipeline.apply(Create.of(Arrays.asList(1, 2, 3)));

  final TupleTag<Integer> mainOutputTag = new TupleTag<>("main");
  final TupleTag<TestDummy> additionalOutputTag = new TupleTag<>("unregisteredSide");
  ParDo.MultiOutput<Integer, Integer> pardo =
      ParDo.of(new TaggedOutputDummyFn(mainOutputTag, additionalOutputTag))
          .withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag));
  PCollectionTuple outputTuple = input.apply(pardo);

  outputTuple.get(additionalOutputTag).setCoder(new TestDummyCoder());

  outputTuple.get(additionalOutputTag).apply(View.asSingleton());

  assertEquals(new TestDummyCoder(), outputTuple.get(additionalOutputTag).getCoder());
  outputTuple
      .get(additionalOutputTag)
      .finishSpecifyingOutput("ParDo", input, pardo); // Check for crashes
  assertEquals(
      new TestDummyCoder(),
      outputTuple.get(additionalOutputTag).getCoder()); // Check for corruption
}
 
Example #12
Source File: PubSubToMongoDB.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollectionTuple expand(PCollection<PubsubMessage> input) {

  // Map the incoming messages into FailsafeElements so we can recover from failures
  // across multiple transforms.
  PCollection<FailsafeElement<PubsubMessage, String>> failsafeElements =
          input.apply("MapToRecord", ParDo.of(new PubsubMessageToFailsafeElementFn()));

  // If a Udf is supplied then use it to parse the PubSubMessages.
  if (javascriptTextTransformGcsPath() != null) {
    return failsafeElements.apply(
            "InvokeUDF",
            JavascriptTextTransformer.FailsafeJavascriptUdf.<PubsubMessage>newBuilder()
                    .setFileSystemPath(javascriptTextTransformGcsPath())
                    .setFunctionName(javascriptTextTransformFunctionName())
                    .setSuccessTag(TRANSFORM_OUT)
                    .setFailureTag(TRANSFORM_DEADLETTER_OUT)
                    .build());
  } else {
    return failsafeElements.apply(
            "ProcessPubSubMessages",
            ParDo.of(new ProcessFailsafePubSubFn())
                    .withOutputTags(TRANSFORM_OUT, TupleTagList.of(TRANSFORM_DEADLETTER_OUT)));
  }
}
 
Example #13
Source File: ParDoTest.java    From beam with Apache License 2.0
@Test
@Category(ValidatesRunner.class)
public void testParDoWithEmptyTaggedOutput() {
  TupleTag<String> mainOutputTag = new TupleTag<String>("main") {};
  TupleTag<String> additionalOutputTag1 = new TupleTag<String>("additional1") {};
  TupleTag<String> additionalOutputTag2 = new TupleTag<String>("additional2") {};

  PCollectionTuple outputs =
      pipeline
          .apply(Create.empty(VarIntCoder.of()))
          .apply(
              ParDo.of(new TestNoOutputDoFn())
                  .withOutputTags(
                      mainOutputTag,
                      TupleTagList.of(additionalOutputTag1).and(additionalOutputTag2)));

  PAssert.that(outputs.get(mainOutputTag)).empty();

  PAssert.that(outputs.get(additionalOutputTag1)).empty();
  PAssert.that(outputs.get(additionalOutputTag2)).empty();

  pipeline.run();
}
 
Example #14
Source File: ParDoTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testMainOutputUnregisteredExplicitCoder() {

  PCollection<Integer> input = pipeline.apply(Create.of(Arrays.asList(1, 2, 3)));

  final TupleTag<TestDummy> mainOutputTag = new TupleTag<>("unregisteredMain");
  final TupleTag<Integer> additionalOutputTag = new TupleTag<Integer>("additionalOutput") {};
  PCollectionTuple outputTuple =
      input.apply(
          ParDo.of(new MainOutputDummyFn(mainOutputTag, additionalOutputTag))
              .withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag)));

  outputTuple.get(mainOutputTag).setCoder(new TestDummyCoder());

  pipeline.run();
}
 
Example #15
Source File: SplittableParDoViaKeyedWorkItems.java    From beam with Apache License 2.0
@Override
public PCollectionTuple expand(
    PCollection<KeyedWorkItem<byte[], KV<InputT, RestrictionT>>> input) {
  return ProcessKeyedElements.createPrimitiveOutputFor(
      input,
      original.getFn(),
      original.getMainOutputTag(),
      original.getAdditionalOutputTags(),
      original.getOutputTagsToCoders(),
      original.getInputWindowingStrategy());
}
 
Example #16
Source File: CsvConvertersTest.java    From DataflowTemplates with Apache License 2.0
/**
 * Tests {@link CsvConverters.LineToFailsafeJson} converts a line to a {@link FailsafeElement}
 * correctly using a JSON schema.
 */
@Test
public void testLineToFailsafeJsonNoHeadersJsonSchema() {

  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> lines =
      pipeline.apply(Create.of(RECORD_STRING).withCoder(StringUtf8Coder.of()));

  PCollectionTuple linesTuple = PCollectionTuple.of(CSV_LINES, lines);

  PCollectionTuple failsafe =
      linesTuple.apply(
          "TestLineToFailsafeJson",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(",")
              .setUdfFileSystemPath(null)
              .setUdfFunctionName(null)
              .setJsonSchemaPath(TEST_JSON_SCHEMA__PATH)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfOutputTag(PROCESSING_OUT)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .build());

  PAssert.that(failsafe.get(PROCESSING_OUT))
      .satisfies(
          collection -> {
            FailsafeElement<String, String> result = collection.iterator().next();
            assertThat(result.getPayload(), is(equalTo(JSON_STRING_RECORD)));
            return null;
          });

  pipeline.run();
}
 
Example #17
Source File: RequiresStableInputParDoOverrides.java    From beam with Apache License 2.0
/**
 * Returns a {@link PTransformOverrideFactory} that inserts a {@link Reshuffle.ViaRandomKey}
 * before a {@link ParDo.MultiOutput} that uses the {@link RequiresStableInput} annotation.
 */
static <InputT, OutputT>
    PTransformOverrideFactory<
            PCollection<InputT>, PCollectionTuple, ParDo.MultiOutput<InputT, OutputT>>
        multiOutputOverrideFactory() {
  return new MultiOutputOverrideFactory<>();
}
 
Example #18
Source File: CsvConvertersTest.java    From DataflowTemplates with Apache License 2.0
/** Tests that an exception is thrown if different headers are found. */
@Test(expected = RuntimeException.class)
public void testDifferentHeaders() {

  FailsafeElementCoder<String, String> coder = FAILSAFE_ELEMENT_CODER;

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  PCollection<String> headers =
      pipeline.apply("CreateInput", Create.of(HEADER_STRING, "wrong,header,thing\n"));
  PCollection<String> lines = pipeline.apply("Create lines", Create.of(RECORD_STRING));

  PCollectionTuple readCsvHeadersOut =
      PCollectionTuple.of(CSV_HEADERS, headers).and(CSV_LINES, lines);

  PCollectionTuple test =
      readCsvHeadersOut.apply(
          "TestDifferentHeaders",
          CsvConverters.LineToFailsafeJson.newBuilder()
              .setDelimiter(",")
              .setUdfFileSystemPath(null)
              .setUdfFunctionName(null)
              .setJsonSchemaPath(null)
              .setHeaderTag(CSV_HEADERS)
              .setLineTag(CSV_LINES)
              .setUdfDeadletterTag(PROCESSING_DEADLETTER_OUT)
              .setUdfOutputTag(PROCESSING_OUT)
              .build());

  pipeline.run();
}
 
Example #19
Source File: ParDoTest.java    From beam with Apache License 2.0
@Test
@Category({ValidatesRunner.class, UsesSideInputs.class})
public void testMultiOutputParDoWithSideInputsIsCumulative() {

  List<Integer> inputs = Arrays.asList(3, -42, 666);

  final TupleTag<String> mainOutputTag = new TupleTag<String>("main") {};
  final TupleTag<Void> additionalOutputTag = new TupleTag<Void>("output") {};

  PCollectionView<Integer> sideInput1 =
      pipeline
          .apply("CreateSideInput1", Create.of(11))
          .apply("ViewSideInput1", View.asSingleton());
  PCollectionView<Integer> sideInputUnread =
      pipeline
          .apply("CreateSideInputUnread", Create.of(-3333))
          .apply("ViewSideInputUnread", View.asSingleton());
  PCollectionView<Integer> sideInput2 =
      pipeline
          .apply("CreateSideInput2", Create.of(222))
          .apply("ViewSideInput2", View.asSingleton());

  PCollectionTuple outputs =
      pipeline
          .apply(Create.of(inputs))
          .apply(
              ParDo.of(new TestDoFn(Arrays.asList(sideInput1, sideInput2), Arrays.asList()))
                  .withSideInputs(sideInput1)
                  .withSideInputs(sideInputUnread)
                  .withSideInputs(sideInput2)
                  .withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag)));

  PAssert.that(outputs.get(mainOutputTag))
      .satisfies(ParDoTest.HasExpectedOutput.forInput(inputs).andSideInputs(11, 222));

  pipeline.run();
}
 
Example #20
Source File: SplittableParDo.java    From beam with Apache License 2.0
@Override
public PCollectionTuple expand(PCollection<InputT> input) {
  Coder<RestrictionT> restrictionCoder =
      DoFnInvokers.invokerFor(doFn)
          .invokeGetRestrictionCoder(input.getPipeline().getCoderRegistry());
  Coder<WatermarkEstimatorStateT> watermarkEstimatorStateCoder =
      DoFnInvokers.invokerFor(doFn)
          .invokeGetWatermarkEstimatorStateCoder(input.getPipeline().getCoderRegistry());
  Coder<KV<InputT, RestrictionT>> splitCoder = KvCoder.of(input.getCoder(), restrictionCoder);

  PCollection<KV<byte[], KV<InputT, RestrictionT>>> keyedRestrictions =
      input
          .apply(
              "Pair with initial restriction",
              ParDo.of(new PairWithRestrictionFn<InputT, OutputT, RestrictionT>(doFn)))
          .setCoder(splitCoder)
          .apply("Split restriction", ParDo.of(new SplitRestrictionFn<>(doFn)))
          .setCoder(splitCoder)
          // ProcessFn requires all input elements to be in a single window and have a single
          // element per work item. This must precede the unique keying so each key has a single
          // associated element.
          .apply("Explode windows", ParDo.of(new ExplodeWindowsFn<>()))
          .apply("Assign unique key", WithKeys.of(new RandomUniqueKeyFn<>()));

  return keyedRestrictions.apply(
      "ProcessKeyedElements",
      new ProcessKeyedElements<>(
          doFn,
          input.getCoder(),
          restrictionCoder,
          watermarkEstimatorStateCoder,
          (WindowingStrategy<InputT, ?>) input.getWindowingStrategy(),
          sideInputs,
          mainOutputTag,
          additionalOutputTags,
          outputTagsToCoders));
}
 
Example #21
Source File: FileIndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Indexes the input documents and, if a Bigtable indexer admin DB is configured,
 * writes documents that failed indexing to the dead letter table.
 *
 * @param options the indexer pipeline options
 * @param contentToIndex the input content to index
 * @return the successfully indexed content summaries
 */
private static PCollection<ContentIndexSummary> indexDocuments(
		IndexerPipelineOptions options,
		PCollection<InputContent> contentToIndex) {
	
	PCollectionTuple alldocuments = contentToIndex
		.apply(ParDo.of(new IndexDocument())
			.withOutputTags(PipelineTags.successfullyIndexed, // main output
				TupleTagList.of(PipelineTags.unsuccessfullyIndexed))); // side output
		
	PCollection<ContentIndexSummary> indexes = alldocuments
		.get(PipelineTags.successfullyIndexed)
		.setCoder(AvroCoder.of(ContentIndexSummary.class));
	
	// if the Bigtable admin DB is set, write into dead letter table
	if (options.getBigtableIndexerAdminDB() != null) {
		
		PCollection<InputContent> unprocessedDocuments = alldocuments
			.get(PipelineTags.unsuccessfullyIndexed);
		
		BigtableOptions.Builder optionsBuilder =
			new BigtableOptions.Builder()
				.setProjectId(options.getProject())
				.setInstanceId(options.getBigtableIndexerAdminDB());
		BigtableOptions bigtableOptions = optionsBuilder.build();
		
		unprocessedDocuments
			.apply(ParDo.of(new CreateDeadLetterEntries()))
			.apply("Write to Dead Letter table in Bigtable", BigtableIO.write()
					.withBigtableOptions(bigtableOptions)
					.withTableId(IndexerPipelineUtils.DEAD_LETTER_TABLE));
	}
	
	return indexes;
}
 
Example #22
Source File: MapElements.java    From beam with Apache License 2.0
@Override
public WithFailures.Result<PCollection<OutputT>, FailureT> expand(PCollection<InputT> input) {
  checkArgument(exceptionHandler != null, ".exceptionsVia() is required");
  MapFn doFn = new MapFn();
  PCollectionTuple tuple =
      input.apply(
          MapWithFailures.class.getSimpleName(),
          ParDo.of(doFn)
              .withOutputTags(doFn.outputTag, TupleTagList.of(doFn.failureTag))
              .withSideInputs(this.fn.getRequirements().getSideInputs()));
  return WithFailures.Result.of(tuple, doFn.outputTag, doFn.failureTag);
}
 
Example #23
Source File: FilterRowRuntime.java    From components with Apache License 2.0
@Override
public void build(BeamJobContext ctx) {
    String mainLink = ctx.getLinkNameByPortName("input_" + properties.MAIN_CONNECTOR.getName());
    if (!StringUtils.isEmpty(mainLink)) {
        PCollection<IndexedRecord> mainPCollection = ctx.getPCollectionByLinkName(mainLink);
        if (mainPCollection != null) {
            String flowLink = ctx.getLinkNameByPortName("output_" + properties.FLOW_CONNECTOR.getName());
            String rejectLink = ctx.getLinkNameByPortName("output_" + properties.REJECT_CONNECTOR.getName());

            boolean hasFlow = !StringUtils.isEmpty(flowLink);
            boolean hasReject = !StringUtils.isEmpty(rejectLink);

            if (hasFlow && hasReject) {
                // If both of the outputs are present, the DoFn must be used.
                PCollectionTuple outputTuples = mainPCollection.apply(ctx.getPTransformName(),
                        ParDo.of(new FilterRowDoFn(properties)).withOutputTags(flowOutput, TupleTagList.of(rejectOutput)));
                ctx.putPCollectionByLinkName(flowLink, outputTuples.get(flowOutput));
                ctx.putPCollectionByLinkName(rejectLink, outputTuples.get(rejectOutput));
            } else if (hasFlow || hasReject) {
                // If only one of the outputs is present, the predicate can be used for efficiency.
                FilterRowPredicate predicate = hasFlow //
                        ? new FilterRowPredicate(properties) //
                        : new FilterRowPredicate.Negate(properties);
                PCollection<IndexedRecord> output = mainPCollection.apply(ctx.getPTransformName(), Filter.by(predicate));
                ctx.putPCollectionByLinkName(hasFlow ? flowLink : rejectLink, output);
            } else {
                // If neither is specified, do nothing. This component could have been cut from the pipeline.
            }
        }
    }
}
 
Example #24
Source File: BigQueryConvertersTest.java    From DataflowTemplates with Apache License 2.0
/** Tests the {@link BigQueryConverters.FailsafeJsonToTableRow} transform with good input. */
@Test
@Category(NeedsRunner.class)
public void testFailsafeJsonToTableRowValidInput() {
  // Test input
  final String payload = "{\"ticker\": \"GOOGL\", \"price\": 1006.94}";
  final Map<String, String> attributes = ImmutableMap.of("id", "0xDb12", "type", "stock");
  final PubsubMessage message = new PubsubMessage(payload.getBytes(), attributes);

  final FailsafeElement<PubsubMessage, String> input = FailsafeElement.of(message, payload);

  // Expected Output
  TableRow expectedRow = new TableRow().set("ticker", "GOOGL").set("price", 1006.94);

  // Register the coder for the pipeline. This prevents having to invoke .setCoder() on
  // many transforms.
  FailsafeElementCoder<PubsubMessage, String> coder =
      FailsafeElementCoder.of(PubsubMessageWithAttributesCoder.of(), StringUtf8Coder.of());

  CoderRegistry coderRegistry = pipeline.getCoderRegistry();
  coderRegistry.registerCoderForType(coder.getEncodedTypeDescriptor(), coder);

  // Build the pipeline
  PCollectionTuple output =
      pipeline
          .apply("CreateInput", Create.of(input).withCoder(coder))
          .apply(
              "JsonToTableRow",
              FailsafeJsonToTableRow.<PubsubMessage>newBuilder()
                  .setSuccessTag(TABLE_ROW_TAG)
                  .setFailureTag(FAILSAFE_ELM_TAG)
                  .build());

  // Assert
  PAssert.that(output.get(TABLE_ROW_TAG)).containsInAnyOrder(expectedRow);
  PAssert.that(output.get(FAILSAFE_ELM_TAG)).empty();

  // Execute the test
  pipeline.run();
}
 
Example #25
Source File: ParDoEvaluatorFactory.java    From beam with Apache License 2.0
/**
 * Creates an evaluator for an arbitrary {@link AppliedPTransform} node, with the pieces of the
 * {@link ParDo} unpacked.
 *
 * <p>This can thus be invoked regardless of whether the types in the {@link AppliedPTransform}
 * correspond with the type in the unpacked {@link DoFn}, side inputs, and output tags.
 */
@SuppressWarnings({"unchecked", "rawtypes"})
DoFnLifecycleManagerRemovingTransformEvaluator<InputT> createEvaluator(
    AppliedPTransform<PCollection<InputT>, PCollectionTuple, ?> application,
    PCollection<InputT> mainInput,
    StructuralKey<?> inputBundleKey,
    List<PCollectionView<?>> sideInputs,
    TupleTag<OutputT> mainOutputTag,
    List<TupleTag<?>> additionalOutputTags,
    DoFnSchemaInformation doFnSchemaInformation,
    Map<String, PCollectionView<?>> sideInputMapping)
    throws Exception {
  String stepName = evaluationContext.getStepName(application);
  DirectStepContext stepContext =
      evaluationContext.getExecutionContext(application, inputBundleKey).getStepContext(stepName);

  DoFnLifecycleManager fnManager = fnClones.getUnchecked(application);

  return DoFnLifecycleManagerRemovingTransformEvaluator.wrapping(
      createParDoEvaluator(
          application,
          inputBundleKey,
          mainInput,
          sideInputs,
          mainOutputTag,
          additionalOutputTags,
          stepContext,
          fnManager.get(),
          doFnSchemaInformation,
          sideInputMapping,
          fnManager),
      fnManager);
}
 
Example #26
Source File: BeamSqlNonAsciiTest.java    From beam with Apache License 2.0
@Test
public void testDefaultCharsetLiteral() {
  String sql = "SELECT * FROM TABLE_A WHERE f_string = '第四行'";

  PCollection<Row> result =
      PCollectionTuple.of(new TupleTag<>("TABLE_A"), boundedInput1)
          .apply("testCompositeFilter", SqlTransform.query(sql));

  PAssert.that(result).containsInAnyOrder(rowsInTableA.get(3));

  pipeline.run().waitUntilFinish();
}
 
Example #27
Source File: TextTableProvider.java    From beam with Apache License 2.0
@Override
public PCollection<Row> expand(PCollection<String> input) {
  PCollectionTuple rows =
      input.apply(
          ParDo.of(
                  new DoFn<String, Row>() {
                    @ProcessElement
                    public void processElement(ProcessContext context) {
                      try {
                        context.output(jsonToRow(getObjectMapper(), context.element()));
                      } catch (UnsupportedRowJsonException jsonException) {
                        if (deadLetterFile() != null) {
                          context.output(DLF_TAG, context.element());
                        } else {
                          throw new RuntimeException("Error parsing JSON", jsonException);
                        }
                      }
                    }
                  })
              .withOutputTags(
                  MAIN_TAG,
                  deadLetterFile() != null ? TupleTagList.of(DLF_TAG) : TupleTagList.empty()));

  if (deadLetterFile() != null) {
    rows.get(DLF_TAG).setCoder(StringUtf8Coder.of()).apply(writeJsonToDlf());
  }
  return rows.get(MAIN_TAG).setRowSchema(schema());
}
 
Example #28
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Splits the input content by the skipIndexing flag into content to index and content to skip.
 *
 * @param contentToProcess the input content to filter
 * @return the two collections, grouped by whether they should be indexed
 */
private static ContentToIndexOrNot filterBasedOnSkipFlag(PCollection<InputContent> contentToProcess) {
	PCollectionTuple indexOrNotBasedOnSkipFlag = contentToProcess
		.apply("Filter items to index based on skipIndexing flag", ParDo.of(new FilterItemsToIndex())
			.withOutputTags(PipelineTags.contentToIndexNotSkippedTag, // main output collection
				TupleTagList.of(PipelineTags.contentNotToIndexSkippedTag))); // side output collection		

	
	ContentToIndexOrNot contentPerSkipFlag = new ContentToIndexOrNot(
		indexOrNotBasedOnSkipFlag.get(PipelineTags.contentToIndexNotSkippedTag), 
		indexOrNotBasedOnSkipFlag.get(PipelineTags.contentNotToIndexSkippedTag));
	
	return contentPerSkipFlag;
}
 
Example #29
Source File: FlattenTest.java    From beam with Apache License 2.0
@Test
@Category(ValidatesRunner.class)
public void testFlattenMultiplePCollectionsHavingMultipleConsumers() {
  PCollection<String> input = p.apply(Create.of("AA", "BBB", "CC"));
  final TupleTag<String> outputEvenLengthTag = new TupleTag<String>() {};
  final TupleTag<String> outputOddLengthTag = new TupleTag<String>() {};

  PCollectionTuple tuple =
      input.apply(
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                      if (c.element().length() % 2 == 0) {
                        c.output(c.element());
                      } else {
                        c.output(outputOddLengthTag, c.element());
                      }
                    }
                  })
              .withOutputTags(outputEvenLengthTag, TupleTagList.of(outputOddLengthTag)));

  PCollection<String> outputEvenLength = tuple.get(outputEvenLengthTag);
  PCollection<String> outputOddLength = tuple.get(outputOddLengthTag);

  PCollection<String> outputMerged =
      PCollectionList.of(outputEvenLength).and(outputOddLength).apply(Flatten.pCollections());

  PAssert.that(outputMerged).containsInAnyOrder("AA", "BBB", "CC");
  PAssert.that(outputEvenLength).containsInAnyOrder("AA", "CC");
  PAssert.that(outputOddLength).containsInAnyOrder("BBB");

  p.run();
}
 
Example #30
Source File: ReplacementOutputsTest.java    From beam with Apache License 2.0
@Test
public void taggedSucceeds() {
  PCollectionTuple original =
      PCollectionTuple.of(intsTag, ints).and(strsTag, strs).and(moreIntsTag, moreInts);

  Map<PValue, ReplacementOutput> replacements =
      ReplacementOutputs.tagged(
          original.expand(),
          PCollectionTuple.of(strsTag, replacementStrs)
              .and(moreIntsTag, moreReplacementInts)
              .and(intsTag, replacementInts));
  assertThat(
      replacements.keySet(),
      Matchers.containsInAnyOrder(replacementStrs, replacementInts, moreReplacementInts));
  ReplacementOutput intsReplacement = replacements.get(replacementInts);
  ReplacementOutput strsReplacement = replacements.get(replacementStrs);
  ReplacementOutput moreIntsReplacement = replacements.get(moreReplacementInts);

  assertThat(
      intsReplacement,
      equalTo(
          ReplacementOutput.of(
              TaggedPValue.of(intsTag, ints), TaggedPValue.of(intsTag, replacementInts))));
  assertThat(
      strsReplacement,
      equalTo(
          ReplacementOutput.of(
              TaggedPValue.of(strsTag, strs), TaggedPValue.of(strsTag, replacementStrs))));
  assertThat(
      moreIntsReplacement,
      equalTo(
          ReplacementOutput.of(
              TaggedPValue.of(moreIntsTag, moreInts),
              TaggedPValue.of(moreIntsTag, moreReplacementInts))));
}