Java Code Examples for org.apache.beam.sdk.values.PCollectionTuple#get()

The following examples show how to use org.apache.beam.sdk.values.PCollectionTuple#get(). Each example is taken from an open source project; the project, source file, and license are listed above the code.
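Before the project examples, a minimal sketch of the shared pattern may help: a ParDo configured with withOutputTags() produces a PCollectionTuple, and get(tag) retrieves the PCollection bound to each TupleTag. The tags and the inline DoFn below are illustrative placeholders rather than code from any of the projects that follow; the usual org.apache.beam.sdk imports are assumed.

final TupleTag<Integer> evenTag = new TupleTag<Integer>() {};
final TupleTag<Integer> oddTag = new TupleTag<Integer>() {};

PCollectionTuple split =
    pipeline
        .apply(Create.of(1, 2, 3, 4))
        .apply(
            ParDo.of(
                    new DoFn<Integer, Integer>() {
                      @ProcessElement
                      public void processElement(@Element Integer n, MultiOutputReceiver out) {
                        // Route each element to the tag it belongs to.
                        if (n % 2 == 0) {
                          out.get(evenTag).output(n);
                        } else {
                          out.get(oddTag).output(n);
                        }
                      }
                    })
                .withOutputTags(evenTag, TupleTagList.of(oddTag)));

// get() returns the PCollection registered under each tag.
PCollection<Integer> evens = split.get(evenTag); // main output
PCollection<Integer> odds = split.get(oddTag);   // additional output

The examples below all follow this shape: withOutputTags() declares the main and additional outputs, and get() pulls each one back out of the returned tuple, often followed by setCoder() or setRowSchema() on the retrieved collection (as in Examples 3, 6, and 10).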
Example 1
Source File: FileIndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Splits the input into two branches by the given ratio and enriches one branch
 * with CNLP entities before merging the branches back together.
 * @param indexes the content index summaries to enrich
 * @param ratio the split ratio used by the SplitAB transform
 * @return the merged collection of enriched and pass-through index summaries
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
		PCollection<ContentIndexSummary> indexes, Float ratio) {
	
	PCollectionTuple splitAB = indexes
		.apply(ParDo.of(new SplitAB(ratio))
			.withOutputTags(PipelineTags.BranchA,  
				TupleTagList.of(PipelineTags.BranchB))); 
	
	PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
	PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);
	
	PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
		ParDo.of(new EnrichWithCNLPEntities()));
	
	//Merge all collections with WebResource table records
	PCollectionList<ContentIndexSummary> contentIndexSummariesList = 
		PCollectionList.of(branchACol).and(enrichedBCol);
	PCollection<ContentIndexSummary> allIndexSummaries = 
		contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());

	indexes = allIndexSummaries;
	return indexes;
}
 
Example 2
Source File: ParDoTest.java    From beam with Apache License 2.0
@Test
@Category(NeedsRunner.class)
public void testMultiOutputChaining() {

  PCollectionTuple filters =
      pipeline.apply(Create.of(Arrays.asList(3, 4, 5, 6))).apply(new MultiFilter());
  PCollection<Integer> by2 = filters.get(MultiFilter.BY2);
  PCollection<Integer> by3 = filters.get(MultiFilter.BY3);

  // Apply additional filters to each operation.
  PCollection<Integer> by2then3 =
      by2.apply("Filter3sAgain", ParDo.of(new MultiFilter.FilterFn(3)));
  PCollection<Integer> by3then2 =
      by3.apply("Filter2sAgain", ParDo.of(new MultiFilter.FilterFn(2)));

  PAssert.that(by2then3).containsInAnyOrder(6);
  PAssert.that(by3then2).containsInAnyOrder(6);
  pipeline.run();
}
 
Example 3
Source File: PubsubIOJsonTable.java    From beam with Apache License 2.0
@Override
public PCollection<Row> buildIOReader(PBegin begin) {
  PCollectionTuple rowsWithDlq =
      begin
          .apply("ReadFromPubsub", readMessagesWithAttributes())
          .apply(
              "PubsubMessageToRow",
              PubsubMessageToRow.builder()
                  .messageSchema(getSchema())
                  .useDlq(config.useDlq())
                  .useFlatSchema(config.getUseFlatSchema())
                  .build());
  rowsWithDlq.get(MAIN_TAG).setRowSchema(getSchema());

  if (config.useDlq()) {
    rowsWithDlq.get(DLQ_TAG).apply(writeMessagesToDlq());
  }

  return rowsWithDlq.get(MAIN_TAG);
}
 
Example 4
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Groups document indexes by a text grouping key and separates soft (near-duplicate) matches.
 * @param indexes the document index summaries to deduplicate
 * @return a POJO containing two PCollections: unique documents and duplicates
 */
private static ContentDuplicateOrNot filterSoftDuplicates(
		PCollection<ContentIndexSummary> indexes) {
	// 
	PCollectionTuple dedupeOrNot = indexes
		.apply("Extract Text grouping key", 
			ParDo.of(new GetContentIndexSummaryKeyFn()))
		.apply("Group by Text grouping key", 
			GroupByKey.<ContentSoftDeduplicationKey, ContentIndexSummary>create())
		.apply("Eliminate Text dupes", 
			ParDo.of(new EliminateTextDupes())
				.withOutputTags(PipelineTags.indexedContentNotToDedupeTag, 
					TupleTagList.of(PipelineTags.indexedContentToDedupeTag))); 	
		
	PCollection<TableRow> dedupedWebresources = 
		dedupeOrNot.get(PipelineTags.indexedContentToDedupeTag)
			.apply(ParDo.of(new CreateWebresourceTableRowFromDupeIndexSummaryFn()));
	
	ContentDuplicateOrNot contentDuplicateOrNot = new ContentDuplicateOrNot(
		dedupeOrNot.get(PipelineTags.indexedContentNotToDedupeTag),
		dedupedWebresources);
	
	return contentDuplicateOrNot;
}
 
Example 5
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Splits the filtered indexes into two branches by the given ratio and enriches one
 * branch with CNLP entities before merging the branches back together.
 * @param filteredIndexes the content index summaries to enrich
 * @param ratio the split ratio used by the SplitAB transform
 * @return the merged collection of enriched and pass-through index summaries
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
		PCollection<ContentIndexSummary> filteredIndexes, Float ratio) {
	
	PCollectionTuple splitAB = filteredIndexes
		.apply(ParDo.of(new SplitAB(ratio))
			.withOutputTags(PipelineTags.BranchA,  
				TupleTagList.of(PipelineTags.BranchB))); 
	
	PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
	PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);
	
	PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
		ParDo.of(new EnrichWithCNLPEntities()));
	
	//Merge all collections with WebResource table records
	PCollectionList<ContentIndexSummary> contentIndexSummariesList = 
		PCollectionList.of(branchACol).and(enrichedBCol);
	PCollection<ContentIndexSummary> allIndexSummaries = 
		contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());

	filteredIndexes = allIndexSummaries;
	return filteredIndexes;
}
 
Example 6
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Indexes the input documents and, when a Bigtable admin DB is configured, writes
 * documents that failed indexing to a dead letter table.
 * @param options the pipeline options, including the optional Bigtable indexer admin DB
 * @param contentToIndex the input content to index
 * @return the successfully indexed content summaries
 */
private static PCollection<ContentIndexSummary> indexDocuments(
		IndexerPipelineOptions options,
		PCollection<InputContent> contentToIndex) {
	
	PCollectionTuple alldocuments = contentToIndex
		.apply(ParDo.of(new IndexDocument())
			.withOutputTags(PipelineTags.successfullyIndexed, // main output
				TupleTagList.of(PipelineTags.unsuccessfullyIndexed))); // side output

	PCollection<ContentIndexSummary> indexes = alldocuments
			.get(PipelineTags.successfullyIndexed)
			.setCoder(AvroCoder.of(ContentIndexSummary.class));

	// if the Bigtable admin DB is set, write into dead letter table
	if (options.getBigtableIndexerAdminDB() != null) {
		
		
		PCollection<InputContent> unprocessedDocuments = alldocuments
			.get(PipelineTags.unsuccessfullyIndexed);
		
		BigtableOptions.Builder optionsBuilder =
			new BigtableOptions.Builder()
				.setProjectId(options.getProject())
				.setInstanceId(options.getBigtableIndexerAdminDB());
		BigtableOptions bigtableOptions = optionsBuilder.build();
		
		unprocessedDocuments
			.apply(ParDo.of(new CreateDeadLetterEntries()))
			.apply("Write to Dead Letter table in Bigtable", BigtableIO.write()
					.withBigtableOptions(bigtableOptions)
					.withTableId(IndexerPipelineUtils.DEAD_LETTER_TABLE));
	}
	
	return indexes;
}
 
Example 7
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Splits the input into content to index and content not to index, based on exact-duplicate
 * detection against already processed documents.
 * @param contentToIndexNotSkipped content that passed the earlier skip filters
 * @param contentNotToIndexSkipped content already marked to be skipped
 * @param pipeline the pipeline used to build the side input of already processed documents
 * @param options the pipeline options controlling the already-processed lookup
 * @return a POJO containing the content to index and the merged content not to index
 */
private static ContentToIndexOrNot filterAlreadyProcessedDocuments(
		PCollection<InputContent> contentToIndexNotSkipped, PCollection<InputContent> contentNotToIndexSkipped,
		Pipeline pipeline, IndexerPipelineOptions options) {
	PCollection<KV<String,Long>> alreadyProcessedDocs = null;
	
	if (!options.getWriteTruncate()) {
		String query = IndexerPipelineUtils.buildBigQueryProcessedDocsQuery(options);
		alreadyProcessedDocs = pipeline
			.apply("Get already processed Documents",BigQueryIO.read().fromQuery(query))
			.apply(ParDo.of(new GetDocumentHashFn()));

	} else {
		Map<String, Long> map = new HashMap<String,Long>();
		alreadyProcessedDocs = pipeline
			.apply("Create empty side input of Docs",
				Create.of(map).withCoder(KvCoder.of(StringUtf8Coder.of(),VarLongCoder.of())));
	}			
	
	final PCollectionView<Map<String,Long>> alreadyProcessedDocsSideInput =  
		alreadyProcessedDocs.apply(View.<String,Long>asMap());
	
	PCollectionTuple indexOrNotBasedOnExactDupes = contentToIndexNotSkipped
		.apply("Extract DocumentHash key", ParDo.of(new GetInputContentDocumentHashFn()))
		.apply("Group by DocumentHash key", GroupByKey.<String, InputContent>create())
		.apply("Eliminate InputContent Dupes", ParDo.of(new EliminateInputContentDupes(alreadyProcessedDocsSideInput))
			.withSideInputs(alreadyProcessedDocsSideInput)
			.withOutputTags(PipelineTags.contentToIndexNotExactDupesTag, // main output collection
				TupleTagList.of(PipelineTags.contentNotToIndexExactDupesTag))); // side output collection	
	
	PCollection<InputContent> contentToIndexNotExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentToIndexNotExactDupesTag);
	PCollection<InputContent> contentNotToIndexExactDupes = indexOrNotBasedOnExactDupes.get(PipelineTags.contentNotToIndexExactDupesTag);
	
	// Merge the sets of items that are dupes or skipped
	PCollectionList<InputContent> contentNotToIndexList = PCollectionList.of(contentNotToIndexExactDupes).and(contentNotToIndexSkipped);
	
	ContentToIndexOrNot content = new ContentToIndexOrNot(contentToIndexNotExactDupes, contentNotToIndexList.apply(Flatten.<InputContent>pCollections()));
	return content;
}
 
Example 8
Source File: FileIndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Indexes the input documents and, when a Bigtable admin DB is configured, writes
 * documents that failed indexing to a dead letter table.
 * @param options the pipeline options, including the optional Bigtable indexer admin DB
 * @param contentToIndex the input content to index
 * @return the successfully indexed content summaries
 */
private static PCollection<ContentIndexSummary> indexDocuments(
		IndexerPipelineOptions options,
		PCollection<InputContent> contentToIndex) {
	
	PCollectionTuple alldocuments = contentToIndex
		.apply(ParDo.of(new IndexDocument())
			.withOutputTags(PipelineTags.successfullyIndexed, // main output
				TupleTagList.of(PipelineTags.unsuccessfullyIndexed))); // side output
		
	PCollection<ContentIndexSummary> indexes = alldocuments
		.get(PipelineTags.successfullyIndexed)
		.setCoder(AvroCoder.of(ContentIndexSummary.class));
	
	// if the Bigtable admin DB is set, write into dead letter table
	if (options.getBigtableIndexerAdminDB() != null) {
		
		PCollection<InputContent> unprocessedDocuments = alldocuments
			.get(PipelineTags.unsuccessfullyIndexed);
		
		BigtableOptions.Builder optionsBuilder =
			new BigtableOptions.Builder()
				.setProjectId(options.getProject())
				.setInstanceId(options.getBigtableIndexerAdminDB());
		BigtableOptions bigtableOptions = optionsBuilder.build();
		
		unprocessedDocuments
			.apply(ParDo.of(new CreateDeadLetterEntries()))
			.apply("Write to Dead Letter table in Bigtable", BigtableIO.write()
					.withBigtableOptions(bigtableOptions)
					.withTableId(IndexerPipelineUtils.DEAD_LETTER_TABLE));
	}
	
	return indexes;
}
 
Example 9
Source File: WithFailures.java    From beam with Apache License 2.0
public static <OutputElementT, FailureElementT>
    Result<PCollection<OutputElementT>, FailureElementT> of(
        PCollectionTuple tuple,
        TupleTag<OutputElementT> outputTag,
        TupleTag<FailureElementT> failureTag) {
  return new AutoValue_WithFailures_Result<>(
      tuple.get(outputTag), outputTag, tuple.get(failureTag), failureTag);
}
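For context, a WithFailures.Result built this way is normally consumed through its output() and failures() accessors. The sketch below shows one common way such a result is produced and read in user code; the parsing logic and element types are illustrative assumptions, not taken from this source file.

// input is assumed to be a PCollection<String>, used only for illustration.
Result<PCollection<Integer>, String> parseResult =
    input.apply(
        "ParseWithFailures",
        MapElements.into(TypeDescriptors.integers())
            .via((String s) -> Integer.parseInt(s))
            .exceptionsInto(TypeDescriptors.strings())
            .exceptionsVia(ee -> ee.element()));

PCollection<Integer> parsed = parseResult.output();      // successfully parsed elements
PCollection<String> badInputs = parseResult.failures();  // inputs whose parse threw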
 
Example 10
Source File: WriteTables.java    From beam with Apache License 2.0
@Override
public PCollection<KV<TableDestination, String>> expand(
    PCollection<KV<ShardedKey<DestinationT>, List<String>>> input) {
  PCollectionTuple writeTablesOutputs =
      input.apply(
          ParDo.of(new WriteTablesDoFn())
              .withSideInputs(sideInputs)
              .withOutputTags(mainOutputTag, TupleTagList.of(temporaryFilesTag)));

  // Garbage collect temporary files.
  // We mustn't start garbage collecting files until we are assured that the WriteTablesDoFn has
  // succeeded in loading those files and won't be retried. Otherwise, we might fail part of the
  // way through deleting temporary files, and retry WriteTablesDoFn. This will then fail due
  // to missing files, causing either the entire workflow to fail or get stuck (depending on how
  // the runner handles persistent failures).
  writeTablesOutputs
      .get(temporaryFilesTag)
      .setCoder(StringUtf8Coder.of())
      .apply(WithKeys.of((Void) null))
      .setCoder(KvCoder.of(VoidCoder.of(), StringUtf8Coder.of()))
      .apply(
          Window.<KV<Void, String>>into(new GlobalWindows())
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
              .discardingFiredPanes())
      .apply(GroupByKey.create())
      .apply(Values.create())
      .apply(ParDo.of(new GarbageCollectTemporaryFiles()));

  return writeTablesOutputs.get(mainOutputTag);
}
 
Example 11
Source File: SpannerIO.java    From beam with Apache License 2.0
@Override
public SpannerWriteResult expand(PCollection<MutationGroup> input) {
  PCollection<Iterable<MutationGroup>> batches;

  if (spec.getBatchSizeBytes() <= 1
      || spec.getMaxNumMutations() <= 1
      || spec.getMaxNumRows() <= 1) {
    LOG.info("Batching of mutationGroups is disabled");
    TypeDescriptor<Iterable<MutationGroup>> descriptor =
        new TypeDescriptor<Iterable<MutationGroup>>() {};
    batches =
        input.apply(MapElements.into(descriptor).via(element -> ImmutableList.of(element)));
  } else {

    // First, read the Cloud Spanner schema.
    PCollection<Void> schemaSeed =
        input.getPipeline().apply("Create Seed", Create.of((Void) null));
    if (spec.getSchemaReadySignal() != null) {
      // Wait for external signal before reading schema.
      schemaSeed = schemaSeed.apply("Wait for schema", Wait.on(spec.getSchemaReadySignal()));
    }
    final PCollectionView<SpannerSchema> schemaView =
        schemaSeed
            .apply(
                "Read information schema",
                ParDo.of(new ReadSpannerSchema(spec.getSpannerConfig())))
            .apply("Schema View", View.asSingleton());

    // Split the mutations into batchable and unbatchable mutations.
    // Filter out mutation groups too big to be batched.
    PCollectionTuple filteredMutations =
        input
            .apply(
                "RewindowIntoGlobal",
                Window.<MutationGroup>into(new GlobalWindows())
                    .triggering(DefaultTrigger.of())
                    .discardingFiredPanes())
            .apply(
                "Filter Unbatchable Mutations",
                ParDo.of(
                        new BatchableMutationFilterFn(
                            schemaView,
                            UNBATCHABLE_MUTATIONS_TAG,
                            spec.getBatchSizeBytes(),
                            spec.getMaxNumMutations(),
                            spec.getMaxNumRows()))
                    .withSideInputs(schemaView)
                    .withOutputTags(
                        BATCHABLE_MUTATIONS_TAG, TupleTagList.of(UNBATCHABLE_MUTATIONS_TAG)));

    // Build a set of Mutation groups from the current bundle,
    // sort them by table/key then split into batches.
    PCollection<Iterable<MutationGroup>> batchedMutations =
        filteredMutations
            .get(BATCHABLE_MUTATIONS_TAG)
            .apply(
                "Gather Sort And Create Batches",
                ParDo.of(
                        new GatherSortCreateBatchesFn(
                            spec.getBatchSizeBytes(),
                            spec.getMaxNumMutations(),
                            spec.getMaxNumRows(),
                            // Do not group on streaming unless explicitly set.
                            spec.getGroupingFactor()
                                .orElse(
                                    input.isBounded() == IsBounded.BOUNDED
                                        ? DEFAULT_GROUPING_FACTOR
                                        : 1),
                            schemaView))
                    .withSideInputs(schemaView));

    // Merge the batched and unbatchable mutation PCollections and write to Spanner.
    batches =
        PCollectionList.of(filteredMutations.get(UNBATCHABLE_MUTATIONS_TAG))
            .and(batchedMutations)
            .apply("Merge", Flatten.pCollections());
  }

  PCollectionTuple result =
      batches.apply(
          "Write batches to Spanner",
          ParDo.of(
                  new WriteToSpannerFn(
                      spec.getSpannerConfig(), spec.getFailureMode(), FAILED_MUTATIONS_TAG))
              .withOutputTags(MAIN_OUT_TAG, TupleTagList.of(FAILED_MUTATIONS_TAG)));

  return new SpannerWriteResult(
      input.getPipeline(),
      result.get(MAIN_OUT_TAG),
      result.get(FAILED_MUTATIONS_TAG),
      FAILED_MUTATIONS_TAG);
}
 
Example 12
Source File: OpinionAnalysisPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * This function creates the DAG of transforms. It can be called from main()
 * as well as from the ControlPipeline.
 * @param options the indexer pipeline options
 * @return the constructed Pipeline, ready to run
 * @throws Exception if the pipeline options fail validation
 */
public static Pipeline createNLPPipeline(IndexerPipelineOptions options) throws Exception {
	
    IndexerPipelineUtils.validateIndexerPipelineOptions(options);
	Pipeline pipeline = Pipeline.create(options);
	
	PCollection<InputContent> readContent;
	PCollection<String> rawInput;
	
	if (options.isStreaming()) {
		
		// Continuously read from a Pub/Sub topic
		rawInput = pipeline.apply("Read from PubSub", 
			PubsubIO.readStrings().fromTopic(
				options.getPubsubTopic())); 
		
	
	} else {
		// Read from GCS files

		rawInput = pipeline.apply("Read from GCS files", 
			Read.from(new RecordFileSource<String>(
				ValueProvider.StaticValueProvider.of(options.getInputFile()), 
				StringUtf8Coder.of(), 
				RecordFileSource.DEFAULT_RECORD_SEPARATOR)));
	}

	readContent = rawInput.apply(ParDo.of(new ParseRawInput()));
	
	// Extract opinions from online opinions
	PCollection<ContentIndexSummary> indexes = readContent
		.apply(ParDo.of(new IndexDocument())) 
		.setCoder(AvroCoder.of(ContentIndexSummary.class));
	

	// Write into BigQuery 
	PCollectionTuple bqrows= indexes
		.apply(ParDo.of(new CreateTableRowsFromIndexSummaryFn())
			.withOutputTags(webresourceTag, // main output collection
				TupleTagList.of(documentTag).and(sentimentTag)) // 2 side output collections
			); 
	
	PCollection<TableRow> webresourceRows = bqrows.get(webresourceTag);
	PCollection<TableRow> documentRows = bqrows.get(documentTag);
	PCollection<TableRow> sentimentRows = bqrows.get(sentimentTag);

	// Append or Overwrite
	WriteDisposition dispo = options.getWriteTruncate() ? 
			WriteDisposition.WRITE_TRUNCATE: WriteDisposition.WRITE_APPEND; 
	
		
	webresourceRows
		.apply("Write to webresource", 
			BigQueryIO.writeTableRows()
				.to(getWebResourceTableReference(options)) 
				.withSchema(getWebResourceSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo)); 
	
	documentRows
		.apply("Write to document", 
			BigQueryIO.writeTableRows()
				.to(getDocumentTableReference(options))
				.withSchema(getDocumentTableSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo)); 
	
	sentimentRows
		.apply("Write to sentiment", 
			BigQueryIO.writeTableRows()
				.to(getSentimentTableReference(options)) 
				.withSchema(getSentimentSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo));

	
	return pipeline;
}
 
Example 13
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Writes the webresource, document, and sentiment rows to their BigQuery tables.
 * @param bqrows a PCollectionTuple containing webresource, document, and sentiment table rows
 * @param webresourceRowsUnindexed webresource rows for unindexed content
 * @param webresourceDeduped webresource rows produced by deduplication (may be null)
 * @param options the pipeline options controlling write disposition and streaming mode
 */
private static void writeAllTablesToBigQuery(PCollectionTuple bqrows,
		PCollection<TableRow> webresourceRowsUnindexed, PCollection<TableRow> webresourceDeduped,
		IndexerPipelineOptions options) {
	PCollection<TableRow> webresourceRows = bqrows.get(PipelineTags.webresourceTag);
	PCollection<TableRow> documentRows = bqrows.get(PipelineTags.documentTag);
	PCollection<TableRow> sentimentRows = bqrows.get(PipelineTags.sentimentTag);

	// Now write to BigQuery
	WriteDisposition dispo = options.getWriteTruncate() ? 
		WriteDisposition.WRITE_TRUNCATE: WriteDisposition.WRITE_APPEND; 
	
	//Merge all collections with WebResource table records
	PCollectionList<TableRow> webresourceRowsList = (webresourceDeduped == null) ?
		PCollectionList.of(webresourceRows).and(webresourceRowsUnindexed) :
		PCollectionList.of(webresourceRows).and(webresourceRowsUnindexed).and(webresourceDeduped);
			
	PCollection<TableRow> allWebresourceRows = 
		webresourceRowsList.apply(Flatten.<TableRow>pCollections());
			
	allWebresourceRows = !options.isStreaming() ? 
		allWebresourceRows.apply("Reshuffle Webresources", new Reshuffle<TableRow>()) : 
		allWebresourceRows;
	
	allWebresourceRows
		.apply("Write to webresource", 
			BigQueryIO.writeTableRows()
				.to(getWebResourcePartitionedTableRef(options)) 
				.withSchema(getWebResourceSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo)); 
	
	documentRows = !options.isStreaming() ?
		documentRows.apply("Reshuffle Documents", new Reshuffle<TableRow>()):
		documentRows;
			
	documentRows
		.apply("Write to document", 
			BigQueryIO.writeTableRows()
				.to(getDocumentPartitionedTableRef(options))
				.withSchema(getDocumentTableSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo)); 
	
	sentimentRows = !options.isStreaming() ?
		sentimentRows.apply("Reshuffle Sentiments", new Reshuffle<TableRow>()):
		sentimentRows;
			
	sentimentRows
		.apply("Write to sentiment", 
			BigQueryIO.writeTableRows()
				.to(getSentimentPartitionedTableRef(options)) 
				.withSchema(getSentimentSchema())
				.withCreateDisposition(CreateDisposition.CREATE_NEVER)
				.withWriteDisposition(dispo));
}
 
Example 14
Source File: PubsubMessageToRowTest.java    From beam with Apache License 2.0
@Test
public void testSendsInvalidToDLQ() {
  Schema payloadSchema = Schema.builder().addInt32Field("id").addStringField("name").build();

  Schema messageSchema =
      Schema.builder()
          .addDateTimeField("event_timestamp")
          .addMapField("attributes", VARCHAR, VARCHAR)
          .addRowField("payload", payloadSchema)
          .build();

  PCollectionTuple outputs =
      pipeline
          .apply(
              "create",
              Create.timestamped(
                  message(1, map("attr1", "val1"), "{ \"invalid1\" : \"sdfsd\" }"),
                  message(2, map("attr2", "val2"), "{ \"invalid2"),
                  message(3, map("attr", "val"), "{ \"id\" : 3, \"name\" : \"foo\" }"),
                  message(4, map("bttr", "vbl"), "{ \"name\" : \"baz\", \"id\" : 5 }")))
          .apply(
              "convert",
              PubsubMessageToRow.builder()
                  .messageSchema(messageSchema)
                  .useDlq(true)
                  .useFlatSchema(false)
                  .build());

  PCollection<Row> rows = outputs.get(MAIN_TAG);
  PCollection<PubsubMessage> dlqMessages = outputs.get(DLQ_TAG);

  PAssert.that(dlqMessages)
      .satisfies(
          messages -> {
            assertEquals(2, size(messages));
            assertEquals(
                ImmutableSet.of(map("attr1", "val1"), map("attr2", "val2")),
                convertToSet(messages, m -> m.getAttributeMap()));

            assertEquals(
                ImmutableSet.of("{ \"invalid1\" : \"sdfsd\" }", "{ \"invalid2"),
                convertToSet(messages, m -> new String(m.getPayload(), UTF_8)));
            return null;
          });

  PAssert.that(rows)
      .containsInAnyOrder(
          Row.withSchema(messageSchema)
              .addValues(ts(3), map("attr", "val"), row(payloadSchema, 3, "foo"))
              .build(),
          Row.withSchema(messageSchema)
              .addValues(ts(4), map("bttr", "vbl"), row(payloadSchema, 5, "baz"))
              .build());

  pipeline.run();
}
 
Example 15
Source File: PubsubMessageToRowTest.java    From beam with Apache License 2.0
@Test
public void testSendsFlatRowInvalidToDLQ() {
  Schema messageSchema =
      Schema.builder()
          .addDateTimeField("event_timestamp")
          .addInt32Field("id")
          .addStringField("name")
          .build();

  PCollectionTuple outputs =
      pipeline
          .apply(
              "create",
              Create.timestamped(
                  message(1, map("attr1", "val1"), "{ \"invalid1\" : \"sdfsd\" }"),
                  message(2, map("attr2", "val2"), "{ \"invalid2"),
                  message(3, map("attr", "val"), "{ \"id\" : 3, \"name\" : \"foo\" }"),
                  message(4, map("bttr", "vbl"), "{ \"name\" : \"baz\", \"id\" : 5 }")))
          .apply(
              "convert",
              PubsubMessageToRow.builder()
                  .messageSchema(messageSchema)
                  .useDlq(true)
                  .useFlatSchema(true)
                  .build());

  PCollection<Row> rows = outputs.get(MAIN_TAG);
  PCollection<PubsubMessage> dlqMessages = outputs.get(DLQ_TAG);

  PAssert.that(dlqMessages)
      .satisfies(
          messages -> {
            assertEquals(2, size(messages));
            assertEquals(
                ImmutableSet.of(map("attr1", "val1"), map("attr2", "val2")),
                convertToSet(messages, m -> m.getAttributeMap()));

            assertEquals(
                ImmutableSet.of("{ \"invalid1\" : \"sdfsd\" }", "{ \"invalid2"),
                convertToSet(messages, m -> new String(m.getPayload(), UTF_8)));
            return null;
          });

  PAssert.that(rows)
      .containsInAnyOrder(
          Row.withSchema(messageSchema)
              .addValues(ts(3), /* map("attr", "val"), */ 3, "foo")
              .build(),
          Row.withSchema(messageSchema)
              .addValues(ts(4), /* map("bttr", "vbl"), */ 5, "baz")
              .build());

  pipeline.run();
}
 
Example 16
Source File: LocalSpannerIO.java    From DataflowTemplates with Apache License 2.0
@Override
public SpannerWriteResult expand(PCollection<MutationGroup> input) {
  PCollection<Iterable<MutationGroup>> batches;

  if (spec.getBatchSizeBytes() <= 1
      || spec.getMaxNumMutations() <= 1
      || spec.getMaxNumRows() <= 1) {
    LOG.info("Batching of mutationGroups is disabled");
    TypeDescriptor<Iterable<MutationGroup>> descriptor =
        new TypeDescriptor<Iterable<MutationGroup>>() {};
    batches =
        input.apply(MapElements.into(descriptor).via(element -> ImmutableList.of(element)));
  } else {

    // First, read the Cloud Spanner schema.
    PCollection<Void> schemaSeed =
        input.getPipeline().apply("Create Seed", Create.of((Void) null));
    if (spec.getSchemaReadySignal() != null) {
      // Wait for external signal before reading schema.
      schemaSeed = schemaSeed.apply("Wait for schema", Wait.on(spec.getSchemaReadySignal()));
    }
    final PCollectionView<SpannerSchema> schemaView =
        schemaSeed
            .apply(
                "Read information schema",
                ParDo.of(new LocalReadSpannerSchema(spec.getSpannerConfig())))
            .apply("Schema View", View.asSingleton());

    // Split the mutations into batchable and unbatchable mutations.
    // Filter out mutation groups too big to be batched.
    PCollectionTuple filteredMutations =
        input
            .apply(
                "RewindowIntoGlobal",
                Window.<MutationGroup>into(new GlobalWindows())
                    .triggering(DefaultTrigger.of())
                    .discardingFiredPanes())
            .apply(
                "Filter Unbatchable Mutations",
                ParDo.of(
                        new BatchableMutationFilterFn(
                            schemaView,
                            UNBATCHABLE_MUTATIONS_TAG,
                            spec.getBatchSizeBytes(),
                            spec.getMaxNumMutations(),
                            spec.getMaxNumRows()))
                    .withSideInputs(schemaView)
                    .withOutputTags(
                        BATCHABLE_MUTATIONS_TAG, TupleTagList.of(UNBATCHABLE_MUTATIONS_TAG)));

    // Build a set of Mutation groups from the current bundle,
    // sort them by table/key then split into batches.
    PCollection<Iterable<MutationGroup>> batchedMutations =
        filteredMutations
            .get(BATCHABLE_MUTATIONS_TAG)
            .apply(
                "Gather And Sort",
                ParDo.of(
                        new GatherBundleAndSortFn(
                            spec.getBatchSizeBytes(),
                            spec.getMaxNumMutations(),
                            spec.getMaxNumRows(),
                            // Do not group on streaming unless explicitly set.
                            spec.getGroupingFactor()
                                .orElse(
                                    input.isBounded() == IsBounded.BOUNDED
                                        ? DEFAULT_GROUPING_FACTOR
                                        : 1),
                            schemaView))
                    .withSideInputs(schemaView))
            .apply(
                "Create Batches",
                ParDo.of(
                        new BatchFn(
                            spec.getBatchSizeBytes(),
                            spec.getMaxNumMutations(),
                            spec.getMaxNumRows(),
                            schemaView))
                    .withSideInputs(schemaView));

    // Merge the batched and unbatchable mutation PCollections and write to Spanner.
    batches =
        PCollectionList.of(filteredMutations.get(UNBATCHABLE_MUTATIONS_TAG))
            .and(batchedMutations)
            .apply("Merge", Flatten.pCollections());
  }

  PCollectionTuple result =
      batches.apply(
          "Write batches to Spanner",
          ParDo.of(
                  new WriteToSpannerFn(
                      spec.getSpannerConfig(), spec.getFailureMode(), FAILED_MUTATIONS_TAG))
              .withOutputTags(MAIN_OUT_TAG, TupleTagList.of(FAILED_MUTATIONS_TAG)));

  return new SpannerWriteResult(
      input.getPipeline(),
      result.get(MAIN_OUT_TAG),
      result.get(FAILED_MUTATIONS_TAG),
      FAILED_MUTATIONS_TAG);
}
 
Example 17
Source File: BatchViewOverrides.java    From beam with Apache License 2.0
private static <K, V, W extends BoundedWindow, ViewT> PCollection<?> applyForMapLike(
    DataflowRunner runner,
    PCollection<KV<K, V>> input,
    PCollectionView<ViewT> view,
    boolean uniqueKeysExpected)
    throws NonDeterministicException {

  @SuppressWarnings("unchecked")
  Coder<W> windowCoder = (Coder<W>) input.getWindowingStrategy().getWindowFn().windowCoder();

  @SuppressWarnings({"rawtypes", "unchecked"})
  KvCoder<K, V> inputCoder = (KvCoder) input.getCoder();

  // If our key coder is deterministic, we can use the key portion of each KV
  // part of a composite key containing the window , key and index.
  inputCoder.getKeyCoder().verifyDeterministic();

  IsmRecordCoder<WindowedValue<V>> ismCoder =
      coderForMapLike(windowCoder, inputCoder.getKeyCoder(), inputCoder.getValueCoder());

  // Create the various output tags representing the main output containing the data stream
  // and the additional outputs containing the metadata about the size and entry set.
  TupleTag<IsmRecord<WindowedValue<V>>> mainOutputTag = new TupleTag<>();
  TupleTag<KV<Integer, KV<W, Long>>> outputForSizeTag = new TupleTag<>();
  TupleTag<KV<Integer, KV<W, K>>> outputForEntrySetTag = new TupleTag<>();

  // Process all the elements grouped by key hash, and sorted by key and then window
  // outputting to all the outputs defined above.
  PCollectionTuple outputTuple =
      input
          .apply("GBKaSVForData", new GroupByKeyHashAndSortByKeyAndWindow<K, V, W>(ismCoder))
          .apply(
              ParDo.of(
                      new ToIsmRecordForMapLikeDoFn<>(
                          outputForSizeTag,
                          outputForEntrySetTag,
                          windowCoder,
                          inputCoder.getKeyCoder(),
                          ismCoder,
                          uniqueKeysExpected))
                  .withOutputTags(
                      mainOutputTag,
                      TupleTagList.of(
                          ImmutableList.of(outputForSizeTag, outputForEntrySetTag))));

  // Set the coder on the main data output.
  PCollection<IsmRecord<WindowedValue<V>>> perHashWithReifiedWindows =
      outputTuple.get(mainOutputTag);
  perHashWithReifiedWindows.setCoder(ismCoder);

  // Set the coder on the metadata output for size and process the entries
  // producing a [META, Window, 0L] record per window storing the number of unique keys
  // for each window.
  PCollection<KV<Integer, KV<W, Long>>> outputForSize = outputTuple.get(outputForSizeTag);
  outputForSize.setCoder(
      KvCoder.of(VarIntCoder.of(), KvCoder.of(windowCoder, VarLongCoder.of())));
  PCollection<IsmRecord<WindowedValue<V>>> windowMapSizeMetadata =
      outputForSize
          .apply("GBKaSVForSize", new GroupByKeyAndSortValuesOnly<>())
          .apply(ParDo.of(new ToIsmMetadataRecordForSizeDoFn<K, V, W>(windowCoder)));
  windowMapSizeMetadata.setCoder(ismCoder);

  // Set the coder on the metadata output destined to build the entry set and process the
  // entries producing a [META, Window, Index] record per window key pair storing the key.
  PCollection<KV<Integer, KV<W, K>>> outputForEntrySet = outputTuple.get(outputForEntrySetTag);
  outputForEntrySet.setCoder(
      KvCoder.of(VarIntCoder.of(), KvCoder.of(windowCoder, inputCoder.getKeyCoder())));
  PCollection<IsmRecord<WindowedValue<V>>> windowMapKeysMetadata =
      outputForEntrySet
          .apply("GBKaSVForKeys", new GroupByKeyAndSortValuesOnly<>())
          .apply(
              ParDo.of(
                  new ToIsmMetadataRecordForKeyDoFn<K, V, W>(
                      inputCoder.getKeyCoder(), windowCoder)));
  windowMapKeysMetadata.setCoder(ismCoder);

  // Set that all these outputs should be materialized using an indexed format.
  runner.addPCollectionRequiringIndexedFormat(perHashWithReifiedWindows);
  runner.addPCollectionRequiringIndexedFormat(windowMapSizeMetadata);
  runner.addPCollectionRequiringIndexedFormat(windowMapKeysMetadata);

  PCollectionList<IsmRecord<WindowedValue<V>>> outputs =
      PCollectionList.of(
          ImmutableList.of(
              perHashWithReifiedWindows, windowMapSizeMetadata, windowMapKeysMetadata));

  PCollection<IsmRecord<WindowedValue<V>>> flattenedOutputs =
      Pipeline.applyTransform(outputs, Flatten.pCollections());
  flattenedOutputs.apply(CreateDataflowView.forBatch(view));
  return flattenedOutputs;
}
 
Example 18
Source File: PipelineTest.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollectionTuple input) {
  return input.get(tag);
}
 
Example 19
Source File: StreamingWriteTables.java    From beam with Apache License 2.0
private <T> PCollection<T> writeAndGetErrors(
    PCollection<KV<TableDestination, ElementT>> input,
    TupleTag<T> failedInsertsTag,
    AtomicCoder<T> coder,
    ErrorContainer<T> errorContainer) {
  BigQueryOptions options = input.getPipeline().getOptions().as(BigQueryOptions.class);
  int numShards = options.getNumStreamingKeys();

  // A naive implementation would be to simply stream data directly to BigQuery.
  // However, this could occasionally lead to duplicated data, e.g., when
  // a VM that runs this code is restarted and the code is re-run.

  // The above risk is mitigated in this implementation by relying on
  // BigQuery built-in best effort de-dup mechanism.

  // To use this mechanism, each input TableRow is tagged with a generated
  // unique id, which is then passed to BigQuery and used to ignore duplicates
  // We create 50 keys per BigQuery table to generate output on. This is few enough that we
  // get good batching into BigQuery's insert calls, and enough that we can max out the
  // streaming insert quota.
  PCollection<KV<ShardedKey<String>, TableRowInfo<ElementT>>> tagged =
      input
          .apply("ShardTableWrites", ParDo.of(new GenerateShardedTable<>(numShards)))
          .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), elementCoder))
          .apply("TagWithUniqueIds", ParDo.of(new TagWithUniqueIds<>()))
          .setCoder(
              KvCoder.of(
                  ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of(elementCoder)));

  TupleTag<Void> mainOutputTag = new TupleTag<>("mainOutput");

  // To prevent having the same TableRow processed more than once with regenerated
  // different unique ids, this implementation relies on "checkpointing", which is
  // achieved as a side effect of having StreamingWriteFn immediately follow a GBK,
  // performed by Reshuffle.
  PCollectionTuple tuple =
      tagged
          .apply(Reshuffle.of())
          // Put in the global window to ensure that DynamicDestinations side inputs are accessed
          // correctly.
          .apply(
              "GlobalWindow",
              Window.<KV<ShardedKey<String>, TableRowInfo<ElementT>>>into(new GlobalWindows())
                  .triggering(DefaultTrigger.of())
                  .discardingFiredPanes())
          .apply(
              "StreamingWrite",
              ParDo.of(
                      new StreamingWriteFn<>(
                          bigQueryServices,
                          retryPolicy,
                          failedInsertsTag,
                          errorContainer,
                          skipInvalidRows,
                          ignoreUnknownValues,
                          ignoreInsertIds,
                          toTableRow))
                  .withOutputTags(mainOutputTag, TupleTagList.of(failedInsertsTag)));
  PCollection<T> failedInserts = tuple.get(failedInsertsTag);
  failedInserts.setCoder(coder);
  return failedInserts;
}
 
Example 20
Source File: FhirIO.java    From beam with Apache License 2.0
private Result(PCollectionTuple pct) {
  this.pct = pct;
  this.resources = pct.get(OUT);
  this.failedReads =
      pct.get(DEAD_LETTER).setCoder(HealthcareIOErrorCoder.of(StringUtf8Coder.of()));
}