org.apache.beam.sdk.transforms.View Java Examples

The following examples show how to use org.apache.beam.sdk.transforms.View. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0 6 votes vote down vote up
/**
 * Runs {@code FilterProcessedUrls} over the content that was read, handing it the URLs
 * returned by the "processed URLs" BigQuery query as a map side input.
 *
 * @param readContent content the filter is applied to
 * @param pipeline pipeline on which the BigQuery read is applied
 * @param options options from which the BigQuery query is built
 * @return the output of the {@code FilterProcessedUrls} step
 */
private static PCollection<InputContent> filterAlreadyProcessedUrls(
		PCollection<InputContent> readContent, Pipeline pipeline,
		IndexerPipelineOptions options) {
	String processedUrlsQuery = IndexerPipelineUtils.buildBigQueryProcessedUrlsQuery(options);

	// Query result -> KV pairs (GetUrlFn) -> map view usable as a side input.
	final PCollectionView<Map<String,Long>> processedUrlsView = pipeline
		.apply("Get processed URLs",BigQueryIO.read().fromQuery(processedUrlsQuery))
		.apply(ParDo.of(new GetUrlFn()))
		.apply(View.<String,Long>asMap());

	// The same view is passed to the DoFn and registered on the ParDo.
	return readContent.apply(
		ParDo.of(new FilterProcessedUrls(processedUrlsView))
			.withSideInputs(processedUrlsView));
}
 
Example #2
Source File: Write.java    From components with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a singleton view holding the value of {@code numShards}.
 *
 * <p>A one-element collection is created on the input's pipeline; the single element triggers a
 * DoFn that emits {@code numShards.get()}, and that output is turned into a singleton view.
 */
@Override
public PCollectionView<Integer> expand(PCollection<T> input) {
    // Fails at processing time when numShards reports that it is not accessible.
    DoFn<Integer, Integer> emitNumShards =
            new DoFn<Integer, Integer>() {
                @ProcessElement
                public void outputNumShards(ProcessContext ctxt) {
                    checkArgument(
                            numShards.isAccessible(),
                            "NumShards must be accessible at runtime to use constant sharding");
                    ctxt.output(numShards.get());
                }
            };

    return input
            .getPipeline()
            .apply(Create.of(0))
            .apply("FixedNumShards", ParDo.of(emitNumShards))
            .apply(View.<Integer>asSingleton());
}
 
Example #3
Source File: DLPReidentifyTextTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that building a {@code DLPReidentifyText} with header columns set, and no column
 * delimiter set on the builder, throws an {@link IllegalArgumentException}.
 */
@Test
public void throwsExceptionWhenDelimiterIsNullAndHeadersAreSet() {
  // List view used as the header side input handed to the builder.
  PCollectionView<List<String>> header =
      testPipeline.apply(Create.of("header")).apply(View.asList());
  // The first argument is the failure message reported if no exception is thrown.
  assertThrows(
      "Column delimiter should be set if headers are present.",
      IllegalArgumentException.class,
      () ->
          DLPReidentifyText.newBuilder()
              .setProjectId(PROJECT_ID)
              .setBatchSizeBytes(BATCH_SIZE_SMALL)
              .setReidentifyTemplateName(TEMPLATE_NAME)
              .setHeaderColumns(header)
              .build());
  // The pipeline is still run after the builder check.
  testPipeline.run().waitUntilFinish();
}
 
Example #4
Source File: DataflowPTransformMatchersTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a pipeline in which a {@link Combine.GroupedValues} transform that takes a singleton side
 * input is applied to the result of a {@code GroupByKey}. Abandoned-node enforcement is
 * switched off on the returned pipeline.
 */
private static TestPipeline createCombineGroupedValuesWithSideInputsPipeline() {
  TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false);

  // Main input: a single key/value pair with an explicit KV coder.
  PCollection<KV<String, Integer>> keyedInput =
      p.apply(Create.of(KV.of("key", 1)))
          .setCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()));

  // Side input: one string exposed as a singleton view.
  PCollectionView<String> singletonSideInput =
      p.apply(Create.of("side input")).apply(View.asSingleton());

  PCollection<KV<String, Iterable<Integer>>> grouped = keyedInput.apply(GroupByKey.create());
  grouped.apply(
      Combine.<String, Integer, Integer>groupedValues(new SumCombineFnWithContext())
          .withSideInputs(singletonSideInput));

  return p;
}
 
Example #5
Source File: DataflowPipelineTranslator.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Translates a {@link View.CreatePCollectionView} into a step of kind "CollectionToSingleton".
 *
 * <p>The step is given the transform's input as its parallel input, the input's serialized
 * windowing strategy as a JSON string, a flag that is true when the window function is not
 * non-merging, and a collection-to-singleton output bound to the transform's view.
 *
 * @param transform the view-creating transform being translated
 * @param context translation context providing the step, the input and the pipeline options
 */
private <ElemT, ViewT> void translateTyped(
    View.CreatePCollectionView<ElemT, ViewT> transform, TranslationContext context) {
  StepTranslationContext stepContext =
      context.addStep(transform, "CollectionToSingleton");
  PCollection<ElemT> input = context.getInput(transform);
  stepContext.addInput(PropertyNames.PARALLEL_INPUT, input);
  WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
  // The windowing strategy is serialized and then converted to a JSON string.
  stepContext.addInput(
      PropertyNames.WINDOWING_STRATEGY,
      byteArrayToJsonString(
          serializeWindowingStrategy(windowingStrategy, context.getPipelineOptions())));
  // "Is merging" is expressed as the negation of isNonMerging().
  stepContext.addInput(
      PropertyNames.IS_MERGING_WINDOW_FN,
      !windowingStrategy.getWindowFn().isNonMerging());
  stepContext.addCollectionToSingletonOutput(
      input, PropertyNames.OUTPUT, transform.getView());
}
 
Example #6
Source File: HadoopFormatIO.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Produces the {@link PCollectionView} that carries the {@link Configuration}.
 *
 * <p>When no {@code configuration} is set, the view is obtained by applying
 * {@code configTransform} to the input. Otherwise a one-element collection holding the
 * configuration is created on the input's pipeline and exposed as a singleton view whose default
 * value is that same configuration.
 *
 * @param input input data
 * @return PCollectionView with single {@link Configuration}
 * @see Builder#withConfiguration(Configuration)
 * @see Builder#withConfigurationTransform(PTransform)
 */
private PCollectionView<Configuration> createConfigurationView(
    PCollection<KV<KeyT, ValueT>> input) {

  if (configuration == null) {
    // No static configuration: derive it from the data.
    return input.apply("TransformDataIntoConfig", configTransform);
  }

  return input
      .getPipeline()
      .apply("CreateOutputConfig", Create.<Configuration>of(configuration))
      .apply(View.<Configuration>asSingleton().withDefaultValue(configuration));
}
 
Example #7
Source File: BigQueryIOReadTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Runs a pipeline containing {@code PassThroughThenCleanup} whose cleanup operation throws, and
 * expects that exception ("cleanup executed") to surface from the pipeline run.
 */
@Test
public void testPassThroughThenCleanupExecuted() throws Exception {

  // Main input is empty; the cleanup operation and a singleton view are passed to the transform.
  p.apply(Create.empty(VarIntCoder.of()))
      .apply(
          new PassThroughThenCleanup<>(
              new PassThroughThenCleanup.CleanupOperation() {
                @Override
                void cleanup(PassThroughThenCleanup.ContextContainer c) throws Exception {
                  throw new RuntimeException("cleanup executed");
                }
              },
              p.apply("Create1", Create.of("")).apply(View.asSingleton())));

  // Expectations are registered before the run, which is the statement expected to throw.
  thrown.expect(RuntimeException.class);
  thrown.expectMessage("cleanup executed");

  p.run();
}
 
Example #8
Source File: DirectGraphVisitorTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that, after traversal, the visitor's graph reports exactly the two views created in
 * this test (a list view and a singleton view), in any order.
 */
@Test
public void getViewsReturnsViews() {
  // List view over the string lengths of "foo" and "bar".
  PCollectionView<List<String>> listView =
      p.apply("listCreate", Create.of("foo", "bar"))
          .apply(
              ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void processElement(DoFn<String, String>.ProcessContext c)
                        throws Exception {
                      c.output(Integer.toString(c.element().length()));
                    }
                  }))
          .apply(View.asList());
  PCollectionView<Object> singletonView =
      p.apply("singletonCreate", Create.<Object>of(1, 2, 3)).apply(View.asSingleton());
  // Apply the DirectRunner's default transform overrides before visiting the pipeline.
  p.replaceAll(
      DirectRunner.fromOptions(TestPipeline.testingPipelineOptions())
          .defaultTransformOverrides());
  p.traverseTopologically(visitor);
  assertThat(visitor.getGraph().getViews(), Matchers.containsInAnyOrder(listView, singletonView));
}
 
Example #9
Source File: SideInputContainerTest.java    From beam with Apache License 2.0 6 votes vote down vote up
/**
 * Writes a value that belongs to two windows at once and checks that the singleton view reads
 * back as 2.875 in each of those windows.
 */
@Test
public void writeForElementInMultipleWindowsSucceeds() throws Exception {
  ImmutableList.Builder<WindowedValue<?>> valuesBuilder = ImmutableList.builder();
  // Each materialized value is wrapped once, assigned to both FIRST_WINDOW and SECOND_WINDOW.
  for (Object materializedValue :
      materializeValuesFor(singletonView.getPipeline().getOptions(), View.asSingleton(), 2.875)) {
    valuesBuilder.add(
        WindowedValue.of(
            materializedValue,
            FIRST_WINDOW.maxTimestamp().minus(200L),
            ImmutableList.of(FIRST_WINDOW, SECOND_WINDOW),
            PaneInfo.ON_TIME_AND_ONLY_FIRING));
  }
  container.write(singletonView, valuesBuilder.build());
  // A fresh reader is created for each window lookup.
  assertThat(
      container
          .createReaderForViews(ImmutableList.of(singletonView))
          .get(singletonView, FIRST_WINDOW),
      equalTo(2.875));
  assertThat(
      container
          .createReaderForViews(ImmutableList.of(singletonView))
          .get(singletonView, SECOND_WINDOW),
      equalTo(2.875));
}
 
Example #10
Source File: IndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code EliminateInputContentDupes} over the content to index, with the already processed
 * documents supplied as a map side input, and merges the items it tags as exact duplicates
 * with the items that were skipped earlier.
 *
 * <p>When {@code options.getWriteTruncate()} is false the processed documents come from a
 * BigQuery query; otherwise the side input is built from an empty map.
 *
 * @param contentToIndexNotSkipped content that is grouped by document hash and de-duplicated
 * @param contentNotToIndexSkipped content flattened together with the exact duplicates
 * @param pipeline pipeline on which the side input source is applied
 * @param options options controlling the source of the side input and the BigQuery query
 * @return holder built from the main output and the flattened "not to index" collection
 */
private static ContentToIndexOrNot filterAlreadyProcessedDocuments(
		PCollection<InputContent> contentToIndexNotSkipped, PCollection<InputContent> contentNotToIndexSkipped,
		Pipeline pipeline, IndexerPipelineOptions options) {
	final PCollection<KV<String,Long>> processedDocs;
	if (options.getWriteTruncate()) {
		// Feed an empty map into the side input instead of querying BigQuery.
		Map<String, Long> noDocs = new HashMap<String,Long>();
		processedDocs = pipeline.apply(
			"Create empty side input of Docs",
			Create.of(noDocs).withCoder(KvCoder.of(StringUtf8Coder.of(),VarLongCoder.of())));
	} else {
		String processedDocsQuery = IndexerPipelineUtils.buildBigQueryProcessedDocsQuery(options);
		processedDocs = pipeline
			.apply("Get already processed Documents",BigQueryIO.read().fromQuery(processedDocsQuery))
			.apply(ParDo.of(new GetDocumentHashFn()));
	}

	final PCollectionView<Map<String,Long>> processedDocsView =
		processedDocs.apply(View.<String,Long>asMap());

	// Key by document hash, group, then split into the main output and the exact-dupes output.
	PCollectionTuple dedupeOutputs = contentToIndexNotSkipped
		.apply("Extract DocumentHash key", ParDo.of(new GetInputContentDocumentHashFn()))
		.apply("Group by DocumentHash key", GroupByKey.<String, InputContent>create())
		.apply("Eliminate InputContent Dupes", ParDo.of(new EliminateInputContentDupes(processedDocsView))
			.withSideInputs(processedDocsView)
			.withOutputTags(PipelineTags.contentToIndexNotExactDupesTag,
				TupleTagList.of(PipelineTags.contentNotToIndexExactDupesTag)));

	PCollection<InputContent> toIndex = dedupeOutputs.get(PipelineTags.contentToIndexNotExactDupesTag);
	PCollection<InputContent> exactDupes = dedupeOutputs.get(PipelineTags.contentNotToIndexExactDupesTag);

	// Exact duplicates and previously skipped items end up in one collection.
	PCollection<InputContent> notToIndex = PCollectionList.of(exactDupes)
		.and(contentNotToIndexSkipped)
		.apply(Flatten.<InputContent>pCollections());

	return new ContentToIndexOrNot(toIndex, notToIndex);
}
 
Example #11
Source File: SideInputHandlerTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that {@code isReady} for a view turns true for a window only after a side input value
 * was added for that window, and stays false for a different window.
 */
@Test
public void testIsReady() {
  SideInputHandler sideInputHandler =
      new SideInputHandler(
          ImmutableList.of(view1, view2), InMemoryStateInternals.<Void>forKey(null));

  // Both windows start at instant 0 and differ only in their end.
  IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(WINDOW_MSECS_1));

  IntervalWindow secondWindow = new IntervalWindow(new Instant(0), new Instant(WINDOW_MSECS_2));

  // side input should not yet be ready
  assertFalse(sideInputHandler.isReady(view1, firstWindow));

  // add a value for view1
  sideInputHandler.addSideInputValue(
      view1,
      valuesInWindow(
          materializeValuesFor(view1.getPipeline().getOptions(), View.asIterable(), "Hello"),
          new Instant(0),
          firstWindow));

  // now side input should be ready
  assertTrue(sideInputHandler.isReady(view1, firstWindow));

  // second window input should still not be ready
  assertFalse(sideInputHandler.isReady(view1, secondWindow));
}
 
Example #12
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Translates a pipeline that creates a singleton view and pins the shape of the resulting job:
 * nine steps, the second-to-last with an indexed-format output and the last of kind
 * "CollectionToSingleton".
 */
@Test
public void testToSingletonTranslationWithIsmSideInput() throws Exception {
  // A "change detector" test that makes sure the translation
  // of getting a PCollectionView<T> does not change
  // in bad ways during refactor

  DataflowPipelineOptions options = buildPipelineOptions();
  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);

  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply(Create.of(1)).apply(View.asSingleton());
  // Runner-specific transform replacements are applied before translating.
  DataflowRunner runner = DataflowRunner.fromOptions(options);
  runner.replaceTransforms(pipeline);
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
  Job job =
      translator
          .translate(pipeline, pipelineProto, sdkComponents, runner, Collections.emptyList())
          .getJob();
  assertAllStepOutputsHaveUniqueIds(job);

  List<Step> steps = job.getSteps();
  assertEquals(9, steps.size());

  // Second-to-last step: its only output must be flagged "use_indexed_format".
  @SuppressWarnings("unchecked")
  List<Map<String, Object>> toIsmRecordOutputs =
      (List<Map<String, Object>>)
          steps.get(steps.size() - 2).getProperties().get(PropertyNames.OUTPUT_INFO);
  assertTrue(
      Structs.getBoolean(Iterables.getOnlyElement(toIsmRecordOutputs), "use_indexed_format"));

  // Last step: the view creation itself.
  Step collectionToSingletonStep = steps.get(steps.size() - 1);
  assertEquals("CollectionToSingleton", collectionToSingletonStep.getKind());
}
 
Example #13
Source File: Broadcast.java    From nemo with Apache License 2.0 5 votes vote down vote up
/**
 * Entry point of the Beam program.
 *
 * <p>Reads the lines of the input file, exposes all of them as an iterable side input, and for
 * every line writes that line followed by all lines joined with newlines. The pipeline is run
 * with {@code NemoPipelineRunner}; the result of {@code run()} is not waited on here.
 *
 * @param args {@code args[0]} is the input file path, {@code args[1]} the output file path.
 */
public static void main(final String[] args) {
  final String inputFilePath = args[0];
  final String outputFilePath = args[1];

  final PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(NemoPipelineRunner.class);
  final Pipeline p = Pipeline.create(options);

  final PCollection<String> elemCollection = GenericSourceSink.read(p, inputFilePath);
  final PCollectionView<Iterable<String>> allCollection = elemCollection.apply(View.<String>asIterable());

  final PCollection<String> result = elemCollection.apply(
      ParDo.of(new DoFn<String, String>() {
        @ProcessElement
        public void processElement(final ProcessContext c) {
          final String line = c.element();
          // Join every element of the side input with '\n'; empty when the side input is empty.
          final Optional<String> joined =
              StreamSupport.stream(c.sideInput(allCollection).spliterator(), false)
                  .reduce((l, r) -> l + '\n' + r);
          c.output(joined.map(everything -> "line: " + line + "\n" + everything).orElse("error"));
        }
      }).withSideInputs(allCollection));

  GenericSourceSink.write(result, outputFilePath);
  p.run();
}
 
Example #14
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Same translation check as for a singleton view, but with the "beam_fn_api" experiment set:
 * nine steps, the last of kind "CollectionToSingleton" with an indexed-format output.
 */
@Test
public void testToSingletonTranslationWithFnApiSideInput() throws Exception {
  // A "change detector" test that makes sure the translation
  // of getting a PCollectionView<T> does not change
  // in bad ways during refactor

  DataflowPipelineOptions options = buildPipelineOptions();
  options.setExperiments(Arrays.asList("beam_fn_api"));
  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);

  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply(Create.of(1)).apply(View.asSingleton());
  // Runner-specific transform replacements are applied before translating.
  DataflowRunner runner = DataflowRunner.fromOptions(options);
  runner.replaceTransforms(pipeline);
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
  Job job =
      translator
          .translate(pipeline, pipelineProto, sdkComponents, runner, Collections.emptyList())
          .getJob();
  assertAllStepOutputsHaveUniqueIds(job);

  List<Step> steps = job.getSteps();
  assertEquals(9, steps.size());

  Step collectionToSingletonStep = steps.get(steps.size() - 1);
  assertEquals("CollectionToSingleton", collectionToSingletonStep.getKind());

  // Here the indexed-format flag is checked on the last step's only output.
  @SuppressWarnings("unchecked")
  List<Map<String, Object>> ctsOutputs =
      (List<Map<String, Object>>)
          steps.get(steps.size() - 1).getProperties().get(PropertyNames.OUTPUT_INFO);
  assertTrue(Structs.getBoolean(Iterables.getOnlyElement(ctsOutputs), "use_indexed_format"));
}
 
Example #15
Source File: TextRowToMutationTest.java    From DataflowTemplates with Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code TextRowToMutation} on a single text row and expects the pipeline run to fail with
 * a {@code PipelineExecutionException}.
 */
@Test(expected = PipelineExecutionException.class)
public void parseRowToMutationTooManyColumns() throws Exception {
  // Side input 1: the test DDL as a singleton view.
  PCollectionView<Ddl> ddlView =
      pipeline.apply("ddl", Create.of(getTestDdl())).apply(View.asSingleton());
  // Side input 2: the table-columns map as a singleton view, with an explicit map coder.
  PCollectionView<Map<String, List<TableManifest.Column>>> tableColumnsMapView =
      pipeline
          .apply(
              "tableColumnsMap",
              Create.<Map<String, List<TableManifest.Column>>>of(getEmptyTableColumnsMap())
                  .withCoder(
                      MapCoder.of(
                          StringUtf8Coder.of(),
                          ListCoder.of(ProtoCoder.of(TableManifest.Column.class)))))
          .apply("Map as view", View.asSingleton());

  // One row keyed by table name; note the run of trailing delimiters.
  // NOTE(review): presumably this yields more fields than the test DDL defines — confirm.
  PCollection<KV<String, String>> input =
      pipeline.apply(
          "input",
          Create.of(KV.of(testTableName, "123,a string,yet another string,1.23,True,,,,,,,")));
  PCollection<Mutation> mutations =
      input.apply(
          ParDo.of(
                  new TextRowToMutation(
                      ddlView,
                      tableColumnsMapView,
                      columnDelimiter,
                      StaticValueProvider.of('"'),
                      trailingDelimiter,
                      escape,
                      nullString,
                      dateFormat,
                      timestampFormat))
              .withSideInputs(ddlView, tableColumnsMapView));

  pipeline.run();
}
 
Example #16
Source File: SpannerIO.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Validates the Spanner configuration and returns a singleton view produced by running
 * {@code CreateTransactionFn} over a one-element collection.
 */
@Override
public PCollectionView<Transaction> expand(PBegin input) {
  getSpannerConfig().validate();

  // The single created element only serves to trigger CreateTransactionFn.
  return input
      .apply(Create.of(1))
      .apply("Create transaction", ParDo.of(new CreateTransactionFn(this)))
      .apply("As PCollectionView", View.asSingleton());
}
 
Example #17
Source File: PipelineTranslationTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Supplies the pipelines used as test parameters: a trivial pipeline, a pipeline whose ParDo
 * reads a singleton side input, and a pipeline with a custom coder, windowing, triggering and a
 * {@code GroupByKey}.
 */
@Parameters(name = "{index}")
public static Iterable<Pipeline> testPipelines() {
  // 1) A single Create.
  Pipeline trivialPipeline = Pipeline.create();
  trivialPipeline.apply(Create.of(1, 2, 3));

  // 2) A ParDo that declares and reads a singleton side input.
  Pipeline sideInputPipeline = Pipeline.create();
  final PCollectionView<String> singletonView =
      sideInputPipeline.apply(Create.of("foo")).apply(View.asSingleton());
  sideInputPipeline
      .apply(Create.of("main input"))
      .apply(
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void process(ProcessContext c) {
                      // actually never executed and no effect on translation
                      c.sideInput(singletonView);
                    }
                  })
              .withSideInputs(singletonView));

  // 3) Sequence -> global count with a custom coder -> fixed windows with late firings -> GBK.
  Pipeline complexPipeline = Pipeline.create();
  BigEndianLongCoder customCoder = BigEndianLongCoder.of();
  PCollection<Long> elems = complexPipeline.apply(GenerateSequence.from(0L).to(207L));
  PCollection<Long> counted = elems.apply(Count.globally()).setCoder(customCoder);
  PCollection<Long> windowed =
      counted.apply(
          Window.<Long>into(FixedWindows.of(Duration.standardMinutes(7)))
              .triggering(
                  AfterWatermark.pastEndOfWindow()
                      .withLateFirings(AfterPane.elementCountAtLeast(19)))
              .accumulatingFiredPanes()
              .withAllowedLateness(Duration.standardMinutes(3L)));
  // windowedStrategy and grouped are not read again in this method; the applies themselves
  // are what add nodes to complexPipeline.
  final WindowingStrategy<?, ?> windowedStrategy = windowed.getWindowingStrategy();
  PCollection<KV<String, Long>> keyed = windowed.apply(WithKeys.of("foo"));
  PCollection<KV<String, Iterable<Long>>> grouped = keyed.apply(GroupByKey.create());

  return ImmutableList.of(trivialPipeline, sideInputPipeline, complexPipeline);
}
 
Example #18
Source File: Broadcast.java    From incubator-nemo with Apache License 2.0 5 votes vote down vote up
/**
 * Entry point of the Beam program.
 *
 * <p>Reads the lines of the input file, exposes all of them as an iterable side input, and for
 * every line writes that line followed by all lines joined with newlines. Blocks until the
 * pipeline run has finished.
 *
 * @param args {@code args[0]} is the input file path, {@code args[1]} the output file path.
 */
public static void main(final String[] args) {
  final String inputFilePath = args[0];
  final String outputFilePath = args[1];

  final Pipeline p = Pipeline.create(NemoPipelineOptionsFactory.create());
  final PCollection<String> elemCollection = GenericSourceSink.read(p, inputFilePath);
  final PCollectionView<Iterable<String>> allCollection = elemCollection.apply(View.<String>asIterable());

  // Prefixes each line to the newline-joined content of the whole collection.
  final DoFn<String, String> appendAllLines = new DoFn<String, String>() {
    @ProcessElement
    public void processElement(final ProcessContext c) {
      final String line = c.element();
      final Iterable<String> all = c.sideInput(allCollection);
      final Optional<String> appended = StreamSupport.stream(all.spliterator(), false)
        .reduce((l, r) -> l + '\n' + r);
      // An empty side input yields the literal "error".
      c.output(appended.isPresent() ? "line: " + line + "\n" + appended.get() : "error");
    }
  };

  final PCollection<String> result =
    elemCollection.apply(ParDo.of(appendAllLines).withSideInputs(allCollection));

  GenericSourceSink.write(result, outputFilePath);
  p.run().waitUntilFinish();
}
 
Example #19
Source File: TransformTranslator.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Returns the evaluator for {@link View.CreatePCollectionView}: it takes the windowed values of
 * the transform's input and registers them in the context under the transform's view, together
 * with an iterable coder for those windowed values.
 */
private static <ReadT, WriteT>
    TransformEvaluator<View.CreatePCollectionView<ReadT, WriteT>> createPCollView() {
  return new TransformEvaluator<View.CreatePCollectionView<ReadT, WriteT>>() {
    @Override
    public void evaluate(
        View.CreatePCollectionView<ReadT, WriteT> transform, EvaluationContext context) {
      Iterable<? extends WindowedValue<?>> iter =
          context.getWindowedValues(context.getInput(transform));
      PCollectionView<WriteT> output = transform.getView();
      // Coder for the whole iterable: the view's internal coder wrapped with the window coder
      // of the view's windowing strategy. The raw (Coder) cast bridges to the wildcard type.
      Coder<Iterable<WindowedValue<?>>> coderInternal =
          (Coder)
              IterableCoder.of(
                  WindowedValue.getFullCoder(
                      output.getCoderInternal(),
                      output.getWindowingStrategyInternal().getWindowFn().windowCoder()));

      // Drops the "? extends" bound so the iterable matches putPView's parameter type.
      @SuppressWarnings("unchecked")
      Iterable<WindowedValue<?>> iterCast = (Iterable<WindowedValue<?>>) iter;

      context.putPView(output, iterCast, coderInternal);
    }

    @Override
    public String toNativeString() {
      return "<createPCollectionView>";
    }
  };
}
 
Example #20
Source File: EvaluationContextTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Builds the test pipeline (a bounded source, a keyed downstream collection, an iterable view
 * and an unbounded source), applies the DirectRunner overrides, and creates the
 * {@code EvaluationContext} and the producer handles used by the tests.
 */
@Before
public void setup() {
  DirectRunner runner = DirectRunner.fromOptions(PipelineOptionsFactory.create());

  created = p.apply(Create.of(1, 2, 3));
  downstream = created.apply(WithKeys.of("foo"));
  view = created.apply(View.asIterable());
  unbounded = p.apply(GenerateSequence.from(0));

  p.replaceAll(runner.defaultTransformOverrides());

  // Collect the keyed PValues; they are passed to the evaluation context below.
  KeyedPValueTrackingVisitor keyedPValueTrackingVisitor = KeyedPValueTrackingVisitor.create();
  p.traverseTopologically(keyedPValueTrackingVisitor);

  BundleFactory bundleFactory = ImmutableListBundleFactory.create();
  DirectGraphs.performDirectOverrides(p);
  graph = DirectGraphs.getGraph(p);
  // NOTE(review): the executor created here is not shut down in this method — confirm that
  // something else releases it.
  context =
      EvaluationContext.create(
          NanosOffsetClock.create(),
          bundleFactory,
          graph,
          keyedPValueTrackingVisitor.getKeyedPValues(),
          Executors.newSingleThreadExecutor());

  createdProducer = graph.getProducer(created);
  downstreamProducer = graph.getProducer(downstream);
  viewProducer = graph.getProducer(view);
  unboundedProducer = graph.getProducer(unbounded);
}
 
Example #21
Source File: PTransformMatchersTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that the {@code createViewWithViewFn} matcher does not match a
 * {@code CreatePCollectionView} when it is built for a different ViewFn class than the one the
 * view uses.
 */
@Test
public void createViewWithViewFnDifferentViewFn() {
  PCollection<Integer> input = p.apply(Create.of(1));
  PCollectionView<Iterable<Integer>> view = input.apply(View.asIterable());

  // Purposely create a subclass to get a different class than what was expected.
  IterableViewFn<Integer> viewFn =
      new PCollectionViews.IterableViewFn<Integer>(() -> TypeDescriptors.integers()) {};
  CreatePCollectionView<?, ?> createView = CreatePCollectionView.of(view);

  // The matcher is keyed on the anonymous subclass, so the applied transform must not match.
  PTransformMatcher matcher = PTransformMatchers.createViewWithViewFn(viewFn.getClass());
  assertThat(matcher.matches(getAppliedTransform(createView)), is(false));
}
 
Example #22
Source File: BoundedSideInputJoin.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Joins every bid with a value looked up in the side input map.
 *
 * <p>The side input is turned into a map view; each bid is re-emitted with its last
 * constructor argument replaced by the map entry found under
 * {@code bid.bidder % configuration.sideInputRowCount}.
 *
 * @throws IllegalStateException if no side input has been configured
 */
@Override
public PCollection<Bid> expand(PCollection<Event> events) {

  checkState(getSideInput() != null, "Configuration error: side input is null");

  final PCollectionView<Map<Long, String>> sideInputMap = getSideInput().apply(View.asMap());

  // Map the conversion function over all bids.
  final DoFn<Bid, Bid> joinWithSideInput =
      new DoFn<Bid, Bid>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
          Bid bid = c.element();
          String sideValue =
              c.sideInput(sideInputMap).get(bid.bidder % configuration.sideInputRowCount);
          c.output(new Bid(bid.auction, bid.bidder, bid.price, bid.dateTime, sideValue));
        }
      };

  // Only want the bid events; easier to fake some side input data
  return events
      .apply(NexmarkQueryUtil.JUST_BIDS)
      .apply(
          name + ".JoinToFiles", ParDo.of(joinWithSideInput).withSideInputs(sideInputMap));
}
 
Example #23
Source File: SideInputLoadTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the list flavour of the side input test: the (optionally windowed) input is exposed as a
 * list view, consumed by {@code SideInputTestWithList} over the same input, and the result is
 * passed to the runtime monitor.
 */
private void performTestWithList(
    PCollection<KV<byte[], byte[]>> input, Optional<SyntheticStep> syntheticStep) {
  // NOTE(review): whatever applyStepIfPresent returns is not used here — confirm the synthetic
  // step is not meant to feed the rest of this method.
  applyStepIfPresent(input, "Synthetic step", syntheticStep);

  final PCollectionView<List<KV<byte[], byte[]>>> listView =
      applyWindowingIfPresent(input).apply(View.asList());

  input
      .apply(ParDo.of(new SideInputTestWithList(listView)).withSideInputs(listView))
      .apply("Collect end time metrics", ParDo.of(runtimeMonitor));
}
 
Example #24
Source File: BatchLoads.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a singleton view holding the location for BigQuery temporary files.
 *
 * <p>The location is resolved from {@code customGcsTempLocation} when it is set, otherwise from
 * the pipeline options' temp location, combined with "BigQueryWriteTemp" and the job id read
 * from {@code jobIdView}.
 *
 * @param p pipeline on which the helper steps are created
 * @param jobIdView side input providing the job id
 * @return singleton view of the resolved temp location
 */
private PCollectionView<String> createTempFilePrefixView(
    Pipeline p, final PCollectionView<String> jobIdView) {
  final DoFn<String, String> resolvePrefix =
      new DoFn<String, String>() {
        @ProcessElement
        public void getTempFilePrefix(ProcessContext c) {
          // A custom location wins over the pipeline-wide temp location.
          String tempLocationRoot =
              customGcsTempLocation != null
                  ? customGcsTempLocation.get()
                  : c.getPipelineOptions().getTempLocation();
          String tempLocation =
              resolveTempLocation(
                  tempLocationRoot, "BigQueryWriteTemp", c.sideInput(jobIdView));
          LOG.info(
              "Writing BigQuery temporary files to {} before loading them.",
              tempLocation);
          c.output(tempLocation);
        }
      };

  // The single empty string only serves to trigger the DoFn once.
  return p.apply(Create.of(""))
      .apply("GetTempFilePrefix", ParDo.of(resolvePrefix).withSideInputs(jobIdView))
      .apply("TempFilePrefixView", View.asSingleton());
}
 
Example #25
Source File: SideInputLoadTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the iterable flavour of the side input test: the (optionally windowed) input is exposed
 * as an iterable view, consumed by {@code SideInputTestWithIterable} over the same input, and
 * the result is passed to the runtime monitor.
 */
private void performTestWithIterable(
    PCollection<KV<byte[], byte[]>> input, Optional<SyntheticStep> syntheticStep) {
  // NOTE(review): whatever applyStepIfPresent returns is not used here — confirm the synthetic
  // step is not meant to feed the rest of this method.
  applyStepIfPresent(input, "Synthetic step", syntheticStep);

  final PCollectionView<Iterable<KV<byte[], byte[]>>> iterableView =
      applyWindowingIfPresent(input).apply(View.asIterable());

  input
      .apply(ParDo.of(new SideInputTestWithIterable(iterableView)).withSideInputs(iterableView))
      .apply("Collect end time metrics", ParDo.of(runtimeMonitor));
}
 
Example #26
Source File: PAssert.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@link SingletonAssert} over the multimap view of the given {@link PCollection},
 * recording the supplied reason at the assertion site.
 *
 * <p>The collection's coder is cast to {@link KvCoder}; its key and value coders are used to
 * build the coder of the asserted {@code Map<K, Iterable<V>>}. The actual coder must therefore
 * be a {@link KvCoder}, not just any {@code Coder<KV<K, V>>}.
 */
public static <K, V> SingletonAssert<Map<K, Iterable<V>>> thatMultimap(
    String reason, PCollection<KV<K, V>> actual) {
  @SuppressWarnings("unchecked")
  KvCoder<K, V> kvCoder = (KvCoder<K, V>) actual.getCoder();
  MapCoder<K, Iterable<V>> multimapCoder =
      MapCoder.of(kvCoder.getKeyCoder(), IterableCoder.of(kvCoder.getValueCoder()));
  return new PCollectionViewAssert<>(
      actual, View.asMultimap(), multimapCoder, PAssertionSite.capture(reason));
}
 
Example #27
Source File: PAssert.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@link SingletonAssert} over the map view of the given {@link PCollection},
 * recording the supplied reason at the assertion site. The {@link PCollection} must have at
 * most one value per key.
 *
 * <p>The collection's coder is cast to {@link KvCoder}; its key and value coders are used to
 * build the coder of the asserted {@code Map<K, V>}. The actual coder must therefore be a
 * {@link KvCoder}, not just any {@code Coder<KV<K, V>>}.
 */
public static <K, V> SingletonAssert<Map<K, V>> thatMap(
    String reason, PCollection<KV<K, V>> actual) {
  @SuppressWarnings("unchecked")
  KvCoder<K, V> kvCoder = (KvCoder<K, V>) actual.getCoder();
  MapCoder<K, V> mapCoder = MapCoder.of(kvCoder.getKeyCoder(), kvCoder.getValueCoder());
  return new PCollectionViewAssert<>(
      actual, View.asMap(), mapCoder, PAssertionSite.capture(reason));
}
 
Example #28
Source File: CacheTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Test checks how the cache candidates map is populated by the runner when evaluating the
 * pipeline.
 *
 * <p>A collection used twice must be recorded with a count of 2, and it must be the only
 * candidate with a count above 1.
 */
@Test
public void cacheCandidatesUpdaterTest() {
  SparkPipelineOptions options = createOptions();
  Pipeline pipeline = Pipeline.create(options);
  PCollection<String> pCollection = pipeline.apply(Create.of("foo", "bar"));

  // First use of pCollection.
  pCollection.apply(Count.globally());
  // Second use of pCollection.
  PCollectionView<List<String>> view = pCollection.apply(View.asList());

  // Internally View.asList() creates a PCollection that underlies the PCollectionView, that
  // PCollection should not be cached as the SparkRunner does not access that PCollection to
  // access the PCollectionView.
  pipeline
      .apply(Create.of("foo", "baz"))
      .apply(
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext processContext) {
                      // Pass through only the elements that are present in the side input list.
                      if (processContext.sideInput(view).contains(processContext.element())) {
                        processContext.output(processContext.element());
                      }
                    }
                  })
              .withSideInputs(view));

  // Visit the pipeline with the cache visitor, then inspect the candidates it recorded.
  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
  SparkRunner.CacheVisitor cacheVisitor =
      new SparkRunner.CacheVisitor(new TransformTranslator.Translator(), ctxt);
  pipeline.traverseTopologically(cacheVisitor);
  assertEquals(2L, (long) ctxt.getCacheCandidates().get(pCollection));
  assertEquals(1L, ctxt.getCacheCandidates().values().stream().filter(l -> l > 1).count());
}
 
Example #29
Source File: DataflowPipelineTranslatorTest.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Translates a pipeline that creates an iterable view and pins the shape of the resulting job:
 * three steps, the second-to-last with an indexed-format output and the last of kind
 * "CollectionToSingleton".
 */
@Test
public void testToIterableTranslationWithIsmSideInput() throws Exception {
  // A "change detector" test that makes sure the translation
  // of getting a PCollectionView<Iterable<T>> does not change
  // in bad ways during refactor

  DataflowPipelineOptions options = buildPipelineOptions();
  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);

  Pipeline pipeline = Pipeline.create(options);
  pipeline.apply(Create.of(1, 2, 3)).apply(View.asIterable());

  // Runner-specific transform replacements are applied before translating.
  DataflowRunner runner = DataflowRunner.fromOptions(options);
  runner.replaceTransforms(pipeline);
  SdkComponents sdkComponents = createSdkComponents(options);
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
  Job job =
      translator
          .translate(pipeline, pipelineProto, sdkComponents, runner, Collections.emptyList())
          .getJob();
  assertAllStepOutputsHaveUniqueIds(job);

  List<Step> steps = job.getSteps();
  assertEquals(3, steps.size());

  // Second-to-last step: its only output must be flagged "use_indexed_format".
  @SuppressWarnings("unchecked")
  List<Map<String, Object>> toIsmRecordOutputs =
      (List<Map<String, Object>>)
          steps.get(steps.size() - 2).getProperties().get(PropertyNames.OUTPUT_INFO);
  assertTrue(
      Structs.getBoolean(Iterables.getOnlyElement(toIsmRecordOutputs), "use_indexed_format"));

  // Last step: the view creation itself.
  Step collectionToSingletonStep = steps.get(steps.size() - 1);
  assertEquals("CollectionToSingleton", collectionToSingletonStep.getKind());
}
 
Example #30
Source File: SamzaPublishViewTransformOverride.java    From beam with Apache License 2.0 5 votes vote down vote up
/**
 * Replaces a {@link View.CreatePCollectionView} with a
 * {@code SamzaCreatePCollectionViewTransform} built for the same view, applied to the original
 * transform's single input.
 */
@Override
public PTransformReplacement<PCollection<ElemT>, PCollection<ElemT>> getReplacementTransform(
    AppliedPTransform<
            PCollection<ElemT>, PCollection<ElemT>, View.CreatePCollectionView<ElemT, ViewT>>
        transform) {

  // getOnlyElement enforces that the applied transform has exactly one input.
  @SuppressWarnings("unchecked")
  PCollection<ElemT> input =
      (PCollection<ElemT>) Iterables.getOnlyElement(transform.getInputs().values());

  return PTransformReplacement.of(
      input, new SamzaCreatePCollectionViewTransform<>(transform.getTransform().getView()));
}