Java Code Examples for org.apache.beam.sdk.values.PCollection#apply()

The following examples show how to use org.apache.beam.sdk.values.PCollection#apply(). They are taken from open-source projects; you can go to the original project or source file by following the link above each example.
Example 1
Source File: FileIndexerPipeline.java    From dataflow-opinion-analysis with Apache License 2.0
/**
 * Splits the incoming index summaries into two branches, enriches one branch with
 * CNLP entities, and merges the branches back together.
 *
 * @param indexes the index summaries to enrich
 * @param ratio the split ratio passed to SplitAB
 * @return the merged collection of pass-through and enriched index summaries
 */
private static PCollection<ContentIndexSummary> enrichWithCNLP(
		PCollection<ContentIndexSummary> indexes, Float ratio) {
	
	PCollectionTuple splitAB = indexes
		.apply(ParDo.of(new SplitAB(ratio))
			.withOutputTags(PipelineTags.BranchA,  
				TupleTagList.of(PipelineTags.BranchB))); 
	
	PCollection<ContentIndexSummary> branchACol = splitAB.get(PipelineTags.BranchA);
	PCollection<ContentIndexSummary> branchBCol = splitAB.get(PipelineTags.BranchB);
	
	PCollection<ContentIndexSummary> enrichedBCol = branchBCol.apply(
		ParDo.of(new EnrichWithCNLPEntities()));
	
	// Merge the pass-through branch with the enriched branch into a single collection
	PCollectionList<ContentIndexSummary> contentIndexSummariesList = 
		PCollectionList.of(branchACol).and(enrichedBCol);
	PCollection<ContentIndexSummary> allIndexSummaries = 
		contentIndexSummariesList.apply(Flatten.<ContentIndexSummary>pCollections());

	return allIndexSummaries;
}
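
The SplitAB DoFn itself is not part of this excerpt. A minimal sketch of a two-output DoFn that could back this pattern is shown below; the random, ratio-based routing is illustrative rather than the project's actual logic, and java.util.concurrent.ThreadLocalRandom is assumed:

static class SplitAB extends DoFn<ContentIndexSummary, ContentIndexSummary> {
  private final float ratio;

  SplitAB(Float ratio) {
    this.ratio = ratio;
  }

  @ProcessElement
  public void processElement(ProcessContext c) {
    // Elements emitted without a tag go to the main output (BranchA above);
    // the rest are routed to the additional output tag (BranchB).
    if (ThreadLocalRandom.current().nextFloat() < ratio) {
      c.output(c.element());
    } else {
      c.output(PipelineTags.BranchB, c.element());
    }
  }
}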
 
Example 2
Source File: GroupByKeyTest.java    From beam with Apache License 2.0
@Test
public void testGroupByKeyNonDeterministic() throws Exception {

  // An empty input is enough here: the coder determinism check fires when
  // GroupByKey is applied, before any elements are processed.
  List<KV<Map<String, String>, Integer>> ungroupedPairs = Arrays.asList();

  PCollection<KV<Map<String, String>, Integer>> input =
      p.apply(
          Create.of(ungroupedPairs)
              .withCoder(
                  KvCoder.of(
                      MapCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()),
                      BigEndianIntegerCoder.of())));

  thrown.expect(IllegalStateException.class);
  thrown.expectMessage("must be deterministic");
  input.apply(GroupByKey.create());
}
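
For contrast, the same apply() call succeeds when the key coder is deterministic. A minimal sketch, assuming the same TestPipeline p:

PCollection<KV<String, Integer>> pairs =
    p.apply(Create.of(KV.of("a", 1), KV.of("a", 2), KV.of("b", 3)));

// StringUtf8Coder is deterministic, so GroupByKey accepts the input.
PCollection<KV<String, Iterable<Integer>>> grouped = pairs.apply(GroupByKey.create());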
 
Example 3
Source File: BeamSqlMultipleSchemasTest.java    From beam with Apache License 2.0
@Test
public void testOverrideQualifiedMainSchema() {
  PCollection<Row> inputMain =
      pipeline.apply("mainInput", create(row(1, "pcollection_1"), row(2, "pcollection_2")));

  PCollection<Row> inputExtra =
      pipeline.apply("extraInput", create(row(1, "_extra_table_1"), row(2, "_extra_table_2")));

  TableProvider extraInputProvider = extraTableProvider("extraTable", inputExtra);

  PCollection<Row> result =
      inputMain.apply(
          SqlTransform.query("SELECT f_int, f_string FROM beam.extraTable")
              .withTableProvider("beam", extraInputProvider));

  PAssert.that(result).containsInAnyOrder(row(1, "_extra_table_1"), row(2, "_extra_table_2"));
  pipeline.run();
}
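
When only a single PCollection is queried, no table provider is needed: Beam SQL exposes the lone input under the built-in name PCOLLECTION. A minimal sketch using the same input:

PCollection<Row> filtered =
    inputMain.apply(SqlTransform.query("SELECT f_int, f_string FROM PCOLLECTION"));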
 
Example 4
Source File: SortValuesTest.java    From beam with Apache License 2.0
@Test
public void testSecondaryKeySorting() {
  // Create a PCollection of <Key, <SecondaryKey, Value>> pairs.
  PCollection<KV<String, KV<String, Integer>>> input =
      p.apply(
          Create.of(
              Arrays.asList(
                  KV.of("key1", KV.of("secondaryKey2", 20)),
                  KV.of("key2", KV.of("secondaryKey2", 200)),
                  KV.of("key1", KV.of("secondaryKey3", 30)),
                  KV.of("key1", KV.of("secondaryKey1", 10)),
                  KV.of("key2", KV.of("secondaryKey1", 100)))));

  // Group by Key, bringing <SecondaryKey, Value> pairs for the same Key together.
  PCollection<KV<String, Iterable<KV<String, Integer>>>> grouped =
      input.apply(GroupByKey.create());

  // For every Key, sort the iterable of <SecondaryKey, Value> pairs by SecondaryKey.
  PCollection<KV<String, Iterable<KV<String, Integer>>>> groupedAndSorted =
      grouped.apply(SortValues.create(BufferedExternalSorter.options()));

  PAssert.that(groupedAndSorted)
      .satisfies(new AssertThatHasExpectedContentsForTestSecondaryKeySorting());

  p.run();
}
 
Example 5
Source File: Task.java    From beam with Apache License 2.0
public static void main(String[] args) {
  PipelineOptions options = PipelineOptionsFactory.fromArgs(args).create();
  Pipeline pipeline = Pipeline.create(options);

  PCollection<BigInteger> numbers =
      pipeline.apply(
          Create.of(
              BigInteger.valueOf(10), BigInteger.valueOf(20), BigInteger.valueOf(30),
              BigInteger.valueOf(40), BigInteger.valueOf(50)
          ));

  PCollection<BigInteger> output = applyTransform(numbers);

  output.apply(Log.ofElements());

  pipeline.run();
}
 
Example 6
Source File: BatchViewOverrides.java    From beam with Apache License 2.0
@Override
public PCollection<KV<Integer, Iterable<KV<W, WindowedValue<T>>>>> expand(
    PCollection<T> input) {
  @SuppressWarnings("unchecked")
  Coder<W> windowCoder = (Coder<W>) input.getWindowingStrategy().getWindowFn().windowCoder();
  PCollection<KV<Integer, KV<W, WindowedValue<T>>>> rval =
      input.apply(
          ParDo.of(new UseWindowHashAsKeyAndWindowAsSortKeyDoFn<T, W>(ismCoderForHash)));
  rval.setCoder(
      KvCoder.of(
          VarIntCoder.of(),
          KvCoder.of(windowCoder, FullWindowedValueCoder.of(input.getCoder(), windowCoder))));
  return rval.apply(new GroupByKeyAndSortValuesOnly<>());
}
 
Example 7
Source File: TextTableProviderTest.java    From beam with Apache License 2.0
/**
 * Tests that {@code CREATE EXTERNAL TABLE TYPE text} with a format other than "csv" or "lines"
 * results in a CSV read of that format.
 */
@Test
public void testLegacyTdfCsv() throws Exception {
  Files.write(
      tempFolder.newFile("test.csv").toPath(),
      "hello\t13\n\ngoodbye\t42\n".getBytes(Charsets.UTF_8));

  BeamSqlEnv env = BeamSqlEnv.inMemory(new TextTableProvider());
  env.executeDdl(
      String.format(
          "CREATE EXTERNAL TABLE test %s TYPE text LOCATION '%s/*' TBLPROPERTIES '{\"format\":\"TDF\"}'",
          SQL_CSV_SCHEMA, tempFolder.getRoot()));

  PCollection<Row> rows =
      BeamSqlRelUtils.toPCollection(pipeline, env.parseQuery("SELECT * FROM test"));

  rows.apply(
      MapElements.into(TypeDescriptors.voids())
          .via(
              r -> {
                System.out.println(r.toString());
                return null;
              }));

  PAssert.that(rows)
      .containsInAnyOrder(
          Row.withSchema(CSV_SCHEMA).addValues("hello", 13).build(),
          Row.withSchema(CSV_SCHEMA).addValues("goodbye", 42).build());
  pipeline.run();
}
 
Example 8
Source File: CassandraIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  if (mutationType() == MutationType.DELETE) {
    input.apply(ParDo.of(new DeleteFn<>(this)));
  } else {
    input.apply(ParDo.of(new WriteFn<>(this)));
  }
  return PDone.in(input.getPipeline());
}
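
On the calling side, this Write transform is configured through the CassandraIO builder before being applied. A rough sketch follows; the Person entity class and connection values are hypothetical, and the builder methods shown should be checked against the CassandraIO version in use:

// input is a PCollection<Person>, where Person is a hypothetical POJO annotated for the
// Cassandra object mapper.
input.apply(
    CassandraIO.<Person>write()
        .withHosts(Arrays.asList("localhost"))
        .withKeyspace("beam_ks")
        .withEntity(Person.class));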
 
Example 9
Source File: FixedFlowInputRuntimeTest.java    From components with Apache License 2.0
@Test
public void test_MultipleInput_OneOutputRow() throws Exception {
    String inputAsString = generateInputJSON(inputSchema, inputIndexedRecord1)
            + generateInputJSON(inputSchema, inputIndexedRecord2);

    FixedFlowInputProperties properties = new FixedFlowInputProperties("test");
    properties.init();
    properties.schemaFlow.schema.setValue(inputSchema);
    properties.values.setValue(inputAsString);
    properties.nbRows.setValue(1);

    FixedFlowInputRuntime runtime = new FixedFlowInputRuntime();
    runtime.initialize(null, properties);

    PCollection<IndexedRecord> indexRecords = pipeline.apply(runtime);
    try (DirectCollector<IndexedRecord> collector = DirectCollector.of()) {
        indexRecords.apply(collector);

        // Run the pipeline to fill the collectors.
        pipeline.run().waitUntilFinish();

        // Validate the contents of the collected outputs.
        List<IndexedRecord> outputs = collector.getRecords();
        assertEquals(2, outputs.size());
        assertEquals(inputIndexedRecord1.toString(), outputs.get(0).toString());
        assertEquals(inputIndexedRecord2.toString(), outputs.get(1).toString());
    }
}
 
Example 10
Source File: CombineTranslationTest.java    From beam with Apache License 2.0
@Test
public void testToProtoWithoutSideInputs() throws Exception {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3));
  CombineFnWithContext<Integer, int[], Integer> combineFn = new TestCombineFnWithContext();
  input.apply(Combine.globally(combineFn).withoutDefaults());
  final AtomicReference<AppliedPTransform<?, ?, Combine.Globally<?, ?>>> combine =
      new AtomicReference<>();
  pipeline.traverseTopologically(
      new PipelineVisitor.Defaults() {
        @Override
        public void leaveCompositeTransform(Node node) {
          if (node.getTransform() instanceof Combine.Globally) {
            checkState(combine.get() == null);
            combine.set((AppliedPTransform) node.toAppliedPTransform(getPipeline()));
          }
        }
      });
  checkState(combine.get() != null);
  assertEquals(combineFn, combine.get().getTransform().getFn());

  SdkComponents sdkComponents = SdkComponents.create();
  sdkComponents.registerEnvironment(Environments.createDockerEnvironment("java"));
  CombinePayload combineProto =
      CombineTranslation.CombineGloballyPayloadTranslator.payloadForCombineGlobally(
          (AppliedPTransform) combine.get(), sdkComponents);
  RunnerApi.Components componentsProto = sdkComponents.toComponents();

  assertEquals(
      combineFn.getAccumulatorCoder(pipeline.getCoderRegistry(), input.getCoder()),
      getAccumulatorCoder(combineProto, RehydratedComponents.forComponents(componentsProto)));
  assertEquals(
      combineFn,
      SerializableUtils.deserializeFromByteArray(
          combineProto.getCombineFn().getPayload().toByteArray(), "CombineFn"));
}
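
Stripped of the translation plumbing, applying a global combine in ordinary pipeline code is only a couple of lines. A minimal sketch using the SDK's built-in Sum:

PCollection<Integer> numbers = pipeline.apply(Create.of(1, 2, 3));
PCollection<Integer> total = numbers.apply(Combine.globally(Sum.ofIntegers()));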
 
Example 11
Source File: ReifyTimestampsTest.java    From beam with Apache License 2.0
@Test
@Category(ValidatesRunner.class)
public void extractFromValuesSucceeds() {
  PCollection<KV<String, TimestampedValue<Integer>>> reified =
      pipeline.apply(
          Create.of(
              KV.of("foo", TimestampedValue.of(0, new Instant(0))),
              KV.of("foo", TimestampedValue.of(1, new Instant(1))),
              KV.of("bar", TimestampedValue.of(2, new Instant(2))),
              KV.of("baz", TimestampedValue.of(3, new Instant(3)))));

  PCollection<KV<String, Integer>> timestamped =
      reified.apply(ReifyTimestamps.extractFromValues());

  PAssert.that(timestamped)
      .containsInAnyOrder(KV.of("foo", 0), KV.of("foo", 1), KV.of("bar", 2), KV.of("baz", 3));

  timestamped.apply(
      "AssertElementTimestamps",
      ParDo.of(
          new DoFn<KV<String, Integer>, Void>() {
            @ProcessElement
            public void verifyTimestampsEqualValue(ProcessContext context) {
              assertThat(
                  new Instant(context.element().getValue().longValue()),
                  equalTo(context.timestamp()));
            }
          }));

  pipeline.run();
}
 
Example 12
Source File: SparkPortableExecutionTest.java    From beam with Apache License 2.0
@Test(timeout = 120_000)
public void testExecStageWithMultipleConsumers() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  options.setRunner(CrashingRunner.class);
  options
      .as(PortablePipelineOptions.class)
      .setDefaultEnvironmentType(Environments.ENVIRONMENT_EMBEDDED);
  Pipeline pipeline = Pipeline.create(options);
  PCollection<KV<String, Iterable<String>>> f =
      pipeline
          .apply("impulse", Impulse.create())
          .apply("F", ParDo.of(new DoFnWithSideEffect<>("F")))
          // use GBK to prevent fusion of F, G, and H
          .apply(GroupByKey.create());
  f.apply("G", ParDo.of(new DoFnWithSideEffect<>("G")));
  f.apply("H", ParDo.of(new DoFnWithSideEffect<>("H")));
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline);
  JobInvocation jobInvocation =
      SparkJobInvoker.createJobInvocation(
          "testExecStageWithMultipleConsumers",
          "testExecStageWithMultipleConsumersRetrievalToken",
          sparkJobExecutor,
          pipelineProto,
          options.as(SparkPipelineOptions.class));
  jobInvocation.start();
  Assert.assertEquals(Enum.DONE, jobInvocation.getState());
}
 
Example 13
Source File: SingleInputOutputOverrideFactoryTest.java    From beam with Apache License 2.0
@Test
public void testMapOutputs() {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3));
  PCollection<Integer> output = input.apply("Map", MapElements.via(fn));
  PCollection<Integer> reappliedOutput = input.apply("ReMap", MapElements.via(fn));
  Map<PValue, ReplacementOutput> replacementMap =
      factory.mapOutputs(output.expand(), reappliedOutput);
  assertThat(
      replacementMap,
      Matchers.hasEntry(
          reappliedOutput,
          ReplacementOutput.of(
              TaggedPValue.ofExpandedValue(output),
              TaggedPValue.ofExpandedValue(reappliedOutput))));
}
 
Example 14
Source File: CoGroupByKeyLoadTest.java    From beam with Apache License 2.0
@Override
void loadTest() throws IOException {
  SyntheticSourceOptions coSourceOptions =
      fromJsonString(options.getCoSourceOptions(), SyntheticSourceOptions.class);

  Optional<SyntheticStep> syntheticStep = createStep(options.getStepOptions());

  PCollection<KV<byte[], byte[]>> input =
      pipeline.apply("Read input", readFromSource(sourceOptions));
  input = input.apply("Collect start time metrics (input)", ParDo.of(runtimeMonitor));
  input = applyWindowing(input);
  input = applyStepIfPresent(input, "Synthetic step for input", syntheticStep);

  PCollection<KV<byte[], byte[]>> coInput =
      pipeline.apply("Read co-input", readFromSource(coSourceOptions));
  coInput = coInput.apply("Collect start time metrics (co-input)", ParDo.of(runtimeMonitor));
  coInput = applyWindowing(coInput, options.getCoInputWindowDurationSec());
  coInput = applyStepIfPresent(coInput, "Synthetic step for co-input", syntheticStep);

  KeyedPCollectionTuple.of(INPUT_TAG, input)
      .and(CO_INPUT_TAG, coInput)
      .apply("CoGroupByKey", CoGroupByKey.create())
      .apply("Ungroup and reiterate", ParDo.of(new UngroupAndReiterate(options.getIterations())))
      .apply(
          "Collect total bytes", ParDo.of(new ByteMonitor(METRICS_NAMESPACE, "totalBytes.count")))
      .apply("Collect end time metrics", ParDo.of(runtimeMonitor));
}
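
On the consuming side, each CoGroupByKey result is a CoGbkResult keyed by the original key. A minimal sketch of joining two small keyed collections (names and sample data are illustrative):

final TupleTag<Integer> amountTag = new TupleTag<Integer>() {};
final TupleTag<String> nameTag = new TupleTag<String>() {};

PCollection<KV<String, Integer>> amounts =
    pipeline.apply("amounts", Create.of(KV.of("u1", 10), KV.of("u2", 20)));
PCollection<KV<String, String>> names =
    pipeline.apply("names", Create.of(KV.of("u1", "alice"), KV.of("u2", "bob")));

PCollection<KV<String, CoGbkResult>> joined =
    KeyedPCollectionTuple.of(amountTag, amounts)
        .and(nameTag, names)
        .apply(CoGroupByKey.create());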
 
Example 15
Source File: BeamSideInputJoinRelTest.java    From beam with Apache License 2.0
@Test
public void testLeftOuterJoin() throws Exception {
  String sql =
      "SELECT o1.order_id, o1.sum_site_id, o2.buyer FROM "
          + "(select order_id, sum(site_id) as sum_site_id FROM ORDER_DETAILS "
          + "          GROUP BY order_id, TUMBLE(order_time, INTERVAL '1' HOUR)) o1 "
          + " LEFT OUTER JOIN "
          + " ORDER_DETAILS1 o2 "
          + " on "
          + " o1.order_id=o2.order_id";

  PCollection<Row> rows = compilePipeline(sql, pipeline);

  rows.apply(ParDo.of(new BeamSqlOutputToConsoleFn("helloworld")));

  PAssert.that(rows.apply(ParDo.of(new TestUtils.BeamSqlRow2StringDoFn())))
      .containsInAnyOrder(
          TestUtils.RowsBuilder.of(
                  Schema.builder()
                      .addField("order_id", Schema.FieldType.INT32)
                      .addField("sum_site_id", Schema.FieldType.INT32)
                      .addNullableField("buyer", Schema.FieldType.STRING)
                      .build())
              .addRows(1, 3, "james", 2, 5, "bond", 3, 3, null)
              .getStringRows());
  pipeline.run();
}
 
Example 16
Source File: ViewEvaluatorFactoryTest.java    From beam with Apache License 2.0
@Test
public void testInMemoryEvaluator() throws Exception {
  PCollection<String> input = p.apply(Create.of("foo", "bar"));
  PCollectionView<Iterable<String>> pCollectionView = input.apply(View.asIterable());
  PCollection<Iterable<String>> concat =
      input
          .apply(WithKeys.of((Void) null))
          .setCoder(KvCoder.of(VoidCoder.of(), StringUtf8Coder.of()))
          .apply(GroupByKey.create())
          .apply(Values.create());
  PCollection<Iterable<String>> view =
      concat.apply(new ViewOverrideFactory.WriteView<>(pCollectionView));

  EvaluationContext context = mock(EvaluationContext.class);
  TestViewWriter<String, Iterable<String>> viewWriter = new TestViewWriter<>();
  when(context.createPCollectionViewWriter(concat, pCollectionView)).thenReturn(viewWriter);

  CommittedBundle<String> inputBundle = bundleFactory.createBundle(input).commit(Instant.now());
  AppliedPTransform<?, ?, ?> producer = DirectGraphs.getProducer(view);
  TransformEvaluator<Iterable<String>> evaluator =
      new ViewEvaluatorFactory(context).forApplication(producer, inputBundle);

  evaluator.processElement(WindowedValue.valueInGlobalWindow(ImmutableList.of("foo", "bar")));
  assertThat(viewWriter.latest, nullValue());

  evaluator.finishBundle();
  assertThat(
      viewWriter.latest,
      containsInAnyOrder(
          WindowedValue.valueInGlobalWindow("foo"), WindowedValue.valueInGlobalWindow("bar")));
}
 
Example 17
Source File: S3OutputRuntimeTestIT.java    From components with Apache License 2.0
@Test
public void testParquet_merge() throws IOException {
    S3DatasetProperties datasetProps = s3.createS3DatasetProperties();
    datasetProps.format.setValue(SimpleFileIOFormat.PARQUET);
    S3OutputProperties outputProperties = new S3OutputProperties("out");
    outputProperties.init();
    outputProperties.setDatasetProperties(datasetProps);
    outputProperties.mergeOutput.setValue(true);

    // Create the runtime.
    S3OutputRuntime runtime = new S3OutputRuntime();
    runtime.initialize(null, outputProperties);

    // Use the runtime in a Spark pipeline to test.
    final Pipeline p = spark.createPipeline();
    PCollection<IndexedRecord> input = p.apply( //
            Create.of(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))); //
    input.apply(runtime);

    // And run the test.
    p.run().waitUntilFinish();

    FileSystem s3FileSystem = S3Connection.createFileSystem(datasetProps);
    MiniDfsResource.assertReadParquetFile(s3FileSystem, s3.getS3APath(datasetProps),
            new HashSet<IndexedRecord>(Arrays.asList(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }), //
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }))),
            false);
    MiniDfsResource.assertFileNumber(s3FileSystem, s3.getS3APath(datasetProps), 1);

}
 
Example 18
Source File: Println.java    From gcp-ingestion with Mozilla Public License 2.0
@Override
public PDone expand(PCollection<String> input) {
  input.apply(ParDo.of(fn));
  return PDone.in(input.getPipeline());
}
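
Transforms that return PDone are terminal: they consume their input and produce no further PCollection. A minimal sketch of a comparable console-printing transform (ConsolePrint is a hypothetical name, not part of the project above):

static class ConsolePrint extends PTransform<PCollection<String>, PDone> {
  @Override
  public PDone expand(PCollection<String> input) {
    input.apply(
        ParDo.of(
            new DoFn<String, Void>() {
              @ProcessElement
              public void processElement(@Element String line) {
                System.out.println(line);
              }
            }));
    return PDone.in(input.getPipeline());
  }
}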
 
Example 19
Source File: TextTableProvider.java    From beam with Apache License 2.0
@Override
public PCollection<String> expand(PCollection<Row> input) {
  return input.apply(
      "rowToCsv",
      MapElements.into(TypeDescriptors.strings()).via(row -> beamRow2CsvLine(row, csvFormat)));
}
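
The TypeDescriptors.strings() hint is what lets Beam infer an output coder for the lambda. An equivalent sketch using a SimpleFunction carries the type information itself, assuming the same enclosing class members beamRow2CsvLine and csvFormat:

return input.apply(
    "rowToCsv",
    MapElements.via(
        new SimpleFunction<Row, String>() {
          @Override
          public String apply(Row row) {
            return beamRow2CsvLine(row, csvFormat);
          }
        }));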
 
Example 20
Source File: LimitRuntime.java    From components with Apache License 2.0
@Override
public PCollection<IndexedRecord> expand(PCollection<IndexedRecord> inputPCollection) {
    LimitDoFn doFn = new LimitDoFn().withProperties(properties);
    return inputPCollection.apply(ParDo.of(doFn));
}