Java Code Examples for org.apache.beam.sdk.transforms.MapElements

The following examples show how to use org.apache.beam.sdk.transforms.MapElements. They are extracted from open source projects; the source project, source file, and license are listed above each example where available.
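Before the examples, here is a minimal self-contained sketch of the two usual ways to invoke MapElements; the class and pipeline below are illustrative, not taken from any of the projects that follow. A lambda needs into(...) to declare the output type, because Java erases a lambda's type parameters, while a SimpleFunction subclass carries its own type information and can be passed directly to via(...).

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

// Illustrative sketch only; names here do not come from the projects below.
public class MapElementsSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());

    PCollection<String> words = p.apply(Create.of("apple", "banana", "cherry"));

    // Variant 1: lambda plus into(...); the TypeDescriptor supplies the
    // output type that erasure strips from the lambda.
    PCollection<Integer> lengths =
        words.apply("LengthsViaLambda",
            MapElements.into(TypeDescriptors.integers())
                .via((String word) -> word.length()));

    // Variant 2: SimpleFunction subclass; the output type is inferred from
    // the class, so no into(...) is needed.
    PCollection<Integer> lengthsAgain =
        words.apply("LengthsViaSimpleFunction",
            MapElements.via(
                new SimpleFunction<String, Integer>() {
                  @Override
                  public Integer apply(String word) {
                    return word.length();
                  }
                }));

    p.run().waitUntilFinish();
  }
}
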
Example 1
Source Project: java-docs-samples   Source File: WordCount.java    License: Apache License 2.0
public static void main(String[] args) {
  WordCountOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation().as(WordCountOptions.class);

  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply("Read lines", TextIO.read().from(options.getInputFile()))
      .apply("Find words", FlatMapElements.into(TypeDescriptors.strings())
          .via((String line) -> Arrays.asList(line.split("[^\\p{L}]+"))))
      .apply("Filter empty words", Filter.by((String word) -> !word.isEmpty()))
      .apply("Filter with substring", ParDo.of(new FilterWithSubstring(
          options.getWithSubstring(), options.getIsCaseSensitive())))
      .apply("Count words", Count.perElement())
      .apply("Format results", MapElements.into(TypeDescriptors.strings())
          .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue()))
      // [START nested_value_provider]
      .apply("Write results", TextIO.write().to(NestedValueProvider.of(
          options.getOutputBucket(),
          (String bucket) -> String.format("gs://%s/samples/dataflow/wordcount/outputs", bucket)
      )));
      // [END nested_value_provider]
  pipeline.run();
}
 
Example 2
Source Project: incubator-nemo   Source File: WordCount.java    License: Apache License 2.0
/**
 * Static method to generate the word count Beam pipeline.
 * @param options options for the pipeline.
 * @param inputFilePath the input file path.
 * @param outputFilePath the output file path.
 * @return the generated pipeline.
 */
static Pipeline generateWordCountPipeline(final PipelineOptions options,
                                          final String inputFilePath,
                                          final String outputFilePath) {
  final Pipeline p = Pipeline.create(options);
  final PCollection<String> result = GenericSourceSink.read(p, inputFilePath)
    .apply(MapElements.<String, KV<String, Long>>via(new SimpleFunction<String, KV<String, Long>>() {
      @Override
      public KV<String, Long> apply(final String line) {
        final String[] words = line.split(" +");
        final String documentId = words[0] + "#" + words[1];
        final Long count = Long.parseLong(words[2]);
        return KV.of(documentId, count);
      }
    }))
    .apply(Sum.longsPerKey())
    .apply(MapElements.<KV<String, Long>, String>via(new SimpleFunction<KV<String, Long>, String>() {
      @Override
      public String apply(final KV<String, Long> kv) {
        return kv.getKey() + ": " + kv.getValue();
      }
    }));
  GenericSourceSink.write(result, outputFilePath);
  return p;
}
 
Example 3
@Override
public Result<PCollection<KV<TableDestination, PubsubMessage>>, PubsubMessage> expand(
    PCollection<PubsubMessage> messages) {
  return messages
      .apply(MapElements.into(TypeDescriptors.kvs(TypeDescriptor.of(TableDestination.class),
          TypeDescriptor.of(PubsubMessage.class))).via((PubsubMessage msg) -> {
            msg = PubsubConstraints.ensureNonNull(msg);
            return KV.of(getTableDestination(msg.getAttributeMap()), msg);
          }).exceptionsInto(TypeDescriptor.of(PubsubMessage.class))
          .exceptionsVia((WithFailures.ExceptionElement<PubsubMessage> ee) -> {
            try {
              throw ee.exception();
            } catch (IllegalArgumentException e) {
              return FailureMessage.of(KeyByBigQueryTableDestination.class.getSimpleName(), //
                  ee.element(), //
                  ee.exception());
            }
          }));
}
 
Example 4
Source Project: beam   Source File: SparkNativePipelineVisitor.java    License: Apache License 2.0
private String replaceFnString(
    Class<? extends PTransform> transformClass, String transformString, String fnFieldName)
    throws IllegalAccessException, InvocationTargetException, NoSuchMethodException,
        NoSuchFieldException {
  Object fn =
      transformClass.getMethod("get" + StringUtils.capitalize(fnFieldName)).invoke(transform);
  Class<?> fnClass = fn.getClass();
  String doFnName;
  Class<?> enclosingClass = fnClass.getEnclosingClass();
  if (enclosingClass != null && enclosingClass.equals(MapElements.class)) {
    Field parent = fnClass.getDeclaredField("this$0");
    parent.setAccessible(true);
    Field fnField = enclosingClass.getDeclaredField(fnFieldName);
    fnField.setAccessible(true);
    doFnName = fnField.get(parent.get(fn)).getClass().getName();
  } else {
    doFnName = fnClass.getName();
  }
  transformString = transformString.replace("<" + fnFieldName + ">", doFnName);
  return transformString;
}
 
Example 5
Source Project: gcp-ingestion   Source File: LimitPayloadSize.java    License: Mozilla Public License 2.0
/** Factory method to create mapper instance. */
public static MapWithFailures<PubsubMessage, PubsubMessage, PubsubMessage> of(int maxBytes) {
  final Counter countPayloadTooLarge = Metrics.counter(LimitPayloadSize.class,
      "payload_too_large");
  return MapElements.into(TypeDescriptor.of(PubsubMessage.class)).via((PubsubMessage msg) -> {
    msg = PubsubConstraints.ensureNonNull(msg);
    int numBytes = msg.getPayload().length;
    if (numBytes > maxBytes) {
      countPayloadTooLarge.inc();
      throw new PayloadTooLargeException("Message payload is " + numBytes
          + " bytes, larger than the configured limit of " + maxBytes);
    }
    return msg;
  }).exceptionsInto(TypeDescriptor.of(PubsubMessage.class))
      .exceptionsVia((ExceptionElement<PubsubMessage> ee) -> {
        try {
          throw ee.exception();
        } catch (PayloadTooLargeException e) {
          return FailureMessage.of(LimitPayloadSize.class.getSimpleName(), ee.element(),
              ee.exception());
        }
      });
}
 
Example 6
Source Project: beam   Source File: RowToPubsubMessage.java    License: Apache License 2.0
@Override
public PCollection<PubsubMessage> expand(PCollection<Row> input) {
  PCollection<Row> withTimestamp =
      (config.useTimestampAttribute())
          ? input.apply(
              WithTimestamps.of((row) -> row.getDateTime("event_timestamp").toInstant()))
          : input;

  return withTimestamp
      .apply(DropFields.fields("event_timestamp"))
      .apply(ToJson.of())
      .apply(
          MapElements.into(TypeDescriptor.of(PubsubMessage.class))
              .via(
                  (String json) ->
                      new PubsubMessage(
                          json.getBytes(StandardCharsets.ISO_8859_1), ImmutableMap.of())));
}
 
Example 7
@Test
public void testLimit() {
  List<String> passingPayloads = ImmutableList.of("", "abcdefg",
      StringUtils.repeat("abcdefg", 50));
  List<String> failingPayloads = ImmutableList.of(StringUtils.repeat("abcdefghij", 51));

  WithFailures.Result<PCollection<PubsubMessage>, PubsubMessage> result = pipeline //
      .apply(Create.of(Iterables.concat(passingPayloads, failingPayloads))) //
      .apply(InputFileFormat.text.decode()) //
      .apply("LimitPayloadSize", LimitPayloadSize.toBytes(500));

  PAssert
      .that(result.output().apply("get success payload",
          MapElements.into(TypeDescriptors.strings()).via(m -> new String(m.getPayload())))) //
      .containsInAnyOrder(passingPayloads);
  PAssert
      .that(result.failures().apply("get failure payload",
          MapElements.into(TypeDescriptors.strings()).via(m -> new String(m.getPayload())))) //
      .containsInAnyOrder(failingPayloads);

  pipeline.run();
}
 
Example 8
Source Project: beam   Source File: ValidateRunnerXlangTest.java    License: Apache License 2.0
@Test
@Category({ValidatesRunner.class, UsesCrossLanguageTransforms.class})
public void coGroupByKeyTest() {
  PCollection<KV<Long, String>> col1 =
      testPipeline.apply("createCol1", Create.of(KV.of(0L, "1"), KV.of(0L, "2"), KV.of(1L, "3")));
  PCollection<KV<Long, String>> col2 =
      testPipeline.apply("createCol2", Create.of(KV.of(0L, "4"), KV.of(1L, "5"), KV.of(1L, "6")));
  PCollection<KV<Long, Iterable<String>>> cgbkCol =
      KeyedPCollectionTuple.of("col1", col1)
          .and("col2", col2)
          .apply(External.of(TEST_CGBK_URN, new byte[] {}, expansionAddr));
  PCollection<String> col =
      cgbkCol.apply(
          MapElements.into(TypeDescriptors.strings())
              .via(
                  (KV<Long, Iterable<String>> kv) -> {
                    String[] values = Iterables.toArray(kv.getValue(), String.class);
                    Arrays.sort(values);
                    return String.format("%s:%s", kv.getKey(), String.join(",", values));
                  }));
  PAssert.that(col).containsInAnyOrder("0:1,2,4", "1:3,5,6");
}
 
Example 9
@Override
public PDone expand(PBegin begin) {
  PCollection<Boolean> result =
      begin
          .apply(Create.of(DUMMY_ROW).withRowSchema(DUMMY_SCHEMA))
          .apply(SqlTransform.query("SELECT " + expr))
          .apply(MapElements.into(TypeDescriptors.booleans()).via(row -> row.getBoolean(0)));

  PAssert.that(result)
      .satisfies(
          input -> {
            assertTrue("Test expression is false: " + expr, Iterables.getOnlyElement(input));
            return null;
          });
  return PDone.in(begin.getPipeline());
}
 
Example 10
Source Project: beam   Source File: ValidateRunnerXlangTest.java    License: Apache License 2.0
@Test
@Category({ValidatesRunner.class, UsesCrossLanguageTransforms.class})
public void groupByKeyTest() {
  PCollection<KV<Long, Iterable<String>>> gbkCol =
      testPipeline
          .apply(Create.of(KV.of(0L, "1"), KV.of(0L, "2"), KV.of(1L, "3")))
          .apply(External.of(TEST_GBK_URN, new byte[] {}, expansionAddr));
  PCollection<String> col =
      gbkCol.apply(
          MapElements.into(TypeDescriptors.strings())
              .via(
                  (KV<Long, Iterable<String>> kv) -> {
                    String[] values = Iterables.toArray(kv.getValue(), String.class);
                    Arrays.sort(values);
                    return String.format("%s:%s", kv.getKey(), String.join(",", values));
                  }));
  PAssert.that(col).containsInAnyOrder("0:1,2", "1:3");
}
 
Example 11
Source Project: beam   Source File: BigQueryIOWriteTest.java    License: Apache License 2.0
@Test
public void testCreateNeverWithStreaming() throws Exception {
  p.enableAbandonedNodeEnforcement(false);

  TableReference tableRef = new TableReference();
  tableRef.setDatasetId("dataset");
  tableRef.setTableId("sometable");

  PCollection<TableRow> tableRows =
      p.apply(GenerateSequence.from(0))
          .apply(
              MapElements.via(
                  new SimpleFunction<Long, TableRow>() {
                    @Override
                    public TableRow apply(Long input) {
                      return null;
                    }
                  }))
          .setCoder(TableRowJsonCoder.of());
  tableRows.apply(
      BigQueryIO.writeTableRows()
          .to(tableRef)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_NEVER)
          .withoutValidation());
}
 
Example 12
Source Project: beam   Source File: Window.java    License: Apache License 2.0
@Override
public PCollection<T> expand(PCollection<T> input) {
  WindowingStrategy<?, ?> outputWindowingStrategy =
      getOutputWindowing(input.getWindowingStrategy());

  return input
      // We first apply a (trivial) transform to the input PCollection to produce a new
      // PCollection. This ensures that we don't modify the windowing strategy of the input
      // which may be used elsewhere.
      .apply(
          "Identity",
          MapElements.via(
              new SimpleFunction<T, T>() {
                @Override
                public T apply(T element) {
                  return element;
                }
              }))
      // Then we modify the windowing strategy.
      .setWindowingStrategyInternal(outputWindowingStrategy);
}
 
Example 13
Source Project: dbeam   Source File: BeamJdbcAvroSchema.java    License: Apache License 2.0
/** Generate Avro schema by reading one row. Expose Beam metrics via a Beam PTransform. */
public static Schema createSchema(
    final Pipeline pipeline, final JdbcExportArgs args, final Connection connection)
    throws Exception {
  final long startTime = System.nanoTime();
  final Schema generatedSchema = generateAvroSchema(args, connection);
  final long elapsedTimeSchema = (System.nanoTime() - startTime) / 1000000;
  LOGGER.info("Elapsed time to schema {} seconds", elapsedTimeSchema / 1000.0);

  final Counter cnt =
      Metrics.counter(BeamJdbcAvroSchema.class.getCanonicalName(), "schemaElapsedTimeMs");
  pipeline
      .apply(
          "ExposeSchemaCountersSeed",
          Create.of(Collections.singletonList(0)).withType(TypeDescriptors.integers()))
      .apply(
          "ExposeSchemaCounters",
          MapElements.into(TypeDescriptors.integers())
              .via(
                  v -> {
                    cnt.inc(elapsedTimeSchema);
                    return v;
                  }));
  return generatedSchema;
}
 
Example 14
@org.junit.Test
public void loadArtistCreditsByKey() {

  TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false);

  Long[] artistCreditIds = {634509L, 846332L};
  PCollection<String> text = p.apply(Create.of(artistCreditLinesOfJson)).setCoder(StringUtf8Coder.of());
  PCollection<KV<Long, MusicBrainzDataObject>> artistCredits =
      MusicBrainzTransforms.loadTableFromText(text, "artist_credit_name", "artist_credit");

  PCollection<Long> artistCreditIdPCollection =
      artistCredits.apply(MapElements
          .into(new TypeDescriptor<Long>() {})
          .via((KV<Long, MusicBrainzDataObject> kv) -> kv.getKey()));
  PAssert.that(artistCreditIdPCollection).containsInAnyOrder(artistCreditIds);
}
 
Example 15
/** Decoder from json to PubsubMessage. */
public static PTransform<PCollection<? extends String>, PCollection<PubsubMessage>> json() {
  return PTransform.compose("DecodePubsubMessages.Json", input -> input
      .apply(MapElements.into(TypeDescriptor.of(PubsubMessage.class)).via((String s) -> {
        try {
          return com.mozilla.telemetry.util.Json.readPubsubMessage(s);
        } catch (IOException e) {
          throw new UncheckedIOException(e);
        }
      })));
}
 
Example 16
Source Project: gcp-ingestion   Source File: Deduplicate.java    License: Mozilla Public License 2.0
/**
 * Strip the payload from duplicate messages and add them to the error collection, returning
 * a {@link Result} of the non-duplicate output and the error collection.
 */
public WithFailures.Result<PCollection<PubsubMessage>, //
    PubsubMessage> sendDuplicateMetadataToErrors() {
  PCollection<PubsubMessage> duplicateMetadata = tuple().get(duplicateTag())
      .apply("DropDuplicatePayloads", MapElements //
          .into(TypeDescriptor.of(PubsubMessage.class))
          .via(message -> FailureMessage.of("Duplicate",
              new PubsubMessage("".getBytes(StandardCharsets.UTF_8),
                  message.getAttributeMap()),
              new DuplicateIdException())));
  PCollection<PubsubMessage> errors = PCollectionList.of(tuple().get(errorTag()))
      .and(duplicateMetadata)
      .apply("FlattenDuplicateMetadataAndErrors", Flatten.pCollections());
  return WithFailures.Result.of(tuple().get(outputTag()), errors);
}
 
Example 17
Source Project: beam   Source File: StatefulTeamScore.java    License: Apache License 2.0
public static void main(String[] args) throws Exception {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    ExampleUtils exampleUtils = new ExampleUtils(options);
    Pipeline pipeline = Pipeline.create(options);

    pipeline
        // Read game events from Pub/Sub using custom timestamps, which are extracted from the
        // pubsub data elements, and parse the data.
        .apply(
            PubsubIO.readStrings()
                .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE)
                .fromTopic(options.getTopic()))
        .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
        // Create <team, GameActionInfo> mapping. UpdateTeamScore uses team name as key.
        .apply(
            "MapTeamAsKey",
            MapElements.into(
                    TypeDescriptors.kvs(
                        TypeDescriptors.strings(), TypeDescriptor.of(GameActionInfo.class)))
                .via((GameActionInfo gInfo) -> KV.of(gInfo.team, gInfo)))
        // Outputs a team's score every time it passes a new multiple of the threshold.
        .apply("UpdateTeamScore", ParDo.of(new UpdateTeamScoreFn(options.getThresholdScore())))
        // Write the results to BigQuery.
        .apply(
            "WriteTeamLeaders",
            new WriteWindowedToBigQuery<>(
                options.as(GcpOptions.class).getProject(),
                options.getDataset(),
                options.getLeaderBoardTableName() + "_team_leader",
                configureCompleteWindowedTableWrite()));

    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
    // command line.
    PipelineResult result = pipeline.run();
    exampleUtils.waitToFinish(result);
  }
 
Example 18
Source Project: beam   Source File: GenerateSequenceTable.java    License: Apache License 2.0
@Override
public PCollection<Row> buildIOReader(PBegin begin) {
  return begin
      .apply(GenerateSequence.from(0).withRate(elementsPerSecond, Duration.standardSeconds(1)))
      .apply(
          MapElements.into(TypeDescriptor.of(Row.class))
              .via(elm -> Row.withSchema(TABLE_SCHEMA).addValues(elm, Instant.now()).build()))
      .setRowSchema(getSchema());
}
 
Example 19
Source Project: beam   Source File: QueryablePipelineTest.java    License: Apache License 2.0
/** This method doesn't do any pruning for reachability, but this may not require a test. */
@Test
public void retainOnlyPrimitivesIgnoresUnreachableNodes() {
  Pipeline p = Pipeline.create();
  p.apply(
      new org.apache.beam.sdk.transforms.PTransform<PBegin, PCollection<Long>>() {
        @Override
        public PCollection<Long> expand(PBegin input) {
          return input
              .apply(GenerateSequence.from(2L))
              .apply(Window.into(FixedWindows.of(Duration.standardMinutes(5L))))
              .apply(MapElements.into(TypeDescriptors.longs()).via(l -> l + 1));
        }
      });

  Components augmentedComponents =
      PipelineTranslation.toProto(p)
          .getComponents()
          .toBuilder()
          .putCoders("extra-coder", RunnerApi.Coder.getDefaultInstance())
          .putWindowingStrategies(
              "extra-windowing-strategy", RunnerApi.WindowingStrategy.getDefaultInstance())
          .putEnvironments("extra-env", RunnerApi.Environment.getDefaultInstance())
          .putPcollections("extra-pc", RunnerApi.PCollection.getDefaultInstance())
          .build();
  Collection<String> primitiveComponents =
      QueryablePipeline.getPrimitiveTransformIds(augmentedComponents);
}
 
Example 20
Source Project: gcp-ingestion   Source File: Read.java    License: Mozilla Public License 2.0
@Override
public PCollection<PubsubMessage> expand(PBegin input) {
  return input //
      .apply(PubsubIO.readMessagesWithAttributesAndMessageId().fromSubscription(subscription))
      .apply(MapElements.into(TypeDescriptor.of(PubsubMessage.class)).via(message -> {
        Map<String, String> attributesWithMessageId = new HashMap<>(message.getAttributeMap());
        attributesWithMessageId.put(Attribute.MESSAGE_ID, message.getMessageId());
        return new PubsubMessage(message.getPayload(), attributesWithMessageId);
      }));
}
 
Example 21
Source Project: beam   Source File: SparkPipelineStateTest.java    License: Apache License 2.0
private void testFailedPipeline(final SparkPipelineOptions options) throws Exception {

    SparkPipelineResult result = null;

    try {
      final Pipeline pipeline = Pipeline.create(options);
      pipeline
          .apply(getValues(options))
          .setCoder(StringUtf8Coder.of())
          .apply(
              MapElements.via(
                  new SimpleFunction<String, String>() {

                    @Override
                    public String apply(final String input) {
                      throw new MyCustomException(FAILED_THE_BATCH_INTENTIONALLY);
                    }
                  }));

      result = (SparkPipelineResult) pipeline.run();
      result.waitUntilFinish();
    } catch (final Exception e) {
      assertThat(e, instanceOf(Pipeline.PipelineExecutionException.class));
      assertThat(e.getCause(), instanceOf(MyCustomException.class));
      assertThat(e.getCause().getMessage(), is(FAILED_THE_BATCH_INTENTIONALLY));
      assertThat(result.getState(), is(PipelineResult.State.FAILED));
      result.cancel();
      return;
    }

    fail("An injected failure did not affect the pipeline as expected.");
  }
 
Example 22
@Test
public void testErrors() throws Exception {
  // minimal test for throughput of a single document
  ValueProvider<String> metadataLocation = pipeline
      .newProvider(Resources.getResource("pioneer/metadata-local.json").getPath());
  ValueProvider<Boolean> kmsEnabled = pipeline.newProvider(false);
  ValueProvider<Boolean> decompressPayload = pipeline.newProvider(true);

  final List<String> input = readTestFiles(Arrays.asList("pioneer/study-foo.ciphertext.json",
      "pioneer/study-foo.ciphertext.json", "pioneer/study-foo.ciphertext.json"));
  input.set(0, modifyEncryptionKeyId(input.get(0), "invalid-key")); // IOException
  input.set(1, modifyEncryptionKeyId(input.get(1), "study-bar")); // JoseException
  input.set(2, removeRequiredSchemaNamespace(input.get(2))); // ValidationException

  Result<PCollection<PubsubMessage>, PubsubMessage> result = pipeline.apply(Create.of(input))
      .apply(InputFileFormat.text.decode())
      .apply("AddAttributes", MapElements.into(TypeDescriptor.of(PubsubMessage.class))
          .via(element -> new PubsubMessage(element.getPayload(),
              ImmutableMap.of(Attribute.DOCUMENT_NAMESPACE, "telemetry", Attribute.DOCUMENT_TYPE,
                  "pioneer-study", Attribute.DOCUMENT_VERSION, "4"))))
      .apply(DecryptPioneerPayloads.of(metadataLocation, kmsEnabled, decompressPayload));

  PAssert.that(result.output()).empty();

  PCollection<String> exceptions = result.failures().apply(MapElements
      .into(TypeDescriptors.strings()).via(message -> message.getAttribute("exception_class")));
  // IntegrityException extends JoseException
  PAssert.that(exceptions).containsInAnyOrder("java.io.IOException",
      "org.jose4j.lang.JoseException", "org.everit.json.schema.ValidationException");

  pipeline.run();
}
 
Example 23
Source Project: beam   Source File: SingleInputOutputOverrideFactoryTest.java    License: Apache License 2.0
@Test
public void testMapOutputsMultipleOriginalOutputsFails() {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3));
  PCollection<Integer> output = input.apply("Map", MapElements.via(fn));
  PCollection<Integer> reappliedOutput = input.apply("ReMap", MapElements.via(fn));
  thrown.expect(IllegalArgumentException.class);
  factory.mapOutputs(
      PCollectionList.of(output).and(input).and(reappliedOutput).expand(), reappliedOutput);
}
 
Example 24
Source Project: beam   Source File: PipelineRunnerTest.java    License: Apache License 2.0
@Test
@Category({NeedsRunner.class, UsesCommittedMetrics.class, UsesCounterMetrics.class})
public void testRunPTransform() {
  final String namespace = PipelineRunnerTest.class.getName();
  final Counter counter = Metrics.counter(namespace, "count");
  final PipelineResult result =
      PipelineRunner.fromOptions(p.getOptions())
          .run(
              new PTransform<PBegin, POutput>() {
                @Override
                public POutput expand(PBegin input) {
                  PCollection<Double> output =
                      input
                          .apply(Create.of(1, 2, 3, 4))
                          .apply("ScaleByTwo", MapElements.via(new ScaleFn<>(2.0, counter)));
                  PAssert.that(output).containsInAnyOrder(2.0, 4.0, 6.0, 8.0);
                  return output;
                }
              });

  // Checking counters to verify the pipeline actually ran.
  assertThat(
      result
          .metrics()
          .queryMetrics(
              MetricsFilter.builder()
                  .addNameFilter(MetricNameFilter.inNamespace(namespace))
                  .build())
          .getCounters(),
      hasItem(metricsResult(namespace, "count", "ScaleByTwo", 4L, true)));
}
 
Example 25
Source Project: beam   Source File: FlinkRequiresStableInputTest.java    License: Apache License 2.0
private static Pipeline createPipeline(
    PipelineOptions options, String singleOutputPrefix, String multiOutputPrefix) {
  Pipeline p = Pipeline.create(options);

  SerializableFunction<Void, Void> firstTime =
      (SerializableFunction<Void, Void>)
          value -> {
            latch.countDown();
            return null;
          };

  PCollection<String> impulse = p.apply("CreatePCollectionOfOneValue", Create.of(VALUE));
  impulse
      .apply(
          "Single-PairWithRandomKey",
          MapElements.via(new RequiresStableInputIT.PairWithRandomKeyFn()))
      .apply(
          "Single-MakeSideEffectAndThenFail",
          ParDo.of(
              new RequiresStableInputIT.MakeSideEffectAndThenFailFn(
                  singleOutputPrefix, firstTime)));
  impulse
      .apply(
          "Multi-PairWithRandomKey",
          MapElements.via(new RequiresStableInputIT.PairWithRandomKeyFn()))
      .apply(
          "Multi-MakeSideEffectAndThenFail",
          ParDo.of(
                  new RequiresStableInputIT.MakeSideEffectAndThenFailFn(
                      multiOutputPrefix, firstTime))
              .withOutputTags(new TupleTag<>(), TupleTagList.empty()));

  return p;
}
 
Example 26
Source Project: feast   Source File: ToOrderedFeatureRows.java    License: Apache License 2.0
@Override
public PCollection<FeatureRow> expand(PCollection<FeatureRowExtended> input) {
  return input
      .apply(
          "get rows",
          MapElements.into(TypeDescriptor.of(FeatureRow.class)).via(FeatureRowExtended::getRow))
      .apply(
          "normalize rows",
          MapElements.into(TypeDescriptor.of(FeatureRow.class))
              .via(ToOrderedFeatureRows::orderedFeatureRow));
}
 
Example 27
Source Project: deployment-examples   Source File: WordCount.java    License: MIT License
static void runWordCount(WordCountOptions options) {
  Pipeline p = Pipeline.create(options);

  // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
  // static FormatAsTextFn() to the ParDo transform.
  p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply("WriteCounts", TextIO.write().to(options.getOutput()));

  p.run().waitUntilFinish();
}
 
Example 28
Source Project: deployment-examples   Source File: UserScore.java    License: MIT License
@Override
public PCollection<KV<String, Integer>> expand(PCollection<GameActionInfo> gameInfo) {

  return gameInfo
      .apply(
          MapElements.into(
                  TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
              .via((GameActionInfo gInfo) -> KV.of(gInfo.getKey(field), gInfo.getScore())))
      .apply(Sum.integersPerKey());
}
 
Example 29
Source Project: beam   Source File: TextTableProviderTest.java    License: Apache License 2.0
/**
 * Tests that {@code CREATE EXTERNAL TABLE TYPE text} with a format other than "csv" or "lines"
 * results in a CSV read using that format.
 */
@Test
public void testLegacyTdfCsv() throws Exception {
  Files.write(
      tempFolder.newFile("test.csv").toPath(),
      "hello\t13\n\ngoodbye\t42\n".getBytes(Charsets.UTF_8));

  BeamSqlEnv env = BeamSqlEnv.inMemory(new TextTableProvider());
  env.executeDdl(
      String.format(
          "CREATE EXTERNAL TABLE test %s TYPE text LOCATION '%s/*' TBLPROPERTIES '{\"format\":\"TDF\"}'",
          SQL_CSV_SCHEMA, tempFolder.getRoot()));

  PCollection<Row> rows =
      BeamSqlRelUtils.toPCollection(pipeline, env.parseQuery("SELECT * FROM test"));

  rows.apply(
      MapElements.into(TypeDescriptors.voids())
          .via(
              r -> {
                System.out.println(r.toString());
                return null;
              }));

  PAssert.that(rows)
      .containsInAnyOrder(
          Row.withSchema(CSV_SCHEMA).addValues("hello", 13).build(),
          Row.withSchema(CSV_SCHEMA).addValues("goodbye", 42).build());
  pipeline.run();
}
 
Example 30
Source Project: beam   Source File: WordCount.java    License: Apache License 2.0
static void runWordCount(WordCountOptions options) {
  Pipeline p = Pipeline.create(options);

  // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
  // static FormatAsTextFn() to the ParDo transform.
  p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply("WriteCounts", TextIO.write().to(options.getOutput()));

  p.run().waitUntilFinish();
}