org.apache.beam.sdk.transforms.MapElements Java Examples

The following examples show how to use org.apache.beam.sdk.transforms.MapElements. Each example is drawn from an open source project; the source file and license are noted above the code.
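Before the project examples, here is a minimal, self-contained sketch of the core MapElements pattern (the class name and element values are illustrative, not taken from any project below). Note that when the mapping function is a Java lambda, the output type must be declared with into(...), because lambdas do not retain generic type information at runtime.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

public class MapElementsSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // A small in-memory input; illustrative values only.
    PCollection<String> words = pipeline.apply(Create.of("beam", "map", "elements"));

    // into(...) declares the output type; via(...) supplies the per-element function.
    PCollection<Integer> lengths =
        words.apply(
            MapElements.into(TypeDescriptors.integers())
                .via((String word) -> word.length()));

    pipeline.run().waitUntilFinish();
  }
}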
Example #1
Source File: WordCount.java    From java-docs-samples with Apache License 2.0
public static void main(String[] args) {
  WordCountOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation().as(WordCountOptions.class);

  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply("Read lines", TextIO.read().from(options.getInputFile()))
      .apply("Find words", FlatMapElements.into(TypeDescriptors.strings())
          .via((String line) -> Arrays.asList(line.split("[^\\p{L}]+"))))
      .apply("Filter empty words", Filter.by((String word) -> !word.isEmpty()))
      .apply("Filter with substring", ParDo.of(new FilterWithSubstring(
          options.getWithSubstring(), options.getIsCaseSensitive())))
      .apply("Count words", Count.perElement())
      .apply("Format results", MapElements.into(TypeDescriptors.strings())
          .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue()))
      // [START nested_value_provider]
      .apply("Write results", TextIO.write().to(NestedValueProvider.of(
          options.getOutputBucket(),
          (String bucket) -> String.format("gs://%s/samples/dataflow/wordcount/outputs", bucket)
      )));
      // [END nested_value_provider]
  pipeline.run();
}
 
Example #2
Source File: MusicBrainzTransformsTest.java    From bigquery-etl-dataflow-sample with Apache License 2.0
@org.junit.Test
public void loadArtistCreditsByKey() {

  TestPipeline p = TestPipeline.create().enableAbandonedNodeEnforcement(false);

  Long[] artistCreditIds = {634509L, 846332L};
  PCollection<String> text =
      p.apply(Create.of(artistCreditLinesOfJson)).setCoder(StringUtf8Coder.of());
  PCollection<KV<Long, MusicBrainzDataObject>> artistCredits =
      MusicBrainzTransforms.loadTableFromText(text, "artist_credit_name", "artist_credit");

  PCollection<Long> artistCreditIdPCollection =
      artistCredits.apply(
          MapElements
              .into(new TypeDescriptor<Long>() {})
              .via((KV<Long, MusicBrainzDataObject> kv) -> kv.getKey()));
  PAssert.that(artistCreditIdPCollection).containsInAnyOrder(artistCreditIds);
}
 
Example #3
Source File: KeyByBigQueryTableDestination.java    From gcp-ingestion with Mozilla Public License 2.0
@Override
public Result<PCollection<KV<TableDestination, PubsubMessage>>, PubsubMessage> expand(
    PCollection<PubsubMessage> messages) {
  return messages
      .apply(MapElements.into(TypeDescriptors.kvs(TypeDescriptor.of(TableDestination.class),
          TypeDescriptor.of(PubsubMessage.class))).via((PubsubMessage msg) -> {
            msg = PubsubConstraints.ensureNonNull(msg);
            return KV.of(getTableDestination(msg.getAttributeMap()), msg);
          }).exceptionsInto(TypeDescriptor.of(PubsubMessage.class))
          .exceptionsVia((WithFailures.ExceptionElement<PubsubMessage> ee) -> {
            try {
              throw ee.exception();
            } catch (IllegalArgumentException e) {
              return FailureMessage.of(KeyByBigQueryTableDestination.class.getSimpleName(), //
                  ee.element(), //
                  ee.exception());
            }
          }));
}
 
Example #4
Source File: SparkNativePipelineVisitor.java    From beam with Apache License 2.0
private String replaceFnString(
    Class<? extends PTransform> transformClass, String transformString, String fnFieldName)
    throws IllegalAccessException, InvocationTargetException, NoSuchMethodException,
        NoSuchFieldException {
  Object fn =
      transformClass.getMethod("get" + StringUtils.capitalize(fnFieldName)).invoke(transform);
  Class<?> fnClass = fn.getClass();
  String doFnName;
  Class<?> enclosingClass = fnClass.getEnclosingClass();
  if (enclosingClass != null && enclosingClass.equals(MapElements.class)) {
    Field parent = fnClass.getDeclaredField("this$0");
    parent.setAccessible(true);
    Field fnField = enclosingClass.getDeclaredField(fnFieldName);
    fnField.setAccessible(true);
    doFnName = fnField.get(parent.get(fn)).getClass().getName();
  } else {
    doFnName = fnClass.getName();
  }
  transformString = transformString.replace("<" + fnFieldName + ">", doFnName);
  return transformString;
}
 
Example #5
Source File: LimitPayloadSize.java    From gcp-ingestion with Mozilla Public License 2.0
/** Factory method to create mapper instance. */
public static MapWithFailures<PubsubMessage, PubsubMessage, PubsubMessage> of(int maxBytes) {
  final Counter countPayloadTooLarge = Metrics.counter(LimitPayloadSize.class,
      "payload_too_large");
  return MapElements.into(TypeDescriptor.of(PubsubMessage.class)).via((PubsubMessage msg) -> {
    msg = PubsubConstraints.ensureNonNull(msg);
    int numBytes = msg.getPayload().length;
    if (numBytes > maxBytes) {
      countPayloadTooLarge.inc();
      throw new PayloadTooLargeException("Message payload is " + numBytes
          + " bytes, larger than the configured limit of " + maxBytes);
    }
    return msg;
  }).exceptionsInto(TypeDescriptor.of(PubsubMessage.class))
      .exceptionsVia((ExceptionElement<PubsubMessage> ee) -> {
        try {
          throw ee.exception();
        } catch (PayloadTooLargeException e) {
          return FailureMessage.of(LimitPayloadSize.class.getSimpleName(), ee.element(),
              ee.exception());
        }
      });
}
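When this mapper is applied to a PCollection of PubsubMessage, it produces a WithFailures.Result, so callers can route successes and failures separately. A hedged usage sketch follows; the messages collection and the 100_000-byte limit are illustrative:

// Sketch only, assuming messages is a PCollection<PubsubMessage>.
WithFailures.Result<PCollection<PubsubMessage>, PubsubMessage> result =
    messages.apply("LimitPayloadSize", LimitPayloadSize.of(100_000));

PCollection<PubsubMessage> withinLimit = result.output();  // messages under the limit
PCollection<PubsubMessage> oversize = result.failures();   // failure records from exceptionsVia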
 
Example #6
Source File: RowToPubsubMessage.java    From beam with Apache License 2.0
@Override
public PCollection<PubsubMessage> expand(PCollection<Row> input) {
  PCollection<Row> withTimestamp =
      (config.useTimestampAttribute())
          ? input.apply(
              WithTimestamps.of((row) -> row.getDateTime("event_timestamp").toInstant()))
          : input;

  return withTimestamp
      .apply(DropFields.fields("event_timestamp"))
      .apply(ToJson.of())
      .apply(
          MapElements.into(TypeDescriptor.of(PubsubMessage.class))
              .via(
                  (String json) ->
                      new PubsubMessage(
                          json.getBytes(StandardCharsets.ISO_8859_1), ImmutableMap.of())));
}
 
Example #7
Source File: LimitPayloadSizeTest.java    From gcp-ingestion with Mozilla Public License 2.0
@Test
public void testLimit() {
  List<String> passingPayloads = ImmutableList.of("", "abcdefg",
      StringUtils.repeat("abcdefg", 50));
  List<String> failingPayloads = ImmutableList.of(StringUtils.repeat("abcdefghij", 51));

  WithFailures.Result<PCollection<PubsubMessage>, PubsubMessage> result = pipeline //
      .apply(Create.of(Iterables.concat(passingPayloads, failingPayloads))) //
      .apply(InputFileFormat.text.decode()) //
      .apply("LimitPayloadSize", LimitPayloadSize.toBytes(500));

  PAssert
      .that(result.output().apply("get success payload",
          MapElements.into(TypeDescriptors.strings()).via(m -> new String(m.getPayload())))) //
      .containsInAnyOrder(passingPayloads);
  PAssert
      .that(result.failures().apply("get failure payload",
          MapElements.into(TypeDescriptors.strings()).via(m -> new String(m.getPayload())))) //
      .containsInAnyOrder(failingPayloads);

  pipeline.run();
}
 
Example #8
Source File: ValidateRunnerXlangTest.java    From beam with Apache License 2.0
@Test
@Category({ValidatesRunner.class, UsesCrossLanguageTransforms.class})
public void coGroupByKeyTest() {
  PCollection<KV<Long, String>> col1 =
      testPipeline.apply("createCol1", Create.of(KV.of(0L, "1"), KV.of(0L, "2"), KV.of(1L, "3")));
  PCollection<KV<Long, String>> col2 =
      testPipeline.apply("createCol2", Create.of(KV.of(0L, "4"), KV.of(1L, "5"), KV.of(1L, "6")));
  PCollection<KV<Long, Iterable<String>>> cgbkCol =
      KeyedPCollectionTuple.of("col1", col1)
          .and("col2", col2)
          .apply(External.of(TEST_CGBK_URN, new byte[] {}, expansionAddr));
  PCollection<String> col =
      cgbkCol.apply(
          MapElements.into(TypeDescriptors.strings())
              .via(
                  (KV<Long, Iterable<String>> kv) -> {
                    String[] values = Iterables.toArray(kv.getValue(), String.class);
                    Arrays.sort(values);
                    return String.format("%s:%s", kv.getKey(), String.join(",", values));
                  }));
  PAssert.that(col).containsInAnyOrder("0:1,2,4", "1:3,5,6");
}
 
Example #9
Source File: ValidateRunnerXlangTest.java    From beam with Apache License 2.0
@Test
@Category({ValidatesRunner.class, UsesCrossLanguageTransforms.class})
public void groupByKeyTest() {
  PCollection<KV<Long, Iterable<String>>> gbkCol =
      testPipeline
          .apply(Create.of(KV.of(0L, "1"), KV.of(0L, "2"), KV.of(1L, "3")))
          .apply(External.of(TEST_GBK_URN, new byte[] {}, expansionAddr));
  PCollection<String> col =
      gbkCol.apply(
          MapElements.into(TypeDescriptors.strings())
              .via(
                  (KV<Long, Iterable<String>> kv) -> {
                    String[] values = Iterables.toArray(kv.getValue(), String.class);
                    Arrays.sort(values);
                    return String.format("%s:%s", kv.getKey(), String.join(",", values));
                  }));
  PAssert.that(col).containsInAnyOrder("0:1,2", "1:3");
}
 
Example #10
Source File: BigQueryIOWriteTest.java    From beam with Apache License 2.0
@Test
public void testCreateNeverWithStreaming() throws Exception {
  p.enableAbandonedNodeEnforcement(false);

  TableReference tableRef = new TableReference();
  tableRef.setDatasetId("dataset");
  tableRef.setTableId("sometable");

  PCollection<TableRow> tableRows =
      p.apply(GenerateSequence.from(0))
          .apply(
              MapElements.via(
                  new SimpleFunction<Long, TableRow>() {
                    @Override
                    public TableRow apply(Long input) {
                      return null;
                    }
                  }))
          .setCoder(TableRowJsonCoder.of());
  tableRows.apply(
      BigQueryIO.writeTableRows()
          .to(tableRef)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_NEVER)
          .withoutValidation());
}
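As an aside, this example uses the second MapElements style: passing a SimpleFunction subclass to via(...) directly. Because SimpleFunction carries its type parameters at runtime, no into(...) call with a TypeDescriptor is needed. A minimal sketch of the pattern, assuming numbers is a PCollection<Long>:

// Sketch only: SimpleFunction retains type information, so into(...) is unnecessary.
PCollection<String> formatted =
    numbers.apply(
        MapElements.via(
            new SimpleFunction<Long, String>() {
              @Override
              public String apply(Long input) {
                return "n=" + input;
              }
            }));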
 
Example #11
Source File: BeamSqlBuiltinFunctionsIntegrationTestBase.java    From beam with Apache License 2.0
@Override
public PDone expand(PBegin begin) {
  PCollection<Boolean> result =
      begin
          .apply(Create.of(DUMMY_ROW).withRowSchema(DUMMY_SCHEMA))
          .apply(SqlTransform.query("SELECT " + expr))
          .apply(MapElements.into(TypeDescriptors.booleans()).via(row -> row.getBoolean(0)));

  PAssert.that(result)
      .satisfies(
          input -> {
            assertTrue("Test expression is false: " + expr, Iterables.getOnlyElement(input));
            return null;
          });
  return PDone.in(begin.getPipeline());
}
 
Example #12
Source File: Window.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<T> input) {
  WindowingStrategy<?, ?> outputWindowingStrategy =
      getOutputWindowing(input.getWindowingStrategy());

  return input
      // We first apply a (trivial) transform to the input PCollection to produce a new
      // PCollection. This ensures that we don't modify the windowing strategy of the input
      // which may be used elsewhere.
      .apply(
          "Identity",
          MapElements.via(
              new SimpleFunction<T, T>() {
                @Override
                public T apply(T element) {
                  return element;
                }
              }))
      // Then we modify the windowing strategy.
      .setWindowingStrategyInternal(outputWindowingStrategy);
}
 
Example #13
Source File: BeamJdbcAvroSchema.java    From dbeam with Apache License 2.0
/** Generate Avro schema by reading one row. Expose Beam metrics via a Beam PTransform. */
public static Schema createSchema(
    final Pipeline pipeline, final JdbcExportArgs args, final Connection connection)
    throws Exception {
  final long startTime = System.nanoTime();
  final Schema generatedSchema = generateAvroSchema(args, connection);
  final long elapsedTimeSchema = (System.nanoTime() - startTime) / 1000000;
  LOGGER.info("Elapsed time to schema {} seconds", elapsedTimeSchema / 1000.0);

  final Counter cnt =
      Metrics.counter(BeamJdbcAvroSchema.class.getCanonicalName(), "schemaElapsedTimeMs");
  pipeline
      .apply(
          "ExposeSchemaCountersSeed",
          Create.of(Collections.singletonList(0)).withType(TypeDescriptors.integers()))
      .apply(
          "ExposeSchemaCounters",
          MapElements.into(TypeDescriptors.integers())
              .via(
                  v -> {
                    cnt.inc(elapsedTimeSchema);
                    return v;
                  }));
  return generatedSchema;
}
 
Example #14
Source File: WordCount.java    From incubator-nemo with Apache License 2.0
/**
 * Static method to generate the word count Beam pipeline.
 * @param options options for the pipeline.
 * @param inputFilePath the input file path.
 * @param outputFilePath the output file path.
 * @return the generated pipeline.
 */
static Pipeline generateWordCountPipeline(final PipelineOptions options,
                                                 final String inputFilePath, final String outputFilePath) {
  final Pipeline p = Pipeline.create(options);
  final PCollection<String> result = GenericSourceSink.read(p, inputFilePath)
    .apply(MapElements.<String, KV<String, Long>>via(new SimpleFunction<String, KV<String, Long>>() {
      @Override
      public KV<String, Long> apply(final String line) {
        final String[] words = line.split(" +");
        final String documentId = words[0] + "#" + words[1];
        final Long count = Long.parseLong(words[2]);
        return KV.of(documentId, count);
      }
    }))
    .apply(Sum.longsPerKey())
    .apply(MapElements.<KV<String, Long>, String>via(new SimpleFunction<KV<String, Long>, String>() {
      @Override
      public String apply(final KV<String, Long> kv) {
        return kv.getKey() + ": " + kv.getValue();
      }
    }));
  GenericSourceSink.write(result, outputFilePath);
  return p;
}
 
Example #15
Source File: TextImportTransform.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollection<ImportManifest> expand(PBegin input) {
  return input
      .apply("Read manifest", FileIO.match().filepattern(importManifest))
      .apply(
          "Resource id",
          MapElements.into(TypeDescriptor.of(ResourceId.class))
              .via(MatchResult.Metadata::resourceId))
      .apply(
          "Read manifest json",
          MapElements.into(TypeDescriptor.of(ImportManifest.class))
              .via(ReadImportManifest::readManifest));
}
 
Example #16
Source File: BigtableToAvroTest.java    From DataflowTemplates with Apache License 2.0
@Test
public void applyBigtableToAvroFn() throws Exception {
  Row bigtableRow1 = createBigtableRow("row1");
  bigtableRow1 = upsertBigtableCell(bigtableRow1, "family1", "column1", 1, "value1");
  bigtableRow1 = upsertBigtableCell(bigtableRow1, "family1", "column1", 2, "value2");
  bigtableRow1 = upsertBigtableCell(bigtableRow1, "family1", "column2", 1, "value3");
  bigtableRow1 = upsertBigtableCell(bigtableRow1, "family2", "column1", 1, "value4");
  Row bigtableRow2 = createBigtableRow("row2");
  bigtableRow2 = upsertBigtableCell(bigtableRow2, "family2", "column2", 1, "value2");
  final List<Row> bigtableRows = ImmutableList.of(bigtableRow1, bigtableRow2);

  BigtableRow avroRow1 = createAvroRow("row1");
  addAvroCell(avroRow1, "family1", "column1", 1, "value1");
  // Expect a new cell due to a different timestamp of "2".
  addAvroCell(avroRow1, "family1", "column1", 2, "value2");
  // Expect a new cell due to a different column of "column2".
  addAvroCell(avroRow1, "family1", "column2", 1, "value3");
  // Expect a new cell due to a different family of "family2".
  addAvroCell(avroRow1, "family2", "column1", 1, "value4");
  BigtableRow avroRow2 = createAvroRow("row2");
  addAvroCell(avroRow2, "family2", "column2", 1, "value2");
  final List<BigtableRow> expectedAvroRows = ImmutableList.of(avroRow1, avroRow2);

  PCollection<BigtableRow> avroRows =
      pipeline
          .apply("Create", Create.of(bigtableRows))
          .apply("Transform to Avro", MapElements.via(new BigtableToAvroFn()));

  PAssert.that(avroRows).containsInAnyOrder(expectedAvroRows);
  pipeline.run();
}
 
Example #17
Source File: WordCount.java    From beam with Apache License 2.0
public static void main(String[] args) {
  WordCountOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(WordCountOptions.class);
  Pipeline p = Pipeline.create(options);

  // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
  // static FormatAsTextFn() to the ParDo transform.
  p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply("WriteCounts", TextIO.write().to(options.getOutput()));

  p.run().waitUntilFinish();
}
 
Example #18
Source File: ImportTransform.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollection<Export> expand(PBegin input) {
  NestedValueProvider<String, String> manifestFile =
      NestedValueProvider.of(importDirectory, s -> GcsUtil.joinPath(s, "spanner-export.json"));
  return input
      .apply("Read manifest", FileIO.match().filepattern(manifestFile))
      .apply(
          "Resource id",
          MapElements.into(TypeDescriptor.of(ResourceId.class))
              .via(MatchResult.Metadata::resourceId))
      .apply(
          "Read manifest json",
          MapElements.into(TypeDescriptor.of(Export.class))
              .via(ReadExportManifestFile::readManifest));
}
 
Example #19
Source File: BeamSqlPojoExample.java    From beam with Apache License 2.0
private static MapElements<Row, Void> logRecords(String suffix) {
  return MapElements.via(
      new SimpleFunction<Row, Void>() {
        @Override
        public Void apply(Row input) {
          System.out.println(input.getValues() + suffix);
          return null;
        }
      });
}
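Because the helper returns the MapElements transform itself, it composes directly into a pipeline. An illustrative application, assuming rows is a PCollection<Row>:

// Sketch only: print each row followed by a tag.
rows.apply("LogRows", logRecords(": query result"));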
 
Example #20
Source File: BigtableToAvro.java    From DataflowTemplates with Apache License 2.0
public static PipelineResult run(Options options) {
  Pipeline pipeline = Pipeline.create(options);

  BigtableIO.Read read =
      BigtableIO.read()
          .withProjectId(options.getBigtableProjectId())
          .withInstanceId(options.getBigtableInstanceId())
          .withTableId(options.getBigtableTableId());

  // Do not validate input fields if it is running as a template.
  if (options.as(DataflowPipelineOptions.class).getTemplateLocation() != null) {
    read = read.withoutValidation();
  }

  ValueProvider<String> filePathPrefix = DualInputNestedValueProvider.of(
      options.getOutputDirectory(),
      options.getFilenamePrefix(),
      new SerializableFunction<TranslatorInput<String, String>, String>() {
        @Override
        public String apply(TranslatorInput<String, String> input) {
          return new StringBuilder(input.getX()).append(input.getY()).toString();
        }
      });

  pipeline
      .apply("Read from Bigtable", read)
      .apply("Transform to Avro", MapElements.via(new BigtableToAvroFn()))
      .apply(
          "Write to Avro in GCS",
          AvroIO.write(BigtableRow.class).to(filePathPrefix).withSuffix(".avro"));

  return pipeline.run();
}
 
Example #21
Source File: PAssert.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  input
      .apply("GroupGlobally", new GroupGlobally<>(rewindowingStrategy))
      .apply("GetPane", MapElements.via(paneExtractor))
      .setCoder(IterableCoder.of(input.getCoder()))
      .apply("RunChecks", ParDo.of(new GroupedValuesCheckerDoFn<>(checkerFn, site)))
      .apply("VerifyAssertions", new DefaultConcludeTransform());

  return PDone.in(input.getPipeline());
}
 
Example #22
Source File: TestPipelineTest.java    From beam with Apache License 2.0
@SuppressWarnings("UnusedReturnValue")
private static PCollection<String> addTransform(final PCollection<String> pCollection) {
  return pCollection.apply(
      "Map2",
      MapElements.via(
          new SimpleFunction<String, String>() {

            @Override
            public String apply(final String input) {
              return WHATEVER;
            }
          }));
}
 
Example #23
Source File: BigQueryDynamicConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollection<KV<TableId, TableRow>> expand(PCollection<TableRow> tableRowPCollection) {
  return tableRowPCollection.apply(
      "TableRowExtractDestination",
      MapElements.via(
          new SimpleFunction<TableRow, KV<TableId, TableRow>>() {
            @Override
            public KV<TableId, TableRow> apply(TableRow row) {
              TableId tableId = getTableId(row);
              TableRow resultTableRow = cleanTableRow(row.clone());
              return KV.of(tableId, resultTableRow);
            }
          }));
}
 
Example #24
Source File: ErrorConverters.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollection<PubsubMessage> expand(PCollection<BigQueryInsertError> errors) {

  TypeDescriptor<PubsubMessage> messageTypeDescriptor = new TypeDescriptor<PubsubMessage>() {};
  TypeDescriptor<String> stringTypeDescriptor = TypeDescriptors.strings();

  return errors
      .apply(
          "ConvertErrorPayload",
          MapElements.into(
                  TypeDescriptors.kvs(
                      payloadCoder().getEncodedTypeDescriptor(),
                      TypeDescriptors.maps(stringTypeDescriptor, stringTypeDescriptor)))
              .via(new BigQueryInsertErrorToKv()))
      .setCoder(
          KvCoder.of(
              payloadCoder(), MapCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of())))
      .apply(
          "ConvertToPubsubMessage",
          MapElements.into(messageTypeDescriptor).via(new KvToPubsubMessage()));
 
Example #25
Source File: AsJsons.java    From beam with Apache License 2.0
@Override
public WithFailures.Result<PCollection<String>, FailureT> expand(PCollection<InputT> input) {
  return input.apply(
      MapElements.into(TypeDescriptors.strings())
          .via(
              Contextful.fn(
                  (Contextful.Fn<InputT, String>) (input1, c) -> writeValue(input1),
                  Requirements.empty()))
          .exceptionsInto(failureType)
          .exceptionsVia(exceptionHandler));
}
 
Example #26
Source File: DecodeRows.java    From DataflowTemplates with Apache License 2.0
public PCollection<Row> expand(PCollection<byte[]> input) {
  return input.apply(MapElements.into(TypeDescriptors.rows())
      .via(elm -> {
        ByteArrayInputStream bis = new ByteArrayInputStream(elm);
        try {
          return coder.decode(bis);
        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }))
      .setRowSchema(this.schema);
}
 
Example #27
Source File: StatefulTeamScore.java    From beam with Apache License 2.0
public static void main(String[] args) throws Exception {

    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    ExampleUtils exampleUtils = new ExampleUtils(options);
    Pipeline pipeline = Pipeline.create(options);

    pipeline
        // Read game events from Pub/Sub using custom timestamps, which are extracted from the
        // pubsub data elements, and parse the data.
        .apply(
            PubsubIO.readStrings()
                .withTimestampAttribute(GameConstants.TIMESTAMP_ATTRIBUTE)
                .fromTopic(options.getTopic()))
        .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
        // Create <team, GameActionInfo> mapping. UpdateTeamScore uses team name as key.
        .apply(
            "MapTeamAsKey",
            MapElements.into(
                    TypeDescriptors.kvs(
                        TypeDescriptors.strings(), TypeDescriptor.of(GameActionInfo.class)))
                .via((GameActionInfo gInfo) -> KV.of(gInfo.team, gInfo)))
        // Outputs a team's score every time it passes a new multiple of the threshold.
        .apply("UpdateTeamScore", ParDo.of(new UpdateTeamScoreFn(options.getThresholdScore())))
        // Write the results to BigQuery.
        .apply(
            "WriteTeamLeaders",
            new WriteWindowedToBigQuery<>(
                options.as(GcpOptions.class).getProject(),
                options.getDataset(),
                options.getLeaderBoardTableName() + "_team_leader",
                configureCompleteWindowedTableWrite()));

    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
    // command line.
    PipelineResult result = pipeline.run();
    exampleUtils.waitToFinish(result);
  }
 
Example #28
Source File: WordCount.java    From beam with Apache License 2.0
static void runWordCount(WordCountOptions options) {
  Pipeline p = Pipeline.create(options);

  // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
  // static FormatAsTextFn() to the ParDo transform.
  p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
      .apply("WriteCounts", TextIO.write().to(options.getOutput()));

  p.run().waitUntilFinish();
}
 
Example #29
Source File: TextTableProviderTest.java    From beam with Apache License 2.0
/**
 * Tests that {@code CREATE EXTERNAL TABLE TYPE text} with a format other than "csv" or "lines"
 * results in a CSV read of that format.
 */
@Test
public void testLegacyTdfCsv() throws Exception {
  Files.write(
      tempFolder.newFile("test.csv").toPath(),
      "hello\t13\n\ngoodbye\t42\n".getBytes(Charsets.UTF_8));

  BeamSqlEnv env = BeamSqlEnv.inMemory(new TextTableProvider());
  env.executeDdl(
      String.format(
          "CREATE EXTERNAL TABLE test %s TYPE text LOCATION '%s/*' TBLPROPERTIES '{\"format\":\"TDF\"}'",
          SQL_CSV_SCHEMA, tempFolder.getRoot()));

  PCollection<Row> rows =
      BeamSqlRelUtils.toPCollection(pipeline, env.parseQuery("SELECT * FROM test"));

  rows.apply(
      MapElements.into(TypeDescriptors.voids())
          .via(
              r -> {
                System.out.println(r.toString());
                return null;
              }));

  PAssert.that(rows)
      .containsInAnyOrder(
          Row.withSchema(CSV_SCHEMA).addValues("hello", 13).build(),
          Row.withSchema(CSV_SCHEMA).addValues("goodbye", 42).build());
  pipeline.run();
}
 
Example #30
Source File: PubsubIO.java    From beam with Apache License 2.0
@Override
public PDone expand(PCollection<T> input) {
  if (getTopicProvider() == null) {
    throw new IllegalStateException("need to set the topic of a PubsubIO.Write transform");
  }

  switch (input.isBounded()) {
    case BOUNDED:
      input.apply(
          ParDo.of(
              new PubsubBoundedWriter(
                  MoreObjects.firstNonNull(getMaxBatchSize(), MAX_PUBLISH_BATCH_SIZE),
                  MoreObjects.firstNonNull(
                      getMaxBatchBytesSize(), MAX_PUBLISH_BATCH_BYTE_SIZE_DEFAULT))));
      return PDone.in(input.getPipeline());
    case UNBOUNDED:
      return input
          .apply(MapElements.into(new TypeDescriptor<PubsubMessage>() {}).via(getFormatFn()))
          .apply(
              new PubsubUnboundedSink(
                  getPubsubClientFactory(),
                  NestedValueProvider.of(getTopicProvider(), new TopicPathTranslator()),
                  getTimestampAttribute(),
                  getIdAttribute(),
                  100 /* numShards */,
                  MoreObjects.firstNonNull(
                      getMaxBatchSize(), PubsubUnboundedSink.DEFAULT_PUBLISH_BATCH_SIZE),
                  MoreObjects.firstNonNull(
                      getMaxBatchBytesSize(),
                      PubsubUnboundedSink.DEFAULT_PUBLISH_BATCH_BYTES)));
  }
  throw new RuntimeException(); // cases are exhaustive.
}