org.apache.beam.sdk.transforms.Filter Java Examples

The following examples show how to use org.apache.beam.sdk.transforms.Filter. Each example notes the open-source project and source file it was taken from.
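
As a quick orientation before the project samples, here is a minimal, self-contained sketch of the two ways Filter appears below: Filter.by with a user-supplied predicate, and the comparison helpers such as Filter.greaterThan and Filter.lessThanEq. The class name FilterSketch, the sample data, and the step names are illustrative only and do not come from any of the projects listed here.

import java.util.Arrays;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Filter;
import org.apache.beam.sdk.values.PCollection;

public class FilterSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    PCollection<String> words =
        pipeline.apply("CreateWords", Create.of(Arrays.asList("", "apple", "banana", "")));

    // Filter.by keeps only the elements for which the predicate returns true.
    PCollection<String> nonEmpty =
        words.apply("DropEmpty", Filter.by((String word) -> !word.isEmpty()));

    PCollection<Integer> numbers =
        pipeline.apply("CreateNumbers", Create.of(1, 2, 3, 4, 5));

    // Built-in comparison helpers are available for Comparable element types.
    PCollection<Integer> bigEnough =
        numbers.apply("GreaterThan2", Filter.greaterThan(2));
    PCollection<Integer> smallEnough =
        numbers.apply("AtMost4", Filter.lessThanEq(4));

    pipeline.run().waitUntilFinish();
  }
}

The same two patterns recur throughout the examples below: a lambda or SerializableFunction passed to Filter.by, or one of the comparison factory methods applied directly to a PCollection of Comparable elements.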
Example #1
Source File: WordCount.java    From java-docs-samples with Apache License 2.0
public static void main(String[] args) {
  WordCountOptions options = PipelineOptionsFactory.fromArgs(args)
      .withValidation().as(WordCountOptions.class);

  Pipeline pipeline = Pipeline.create(options);
  pipeline
      .apply("Read lines", TextIO.read().from(options.getInputFile()))
      // [END value_provider]
      .apply("Find words", FlatMapElements.into(TypeDescriptors.strings())
          .via((String line) -> Arrays.asList(line.split("[^\\p{L}]+"))))
      .apply("Filter empty words", Filter.by((String word) -> !word.isEmpty()))
      .apply("Filter with substring", ParDo.of(new FilterWithSubstring(
          options.getWithSubstring(), options.getIsCaseSensitive())))
      .apply("Count words", Count.perElement())
      .apply("Format results", MapElements.into(TypeDescriptors.strings())
          .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue()))
      // [START nested_value_provider]
      .apply("Write results", TextIO.write().to(NestedValueProvider.of(
          options.getOutputBucket(),
          (String bucket) -> String.format("gs://%s/samples/dataflow/wordcount/outputs", bucket)
      )));
      // [END nested_value_provider]
  pipeline.run();
}
 
Example #2
Source File: VerifyBamId.java    From dataflow-java with Apache License 2.0
/**
 * Filter, pile up, and sample reads, then join against reference statistics.
 *
 * @param reads A PCollection of reads
 * @param samplingFraction Fraction of reads to keep
 * @param samplingPrefix A prefix used in generating hashes used in sampling
 * @param refFreq A PCollection mapping position to counts of alleles in
 *   a reference population.
 * @return A PCollection mapping Position to a ReadCounts proto
 */
static PCollection<KV<Position, ReadCounts>> combineReads(PCollection<Read> reads,
    double samplingFraction, String samplingPrefix,
    PCollection<KV<Position, AlleleFreq>> refFreq) {
  // Runs filters on input Reads, splits into individual aligned bases (emitting the
  // base and quality) and grabs a sample of them based on a hash mod of Position.
  PCollection<KV<Position, ReadBaseQuality>> joinReadCounts =
      reads.apply("IsOnChromosome", Filter.by(ReadFunctions.IS_ON_CHROMOSOME))
      .apply("IsNotQCFailure", Filter.by(ReadFunctions.IS_NOT_QC_FAILURE))
      .apply("IsNotDuplicate", Filter.by(ReadFunctions.IS_NOT_DUPLICATE))
      .apply("IsProperPlacement", Filter.by(ReadFunctions.IS_PROPER_PLACEMENT))
      .apply(ParDo.of(new SplitReads()))
      .apply(Filter.by(new SampleReads(samplingFraction, samplingPrefix)));

  TupleTag<ReadBaseQuality> readCountsTag = new TupleTag<>();
  TupleTag<AlleleFreq> refFreqTag = new TupleTag<>();
  // Pile up read counts, then join against reference stats.
  PCollection<KV<Position, CoGbkResult>> joined = KeyedPCollectionTuple
      .of(readCountsTag, joinReadCounts)
      .and(refFreqTag, refFreq)
      .apply(CoGroupByKey.<Position>create());
  return joined.apply(ParDo.of(new PileupAndJoinReads(readCountsTag, refFreqTag)));
}
 
Example #3
Source File: RedisIO.java    From beam with Apache License 2.0
@Override
public PCollection<KV<String, String>> expand(PCollection<KV<String, String>> input) {
  // reparallelize mimics the same behavior as in JdbcIO, used to break fusion
  PCollectionView<Iterable<KV<String, String>>> empty =
      input
          .apply("Consume", Filter.by(SerializableFunctions.constant(false)))
          .apply(View.asIterable());
  PCollection<KV<String, String>> materialized =
      input.apply(
          "Identity",
          ParDo.of(
                  new DoFn<KV<String, String>, KV<String, String>>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                      c.output(c.element());
                    }
                  })
              .withSideInputs(empty));
  return materialized.apply(Reshuffle.viaRandomKey());
}
 
Example #4
Source File: SqlQuery3.java    From beam with Apache License 2.0
@Override
public PCollection<NameCityStateId> expand(PCollection<Event> allEvents) {
  PCollection<Event> windowed =
      allEvents.apply(
          Window.into(FixedWindows.of(Duration.standardSeconds(configuration.windowSizeSec))));

  String auctionName = Auction.class.getSimpleName();
  PCollection<Row> auctions =
      windowed
          .apply(getName() + ".Filter." + auctionName, Filter.by(e1 -> e1.newAuction != null))
          .apply(getName() + ".ToRecords." + auctionName, new SelectEvent(Type.AUCTION));

  String personName = Person.class.getSimpleName();
  PCollection<Row> people =
      windowed
          .apply(getName() + ".Filter." + personName, Filter.by(e -> e.newPerson != null))
          .apply(getName() + ".ToRecords." + personName, new SelectEvent(Type.PERSON));

  PCollectionTuple inputStreams =
      PCollectionTuple.of(new TupleTag<>("Auction"), auctions)
          .and(new TupleTag<>("Person"), people);

  return inputStreams
      .apply(SqlTransform.query(QUERY).withQueryPlannerClass(plannerClass))
      .apply(Convert.fromRows(NameCityStateId.class));
}
 
Example #5
Source File: ExternalTest.java    From beam with Apache License 2.0
@Override
public Map<String, ExpansionService.TransformProvider> knownTransforms() {
  return ImmutableMap.of(
      TEST_URN_SIMPLE,
          spec -> MapElements.into(TypeDescriptors.strings()).via((String x) -> x + x),
      TEST_URN_LE,
          spec -> Filter.lessThanEq(Integer.parseInt(spec.getPayload().toStringUtf8())),
      TEST_URN_MULTI,
          spec ->
              ParDo.of(
                      new DoFn<Integer, Integer>() {
                        @ProcessElement
                        public void processElement(ProcessContext c) {
                          if (c.element() % 2 == 0) {
                            c.output(c.element());
                          } else {
                            c.output(odd, c.element());
                          }
                        }
                      })
                  .withOutputTags(even, TupleTagList.of(odd)));
}
 
Example #6
Source File: SqlQuery0.java    From beam with Apache License 2.0
@Override
public PCollection<Bid> expand(PCollection<Event> allEvents) {
  PCollection<Row> rows =
      allEvents
          .apply(Filter.by(NexmarkQueryUtil.IS_BID))
          .apply(getName() + ".SelectEvent", new SelectEvent(Type.BID));

  return rows.apply(getName() + ".Serialize", logBytesMetric(rows.getCoder()))
      .setRowSchema(rows.getSchema())
      .apply(SqlTransform.query("SELECT * FROM PCOLLECTION").withQueryPlannerClass(plannerClass))
      .apply(Convert.fromRows(Bid.class));
}
 
Example #7
Source File: FilterRowRuntime.java    From components with Apache License 2.0
@Override
public void build(BeamJobContext ctx) {
    String mainLink = ctx.getLinkNameByPortName("input_" + properties.MAIN_CONNECTOR.getName());
    if (!StringUtils.isEmpty(mainLink)) {
        PCollection<IndexedRecord> mainPCollection = ctx.getPCollectionByLinkName(mainLink);
        if (mainPCollection != null) {
            String flowLink = ctx.getLinkNameByPortName("output_" + properties.FLOW_CONNECTOR.getName());
            String rejectLink = ctx.getLinkNameByPortName("output_" + properties.REJECT_CONNECTOR.getName());

            boolean hasFlow = !StringUtils.isEmpty(flowLink);
            boolean hasReject = !StringUtils.isEmpty(rejectLink);

            if (hasFlow && hasReject) {
                // If both of the outputs are present, the DoFn must be used.
                PCollectionTuple outputTuples = mainPCollection.apply(ctx.getPTransformName(),
                        ParDo.of(new FilterRowDoFn(properties)).withOutputTags(flowOutput, TupleTagList.of(rejectOutput)));
                ctx.putPCollectionByLinkName(flowLink, outputTuples.get(flowOutput));
                ctx.putPCollectionByLinkName(rejectLink, outputTuples.get(rejectOutput));
            } else if (hasFlow || hasReject) {
                // If only one of the outputs is present, the predicate can be used for efficiency.
                FilterRowPredicate predicate = hasFlow //
                        ? new FilterRowPredicate(properties) //
                        : new FilterRowPredicate.Negate(properties);
                PCollection<IndexedRecord> output = mainPCollection.apply(ctx.getPTransformName(), Filter.by(predicate));
                ctx.putPCollectionByLinkName(hasFlow ? flowLink : rejectLink, output);
            } else {
                // If neither are specified, then don't do anything. This component could have been cut from the pipeline.
            }
        }
    }
}
 
Example #8
Source File: InvoicingPipeline.java    From nomulus with Apache License 2.0
@Override
public PCollection<String> expand(PCollection<BillingEvent> input) {
  return input
      .apply(
          "Map to invoicing key",
          MapElements.into(TypeDescriptor.of(InvoiceGroupingKey.class))
              .via(BillingEvent::getInvoiceGroupingKey))
      .apply(Filter.by((InvoiceGroupingKey key) -> key.unitPrice() != 0))
      .setCoder(new InvoiceGroupingKeyCoder())
      .apply("Count occurrences", Count.perElement())
      .apply(
          "Format as CSVs",
          MapElements.into(TypeDescriptors.strings())
              .via((KV<InvoiceGroupingKey, Long> kv) -> kv.getKey().toCsv(kv.getValue())));
}
 
Example #9
Source File: WatermarkManagerTest.java    From beam with Apache License 2.0
@Before
public void setup() {

  createdInts = p.apply("createdInts", Create.of(1, 2, 3));

  filtered = createdInts.apply("filtered", Filter.greaterThan(1));
  filteredTimesTwo =
      filtered.apply(
          "timesTwo",
          ParDo.of(
              new DoFn<Integer, Integer>() {
                @ProcessElement
                public void processElement(ProcessContext c) throws Exception {
                  c.output(c.element() * 2);
                }
              }));

  keyed = createdInts.apply("keyed", WithKeys.of("MyKey"));

  intsToFlatten = p.apply("intsToFlatten", Create.of(-1, 256, 65535));
  PCollectionList<Integer> preFlatten = PCollectionList.of(createdInts).and(intsToFlatten);
  flattened = preFlatten.apply("flattened", Flatten.pCollections());

  clock = MockClock.fromInstant(new Instant(1000));
  DirectGraphs.performDirectOverrides(p);
  graph = DirectGraphs.getGraph(p);

  manager = WatermarkManager.create(clock, graph, AppliedPTransform::getFullName);
  bundleFactory = ImmutableListBundleFactory.create();
}
 
Example #10
Source File: JdbcIO.java    From beam with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<T> input) {
  // See https://issues.apache.org/jira/browse/BEAM-2803
  // We use a combined approach to "break fusion" here:
  // (see https://cloud.google.com/dataflow/service/dataflow-service-desc#preventing-fusion)
  // 1) force the data to be materialized by passing it as a side input to an identity fn,
  // then 2) reshuffle it with a random key. Initial materialization provides some parallelism
  // and ensures that data to be shuffled can be generated in parallel, while reshuffling
  // provides perfect parallelism.
  // In most cases where a "fusion break" is needed, a simple reshuffle would be sufficient.
  // The current approach is necessary only to support the particular case of JdbcIO where
  // a single query may produce many gigabytes of query results.
  PCollectionView<Iterable<T>> empty =
      input
          .apply("Consume", Filter.by(SerializableFunctions.constant(false)))
          .apply(View.asIterable());
  PCollection<T> materialized =
      input.apply(
          "Identity",
          ParDo.of(
                  new DoFn<T, T>() {
                    @ProcessElement
                    public void process(ProcessContext c) {
                      c.output(c.element());
                    }
                  })
              .withSideInputs(empty));
  return materialized.apply(Reshuffle.viaRandomKey());
}
 
Example #11
Source File: SqlBoundedSideInputJoin.java    From beam with Apache License 2.0
@Override
public PCollection<Bid> expand(PCollection<Event> events) {
  PCollection<Row> bids =
      events
          .apply(Filter.by(NexmarkQueryUtil.IS_BID))
          .apply(getName() + ".SelectEvent", new SelectEvent(Event.Type.BID));

  checkState(getSideInput() != null, "Configuration error: side input is null");

  TupleTag<Row> sideTag = new TupleTag<Row>("side") {};
  TupleTag<Row> bidTag = new TupleTag<Row>("bid") {};

  Schema schema =
      Schema.of(
          Schema.Field.of("id", Schema.FieldType.INT64),
          Schema.Field.of("extra", Schema.FieldType.STRING));

  PCollection<Row> sideRows =
      getSideInput()
          .setSchema(
              schema,
              TypeDescriptors.kvs(TypeDescriptors.longs(), TypeDescriptors.strings()),
              kv -> Row.withSchema(schema).addValues(kv.getKey(), kv.getValue()).build(),
              row -> KV.of(row.getInt64("id"), row.getString("extra")))
          .apply("SideToRows", Convert.toRows());

  return PCollectionTuple.of(bidTag, bids)
      .and(sideTag, sideRows)
      .apply(
          SqlTransform.query(String.format(query, configuration.sideInputRowCount))
              .withQueryPlannerClass(plannerClass))
      .apply("ResultToBid", Convert.fromRows(Bid.class));
}
 
Example #12
Source File: SqlQuery5.java    From beam with Apache License 2.0
@Override
public PCollection<AuctionCount> expand(PCollection<Event> allEvents) {
  PCollection<Row> bids =
      allEvents
          .apply(Filter.by(NexmarkQueryUtil.IS_BID))
          .apply(getName() + ".SelectEvent", new SelectEvent(Type.BID));

  return PCollectionTuple.of(new TupleTag<>("Bid"), bids)
      .apply(query)
      .apply(Convert.fromRows(AuctionCount.class));
}
 
Example #13
Source File: RepublishPerChannel.java    From gcp-ingestion with Mozilla Public License 2.0
@Override
public PDone expand(PCollection<PubsubMessage> input) {
  List<Destination> destinations = baseOptions.getPerChannelSampleRatios().entrySet().stream() //
      .map(Destination::new) //
      .collect(Collectors.toList());
  int numDestinations = destinations.size();
  int numPartitions = numDestinations + 1;
  PCollectionList<PubsubMessage> partitioned = input.apply("PartitionByChannel",
      Partition.of(numPartitions, new PartitionFn(destinations)));

  for (int i = 0; i < numDestinations; i++) {
    Destination destination = destinations.get(i);
    RepublisherOptions.Parsed opts = baseOptions.as(RepublisherOptions.Parsed.class);

    // The destination pattern here must be known at compile time due to a detail of Dataflow's
    // streaming PubSub producer implementation; if that restriction is lifted in the future,
    // this can become a runtime parameter and we can perform replacement via NestedValueProvider.
    opts.setOutput(StaticValueProvider
        .of(baseOptions.getPerChannelDestination().replace("${channel}", destination.channel)));

    partitioned.get(i) //
        .apply("Sample" + destination.getCapitalizedChannel() + "BySampleIdOrRandomNumber",
            Filter.by(message -> {
              message = PubsubConstraints.ensureNonNull(message);
              String sampleId = message.getAttribute("sample_id");
              return RandomSampler.filterBySampleIdOrRandomNumber(sampleId, destination.ratio);
            }))
        .apply("Republish" + destination.getCapitalizedChannel() + "Sample",
            opts.getOutputType().write(opts));
  }

  return PDone.in(input.getPipeline());
}
 
Example #14
Source File: SqlQuery7.java    From beam with Apache License 2.0
@Override
public PCollection<Bid> expand(PCollection<Event> allEvents) {
  PCollection<Row> bids =
      allEvents
          .apply(Filter.by(NexmarkQueryUtil.IS_BID))
          .apply(getName() + ".SelectEvent", new SelectEvent(Type.BID));

  return PCollectionTuple.of(new TupleTag<>("Bid"), bids)
      .apply(query)
      .apply(Convert.fromRows(Bid.class));
}
 
Example #15
Source File: SqlQuery2.java    From beam with Apache License 2.0
@Override
public PCollection<AuctionPrice> expand(PCollection<Event> allEvents) {
  return allEvents
      .apply(Filter.by(NexmarkQueryUtil.IS_BID))
      .apply(getName() + ".SelectEvent", new SelectEvent(Type.BID))
      .apply(
          SqlTransform.query(String.format(QUERY_TEMPLATE, skipFactor))
              .withQueryPlannerClass(plannerClass))
      .apply(Convert.fromRows(AuctionPrice.class));
}
 
Example #16
Source File: IpPrivacyDecoder.java    From gcp-ingestion with Mozilla Public License 2.0
/**
 * Execute an Apache Beam pipeline and return the {@code PipelineResult}.
 */
public static PipelineResult run(IpPrivacyDecoderOptions.Parsed options) {
  final Pipeline pipeline = Pipeline.create(options);
  final List<PCollection<PubsubMessage>> errorCollections = new ArrayList<>();

  // We wrap pipeline in Optional for more convenience in chaining together transforms.
  Optional.of(pipeline) //
      .map(p -> p //
          .apply(options.getInputType().read(options)) //
          .apply(ParseUri.of()).failuresTo(errorCollections) //
          .apply("RestrictToMainPings",
              Filter
                  .by((message) -> "main".equals(message.getAttribute(Attribute.DOCUMENT_TYPE))))
          .apply(ParseProxy.of()) //
          .apply(ParseIp.of()) //
          .apply(GeoCityLookup.of(options.getGeoCityDatabase(), options.getGeoCityFilter())) //
          .apply(DecompressPayload.enabled(options.getDecompressInputPayloads())) //
          .apply(ExtractClientIdAndDropPayload.of()).failuresTo(errorCollections) //
          .apply(HashClientInfo.of(options.getClientIdHashKey(), options.getClientIpHashKey())) //
          .apply(NormalizeAttributes.of())) //
      .map(p -> p //
          .apply(RemoveAttributes.of()) //
          .apply(options.getOutputType().write(options)).failuresTo(errorCollections));

  // Write error output collections.
  PCollectionList.of(errorCollections) //
      .apply("FlattenErrorCollections", Flatten.pCollections()) //
      .apply("WriteErrorOutput", options.getErrorOutputType().write(options)) //
      .output();

  return pipeline.run();
}
 
Example #17
Source File: CdcPCollectionsFetchers.java    From DataflowTemplates with Apache License 2.0
private static PCollection<Row> filterAndDecode(
    PCollection<PubsubMessage> input, final String tableName, Schema tableSchema) {
  return input
      .apply(
          String.format("Filter_%s", tableName),
          Filter.by(
              message ->
                  message.getAttribute("table") != null
                      && message.getAttribute("table").equals(tableName)))
      .apply(
          String.format("Extract payload_%s", tableName),
          MapElements.into(TypeDescriptor.of(byte[].class)).via(PubsubMessage::getPayload))
      .apply(String.format("Decode_%s", tableName), DecodeRows.withSchema(tableSchema));
}
 
Example #18
Source File: MergeStatementBuildingFnTest.java    From DataflowTemplates with Apache License 2.0
@Test
public void testTablesBuiltInPipeline() {
  Pipeline p = Pipeline.create();

  PCollection<KV<String, KV<Schema, Schema>>> tableSchemaS =
      p.apply(Create.of(
          KV.of(TABLE_1_NAME, KV.of(TABLE_1_PK_SCHEMA, TABLE_1_SCHEMA)),
          KV.of(TABLE_2_NAME, KV.of(TABLE_2_PK_SCHEMA, TABLE_2_SCHEMA)),
          KV.of(TABLE_1_NAME, KV.of(TABLE_1_PK_SCHEMA, TABLE_1_SCHEMA))));

  PCollection<KV<String, BigQueryAction>> statementsIssued =
  tableSchemaS
      .apply(ParDo.of(
          new MergeStatementBuildingFn(CHANGELOG_DATASET_ID, REPLICA_DATASET_ID, PROJECT_ID)));

  PCollection<KV<String, Long>>  tablesCreatedCount = statementsIssued
      .apply("GetCreateActions",
          Filter.by(input -> input.getValue().action.equals(BigQueryAction.CREATE_TABLE)))
      .apply("CountCreateActions", Count.perKey());

  PCollection<KV<String, Long>>  tablesMerged = statementsIssued
      .apply("GetMergeActions",
          Filter.by(input -> input.getValue().action.equals(BigQueryAction.STATEMENT)))
      .apply("CountMergeActions", Count.perKey());

  PAssert.that(tablesCreatedCount)
      .containsInAnyOrder(
          KV.of(TABLE_1_NAME, 1L),
          KV.of(TABLE_2_NAME, 1L));

  PAssert.that(tablesMerged)
      .containsInAnyOrder(
          KV.of(TABLE_1_NAME, 2L),
          KV.of(TABLE_2_NAME, 1L));

  p.run().waitUntilFinish();
}
 
Example #19
Source File: DynamicJdbcIO.java    From DataflowTemplates with Apache License 2.0
@Override
public PCollection<T> expand(PCollection<T> input) {
  // See https://issues.apache.org/jira/browse/BEAM-2803
  // We use a combined approach to "break fusion" here:
  // (see https://cloud.google.com/dataflow/service/dataflow-service-desc#preventing-fusion)
  // 1) force the data to be materialized by passing it as a side input to an identity fn,
  // then 2) reshuffle it with a random key. Initial materialization provides some parallelism
  // and ensures that data to be shuffled can be generated in parallel, while reshuffling
  // provides perfect parallelism.
  // In most cases where a "fusion break" is needed, a simple reshuffle would be sufficient.
  // The current approach is necessary only to support the particular case of JdbcIO where
  // a single query may produce many gigabytes of query results.
  PCollectionView<Iterable<T>> empty =
      input
          .apply("Consume", Filter.by(SerializableFunctions.constant(false)))
          .apply(View.asIterable());
  PCollection<T> materialized =
      input.apply(
          "Identity",
          ParDo.of(
                  new DoFn<T, T>() {
                    @ProcessElement
                    public void process(ProcessContext c) {
                      c.output(c.element());
                    }
                  })
              .withSideInputs(empty));
  return materialized.apply(Reshuffle.viaRandomKey());
}
 
Example #20
Source File: AutoComplete.java    From beam with Apache License 2.0
@Override
public PCollectionList<KV<String, List<CompletionCandidate>>> expand(
    PCollection<CompletionCandidate> input) {
  if (minPrefix > 10) {
    // Base case, partitioning to return the output in the expected format.
    return input
        .apply(new ComputeTopFlat(candidatesPerPrefix, minPrefix))
        .apply(Partition.of(2, new KeySizePartitionFn()));
  } else {
    // If a candidate is in the top N for prefix a...b, it must also be in the top
    // N for a...bX for every X, which is typically a much smaller set to consider.
    // First, compute the top candidate for prefixes of size at least minPrefix + 1.
    PCollectionList<KV<String, List<CompletionCandidate>>> larger =
        input.apply(new ComputeTopRecursive(candidatesPerPrefix, minPrefix + 1));
    // Consider the top candidates for each prefix of length minPrefix + 1...
    PCollection<KV<String, List<CompletionCandidate>>> small =
        PCollectionList.of(larger.get(1).apply(ParDo.of(new FlattenTops())))
            // ...together with those (previously excluded) candidates of length
            // exactly minPrefix...
            .and(input.apply(Filter.by(c -> c.getValue().length() == minPrefix)))
            .apply("FlattenSmall", Flatten.pCollections())
            // ...set the key to be the minPrefix-length prefix...
            .apply(ParDo.of(new AllPrefixes(minPrefix, minPrefix)))
            // ...and (re)apply the Top operator to all of them together.
            .apply(Top.largestPerKey(candidatesPerPrefix));

    PCollection<KV<String, List<CompletionCandidate>>> flattenLarger =
        larger.apply("FlattenLarge", Flatten.pCollections());

    return PCollectionList.of(flattenLarger).and(small);
  }
}
 
Example #21
Source File: AutoCompleteTest.java    From beam with Apache License 2.0
@Test
public void testAutoComplete() {
  List<String> words =
      Arrays.asList(
          "apple",
          "apple",
          "apricot",
          "banana",
          "blackberry",
          "blackberry",
          "blackberry",
          "blueberry",
          "blueberry",
          "cherry");

  PCollection<String> input = p.apply(Create.of(words));

  PCollection<KV<String, List<CompletionCandidate>>> output =
      input
          .apply(new ComputeTopCompletions(2, recursive))
          .apply(Filter.by(element -> element.getKey().length() <= 2));

  PAssert.that(output)
      .containsInAnyOrder(
          KV.of("a", parseList("apple:2", "apricot:1")),
          KV.of("ap", parseList("apple:2", "apricot:1")),
          KV.of("b", parseList("blackberry:3", "blueberry:2")),
          KV.of("ba", parseList("banana:1")),
          KV.of("bl", parseList("blackberry:3", "blueberry:2")),
          KV.of("c", parseList("cherry:1")),
          KV.of("ch", parseList("cherry:1")));
  p.run().waitUntilFinish();
}
 
Example #22
Source File: SqlQuery1.java    From beam with Apache License 2.0
@Override
public PCollection<Bid> expand(PCollection<Event> allEvents) {
  return allEvents
      .apply(Filter.by(NexmarkQueryUtil.IS_BID))
      .apply(getName() + ".SelectEvent", new SelectEvent(Type.BID))
      .apply(QUERY)
      .apply(Convert.fromRows(Bid.class));
}
 
Example #23
Source File: Query9.java    From beam with Apache License 2.0
@Override
public PCollection<AuctionBid> expand(PCollection<Event> events) {
  return events.apply(Filter.by(new AuctionOrBid())).apply(new WinningBids(name, configuration));
}
 
Example #24
Source File: NexmarkQueryUtil.java    From beam with Apache License 2.0
@Override
public PCollection<Auction> expand(PCollection<Event> input) {
  return input
      .apply("IsNewAuction", Filter.by(IS_NEW_AUCTION))
      .apply("AsAuction", ParDo.of(AS_AUCTION));
}
 
Example #25
Source File: NexmarkQueryUtil.java    From beam with Apache License 2.0
@Override
public PCollection<Person> expand(PCollection<Event> input) {
  return input
      .apply("IsNewPerson", Filter.by(IS_NEW_PERSON))
      .apply("AsPerson", ParDo.of(AS_PERSON));
}
 
Example #26
Source File: NexmarkQueryUtil.java    From beam with Apache License 2.0
@Override
public PCollection<Bid> expand(PCollection<Event> input) {
  return input.apply("IsBid", Filter.by(IS_BID)).apply("AsBid", ParDo.of(AS_BID));
}
 
Example #27
Source File: MinimalWordCount.java    From beam with Apache License 2.0
public static void main(String[] args) {

    // Create a PipelineOptions object. This object lets us set various execution
    // options for our pipeline, such as the runner you wish to use. This example
    // will run with the DirectRunner by default, based on the class path configured
    // in its dependencies.
    PipelineOptions options = PipelineOptionsFactory.create();

    // In order to run your pipeline, you need to make following runner specific changes:
    //
    // CHANGE 1/3: Select a Beam runner, such as BlockingDataflowRunner
    // or FlinkRunner.
    // CHANGE 2/3: Specify runner-required options.
    // For BlockingDataflowRunner, set project and temp location as follows:
    //   DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
    //   dataflowOptions.setRunner(BlockingDataflowRunner.class);
    //   dataflowOptions.setProject("SET_YOUR_PROJECT_ID_HERE");
    //   dataflowOptions.setTempLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_TEMP_DIRECTORY");
    // For FlinkRunner, set the runner as follows. See {@code FlinkPipelineOptions}
    // for more details.
    //   options.as(FlinkPipelineOptions.class)
    //      .setRunner(FlinkRunner.class);

    // Create the Pipeline object with the options we defined above
    Pipeline p = Pipeline.create(options);

    // Concept #1: Apply a root transform to the pipeline; in this case, TextIO.Read to read a set
    // of input text files. TextIO.Read returns a PCollection where each element is one line from
    // the input text (a set of Shakespeare's texts).

    // This example reads a public data set consisting of the complete works of Shakespeare.
    p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*"))

        // Concept #2: Apply a FlatMapElements transform to the PCollection of text lines.
        // This transform splits the lines in PCollection<String>, where each element is an
        // individual word in Shakespeare's collected texts.
        .apply(
            FlatMapElements.into(TypeDescriptors.strings())
                .via((String line) -> Arrays.asList(line.split("[^\\p{L}]+"))))
        // We use a Filter transform to avoid empty words
        .apply(Filter.by((String word) -> !word.isEmpty()))
        // Concept #3: Apply the Count transform to our PCollection of individual words. The Count
        // transform returns a new PCollection of key/value pairs, where each key represents a
        // unique word in the text. The associated value is the occurrence count for that word.
        .apply(Count.perElement())
        // Apply a MapElements transform that formats our PCollection of word counts into a
        // printable string, suitable for writing to an output file.
        .apply(
            MapElements.into(TypeDescriptors.strings())
                .via(
                    (KV<String, Long> wordCount) ->
                        wordCount.getKey() + ": " + wordCount.getValue()))
        // Concept #4: Apply a write transform, TextIO.Write, at the end of the pipeline.
        // TextIO.Write writes the contents of a PCollection (in this case, our PCollection of
        // formatted strings) to a series of text files.
        //
        // By default, it will write to a set of files with names like wordcounts-00001-of-00005
        .apply(TextIO.write().to("wordcounts"));

    p.run().waitUntilFinish();
  }
 
Example #28
Source File: Task.java    From beam with Apache License 2.0
static PCollection<Integer> applyTransform(PCollection<Integer> input) {
  return input.apply(Filter.by(number -> number % 2 == 0));
}
 
Example #29
Source File: TestExpansionService.java    From beam with Apache License 2.0
@Override
public PTransform<PCollection<String>, PCollection<String>> buildExternal(
    StringConfiguration configuration) {
  return Filter.lessThanEq(configuration.data);
}
 
Example #30
Source File: MinimalWordCount.java    From deployment-examples with MIT License
public static void main(String[] args) {

    // Create a PipelineOptions object. This object lets us set various execution
    // options for our pipeline, such as the runner you wish to use. This example
    // will run with the DirectRunner by default, based on the class path configured
    // in its dependencies.
    PipelineOptions options = PipelineOptionsFactory.create();

    // In order to run your pipeline, you need to make following runner specific changes:
    //
    // CHANGE 1/3: Select a Beam runner, such as BlockingDataflowRunner
    // or FlinkRunner.
    // CHANGE 2/3: Specify runner-required options.
    // For BlockingDataflowRunner, set project and temp location as follows:
    //   DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
    //   dataflowOptions.setRunner(BlockingDataflowRunner.class);
    //   dataflowOptions.setProject("SET_YOUR_PROJECT_ID_HERE");
    //   dataflowOptions.setTempLocation("gs://SET_YOUR_BUCKET_NAME_HERE/AND_TEMP_DIRECTORY");
    // For FlinkRunner, set the runner as follows. See {@code FlinkPipelineOptions}
    // for more details.
    //   options.as(FlinkPipelineOptions.class)
    //      .setRunner(FlinkRunner.class);

    // Create the Pipeline object with the options we defined above
    Pipeline p = Pipeline.create(options);

    // Concept #1: Apply a root transform to the pipeline; in this case, TextIO.Read to read a set
    // of input text files. TextIO.Read returns a PCollection where each element is one line from
    // the input text (a set of Shakespeare's texts).

    // This example reads a public data set consisting of the complete works of Shakespeare.
    p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*"))

        // Concept #2: Apply a FlatMapElements transform to the PCollection of text lines.
        // This transform splits the lines in PCollection<String>, where each element is an
        // individual word in Shakespeare's collected texts.
        .apply(
            FlatMapElements.into(TypeDescriptors.strings())
                .via((String line) -> Arrays.asList(line.split("[^\\p{L}]+"))))
        // We use a Filter transform to avoid empty words
        .apply(Filter.by((String word) -> !word.isEmpty()))
        // Concept #3: Apply the Count transform to our PCollection of individual words. The Count
        // transform returns a new PCollection of key/value pairs, where each key represents a
        // unique word in the text. The associated value is the occurrence count for that word.
        .apply(Count.perElement())
        // Apply a MapElements transform that formats our PCollection of word counts into a
        // printable string, suitable for writing to an output file.
        .apply(
            MapElements.into(TypeDescriptors.strings())
                .via(
                    (KV<String, Long> wordCount) ->
                        wordCount.getKey() + ": " + wordCount.getValue()))
        // Concept #4: Apply a write transform, TextIO.Write, at the end of the pipeline.
        // TextIO.Write writes the contents of a PCollection (in this case, our PCollection of
        // formatted strings) to a series of text files.
        //
        // By default, it will write to a set of files with names like wordcounts-00001-of-00005
        .apply(TextIO.write().to("wordcounts"));

    p.run().waitUntilFinish();
  }