org.apache.beam.sdk.transforms.Partition Java Examples

The following examples show how to use org.apache.beam.sdk.transforms.Partition. Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
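Before the project-specific examples, here is a minimal, self-contained sketch of the transform (the class, pipeline, and variable names are illustrative and not taken from any project below). Partition.of takes the number of partitions and a PartitionFn that maps each element to a partition index; the result is a PCollectionList whose members are retrieved by index.

// Minimal sketch of Partition usage; names are illustrative only.
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Partition;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;

public class PartitionSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create();

    PCollection<Integer> numbers = pipeline.apply(Create.of(1, 42, 101, 7, 250));

    // Split into two partitions: index 0 for values above 100, index 1 for the rest.
    PCollectionList<Integer> partitions = numbers.apply(
        Partition.of(2, (Partition.PartitionFn<Integer>) (number, numPartitions) ->
            number > 100 ? 0 : 1));

    PCollection<Integer> large = partitions.get(0);
    PCollection<Integer> small = partitions.get(1);

    pipeline.run().waitUntilFinish();
  }
}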
Example #1
Source File: RepublishPerDocType.java    From gcp-ingestion with Mozilla Public License 2.0
@Override
public PDone expand(PCollection<PubsubMessage> input) {
  List<Destination> destinations = baseOptions.getPerDocTypeDestinations().entrySet().stream()
      .flatMap(
          entry -> entry.getValue().stream().map(value -> new Destination(entry.getKey(), value)))
      .collect(Collectors.toList());

  int numDestinations = destinations.size();
  int numPartitions = numDestinations + 1;
  PCollectionList<PubsubMessage> partitioned = input.apply("PartitionByDocType",
      Partition.of(numPartitions, new PartitionFn(destinations)));

  for (int i = 0; i < numDestinations; i++) {
    Destination destination = destinations.get(i);
    RepublisherOptions.Parsed opts = baseOptions.as(RepublisherOptions.Parsed.class);
    opts.setOutput(StaticValueProvider.of(destination.dest));
    String name = String.join("_", "republish", destination.namespace, destination.docType);
    partitioned.get(i).apply(name, opts.getOutputType().write(opts));
  }

  return PDone.in(input.getPipeline());
}
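Note that numPartitions is numDestinations + 1, so the PartitionFn has one extra partition available, presumably for messages that match none of the configured destinations; the loop only reads partitions 0 through numDestinations - 1 and never consumes that last one. If the leftover messages were needed, they could be read with partitioned.get(numDestinations).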
 
Example #2
Source File: RepublishPerNamespace.java    From gcp-ingestion with Mozilla Public License 2.0
@Override
public PDone expand(PCollection<PubsubMessage> input) {
  List<Destination> destinations = baseOptions.getPerNamespaceDestinations().entrySet().stream()
      .map(entry -> new Destination(entry.getKey(), entry.getValue()))
      .collect(Collectors.toList());
  int numDestinations = destinations.size();
  int numPartitions = numDestinations + 1;
  PCollectionList<PubsubMessage> partitioned = input.apply("PartitionByNamespace",
      Partition.of(numPartitions, new PartitionFn(destinations)));

  for (int i = 0; i < numDestinations; i++) {
    Destination destination = destinations.get(i);
    RepublisherOptions.Parsed opts = baseOptions.as(RepublisherOptions.Parsed.class);
    opts.setOutput(StaticValueProvider.of(destination.dest));
    String name = String.join("_", "republish", destination.namespace);
    partitioned.get(i).apply(name, opts.getOutputType().write(opts));
  }

  return PDone.in(input.getPipeline());
}
 
Example #3
Source File: RepublishPerChannel.java    From gcp-ingestion with Mozilla Public License 2.0
@Override
public PDone expand(PCollection<PubsubMessage> input) {
  List<Destination> destinations = baseOptions.getPerChannelSampleRatios().entrySet().stream() //
      .map(Destination::new) //
      .collect(Collectors.toList());
  int numDestinations = destinations.size();
  int numPartitions = numDestinations + 1;
  PCollectionList<PubsubMessage> partitioned = input.apply("PartitionByChannel",
      Partition.of(numPartitions, new PartitionFn(destinations)));

  for (int i = 0; i < numDestinations; i++) {
    Destination destination = destinations.get(i);
    RepublisherOptions.Parsed opts = baseOptions.as(RepublisherOptions.Parsed.class);

    // The destination pattern here must be compile-time due to a detail of Dataflow's
    // streaming PubSub producer implementation; if that restriction is lifted in the future,
    // this can become a runtime parameter and we can perform replacement via NestedValueProvider.
    opts.setOutput(StaticValueProvider
        .of(baseOptions.getPerChannelDestination().replace("${channel}", destination.channel)));

    partitioned.get(i) //
        .apply("Sample" + destination.getCapitalizedChannel() + "BySampleIdOrRandomNumber",
            Filter.by(message -> {
              message = PubsubConstraints.ensureNonNull(message);
              String sampleId = message.getAttribute("sample_id");
              return RandomSampler.filterBySampleIdOrRandomNumber(sampleId, destination.ratio);
            }))
        .apply("Republish" + destination.getCapitalizedChannel() + "Sample",
            opts.getOutputType().write(opts));
  }

  return PDone.in(input.getPipeline());
}
 
Example #4
Source File: BigQueryToTFRecord.java    From DataflowTemplates with Apache License 2.0
/**
 * The {@link BigQueryToTFRecord#applyTrainTestValSplit} method transforms the PCollection by
 * randomly partitioning it into PCollections for each dataset.
 */
static PCollectionList<byte[]> applyTrainTestValSplit(PCollection<byte[]> input,
    ValueProvider<Float> trainingPercentage,
    ValueProvider<Float> testingPercentage,
    ValueProvider<Float> validationPercentage,
    Random rand) {
  return input
      .apply(Partition.of(
          3,
          (Partition.PartitionFn<byte[]>) (number, numPartitions) -> {
            Float train = trainingPercentage.get();
            Float test = testingPercentage.get();
            Float validation = validationPercentage.get();
            Double d = rand.nextDouble();
            if (train + test + validation != 1) {
              throw new RuntimeException(String.format("Train %.2f, Test %.2f, Validation"
                  + " %.2f percentages must add up to 100 percent", train, test, validation));
            }
            if (d < train) {
              return 0;
            } else if (d >= train && d < train + test) {
              return 1;
            } else {
              return 2;
            }
          }));
}
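A possible call site for the method above (the records variable, the chosen fractions, and the Random instance are assumptions for illustration; the fractions must sum to 1 or the PartitionFn throws):

// Hypothetical usage; partition indices follow the PartitionFn above (0 = train, 1 = test, 2 = validation).
PCollectionList<byte[]> splits = BigQueryToTFRecord.applyTrainTestValSplit(
    records,                          // PCollection<byte[]> of serialized records
    StaticValueProvider.of(0.5f),     // training fraction
    StaticValueProvider.of(0.25f),    // testing fraction
    StaticValueProvider.of(0.25f),    // validation fraction
    new Random());
PCollection<byte[]> train = splits.get(0);
PCollection<byte[]> test = splits.get(1);
PCollection<byte[]> validation = splits.get(2);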
 
Example #5
Source File: AutoComplete.java    From beam with Apache License 2.0
@Override
public PCollectionList<KV<String, List<CompletionCandidate>>> expand(
    PCollection<CompletionCandidate> input) {
  if (minPrefix > 10) {
    // Base case, partitioning to return the output in the expected format.
    return input
        .apply(new ComputeTopFlat(candidatesPerPrefix, minPrefix))
        .apply(Partition.of(2, new KeySizePartitionFn()));
  } else {
    // If a candidate is in the top N for prefix a...b, it must also be in the top
    // N for a...bX for every X, which is typically a much smaller set to consider.
    // First, compute the top candidate for prefixes of size at least minPrefix + 1.
    PCollectionList<KV<String, List<CompletionCandidate>>> larger =
        input.apply(new ComputeTopRecursive(candidatesPerPrefix, minPrefix + 1));
    // Consider the top candidates for each prefix of length minPrefix + 1...
    PCollection<KV<String, List<CompletionCandidate>>> small =
        PCollectionList.of(larger.get(1).apply(ParDo.of(new FlattenTops())))
            // ...together with those (previously excluded) candidates of length
            // exactly minPrefix...
            .and(input.apply(Filter.by(c -> c.getValue().length() == minPrefix)))
            .apply("FlattenSmall", Flatten.pCollections())
            // ...set the key to be the minPrefix-length prefix...
            .apply(ParDo.of(new AllPrefixes(minPrefix, minPrefix)))
            // ...and (re)apply the Top operator to all of them together.
            .apply(Top.largestPerKey(candidatesPerPrefix));

    PCollection<KV<String, List<CompletionCandidate>>> flattenLarger =
        larger.apply("FlattenLarge", Flatten.pCollections());

    return PCollectionList.of(flattenLarger).and(small);
  }
}
 
Example #6
Source File: Task.java    From beam with Apache License 2.0
static PCollectionList<Integer> applyTransform(PCollection<Integer> input) {
  return input
      .apply(Partition.of(2,
          (PartitionFn<Integer>) (number, numPartitions) -> {
            if (number > 100) {
              return 0;
            } else {
              return 1;
            }
          }));
}
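A short usage sketch for the task above (the numbers PCollection is an assumption, not part of the original snippet):

PCollectionList<Integer> partitions = applyTransform(numbers);
PCollection<Integer> greaterThan100 = partitions.get(0);  // elements greater than 100
PCollection<Integer> upTo100 = partitions.get(1);         // elements of 100 or less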
 
Example #7
Source File: TestExpansionService.java    From beam with Apache License 2.0
@Override
public PTransform<PCollection<Long>, PCollectionList<Long>> buildExternal(
    EmptyConfiguration configuration) {
  return Partition.of(2, (Long elem, int numP) -> elem % 2 == 0 ? 0 : 1);
}