Java Code Examples for org.apache.beam.sdk.io.UnboundedSource#split()

The following examples show how to use org.apache.beam.sdk.io.UnboundedSource#split(). All of them are drawn from the Apache Beam project; the original source file is noted above each example.
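Before the runner-specific examples, the short sketch below illustrates the basic contract of the method: split(desiredNumSplits, options) returns a list of sub-sources, and the requested count is only a hint. This is a minimal illustration; the helper name splitAndPrint and the choice of split count are assumptions for demonstration, not taken from any of the examples that follow.

import java.util.List;
import org.apache.beam.sdk.io.UnboundedSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class SplitSketch {
  // Hypothetical helper: asks the source for up to desiredNumSplits splits and
  // prints them. Real callers typically pass each split to Read.from(...).
  static <T, MarkT extends UnboundedSource.CheckpointMark> void splitAndPrint(
      UnboundedSource<T, MarkT> source, int desiredNumSplits) throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    // The desired count is a hint; the source decides how many splits it
    // actually returns, so the list may be smaller than desiredNumSplits.
    List<? extends UnboundedSource<T, MarkT>> splits =
        source.split(desiredNumSplits, options);
    for (UnboundedSource<T, MarkT> split : splits) {
      System.out.println("Split: " + split);
    }
  }
}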
Example 1
Source File: UnboundedSourceSystem.java    From beam with Apache License 2.0
/**
 * For better parallelism in Samza, we need to configure a large split number for {@link
 * UnboundedSource} like Kafka. This will most likely make each split contain a single partition,
 * and be assigned to a Samza task. A large split number is safe since the actual split is bounded
 * by the number of source partitions.
 */
private static <T, CheckpointMarkT extends CheckpointMark>
    List<UnboundedSource<T, CheckpointMarkT>> split(
        UnboundedSource<T, CheckpointMarkT> source, SamzaPipelineOptions pipelineOptions)
        throws Exception {
  final int numSplits = pipelineOptions.getMaxSourceParallelism();
  if (numSplits > 1) {
    @SuppressWarnings("unchecked")
    final List<UnboundedSource<T, CheckpointMarkT>> splits =
        (List<UnboundedSource<T, CheckpointMarkT>>) source.split(numSplits, pipelineOptions);
    // Need the empty check here because Samza doesn't handle empty partitions well
    if (!splits.isEmpty()) {
      return splits;
    }
  }
  return Collections.singletonList(source);
}
 
Example 2
Source File: KafkaIOTest.java    From beam with Apache License 2.0
@Test
public void testUnboundedSourceSplits() throws Exception {

  int numElements = 1000;
  int numSplits = 10;

  // Coders must be specified explicitly here due to the way the transform
  // is used in the test.
  UnboundedSource<KafkaRecord<Integer, Long>, ?> initial =
      mkKafkaReadTransform(numElements, null)
          .withKeyDeserializerAndCoder(IntegerDeserializer.class, BigEndianIntegerCoder.of())
          .withValueDeserializerAndCoder(LongDeserializer.class, BigEndianLongCoder.of())
          .makeSource();

  List<? extends UnboundedSource<KafkaRecord<Integer, Long>, ?>> splits =
      initial.split(numSplits, p.getOptions());
  assertEquals("Expected exact splitting", numSplits, splits.size());

  long elementsPerSplit = numElements / numSplits;
  assertEquals("Expected even splits", numElements, elementsPerSplit * numSplits);
  PCollectionList<Long> pcollections = PCollectionList.empty(p);
  for (int i = 0; i < splits.size(); ++i) {
    pcollections =
        pcollections.and(
            p.apply("split" + i, Read.from(splits.get(i)).withMaxNumRecords(elementsPerSplit))
                .apply("Remove Metadata " + i, ParDo.of(new RemoveKafkaMetadata<>()))
                .apply("collection " + i, Values.create()));
  }
  PCollection<Long> input = pcollections.apply(Flatten.pCollections());

  addCountingAsserts(input, numElements);
  p.run();
}
 
Example 3
Source File: UnboundedReadFromBoundedSourceTest.java    From beam with Apache License 2.0
@Test
public void testInvokesSplitWithDefaultNumSplitsTooLarge() throws Exception {
  UnboundedSource<Long, ?> unboundedCountingSource =
      new BoundedToUnboundedSourceAdapter<Long>(CountingSource.upTo(1));
  PipelineOptions options = PipelineOptionsFactory.create();
  // Asking for 100 splits of a one-element source still yields exactly one
  // split, and that split is a new adapter instance, not the original source.
  List<?> splits = unboundedCountingSource.split(100, options);
  assertEquals(1, splits.size());
  assertNotEquals(splits.get(0), unboundedCountingSource);
}
 
Example 4
Source File: UnboundedReadFromBoundedSourceTest.java    From beam with Apache License 2.0
@Test
public void testInvokingSplitProducesAtLeastOneSplit() throws Exception {
  UnboundedSource<Long, ?> unboundedCountingSource =
      new BoundedToUnboundedSourceAdapter<Long>(CountingSource.upTo(0));
  PipelineOptions options = PipelineOptionsFactory.create();
  // Even an empty source must produce at least one split.
  List<?> splits = unboundedCountingSource.split(100, options);
  assertEquals(1, splits.size());
  assertNotEquals(splits.get(0), unboundedCountingSource);
}
 
Example 5
Source File: UnboundedSourceWrapper.java    From beam with Apache License 2.0
@SuppressWarnings("unchecked")
public UnboundedSourceWrapper(
    String stepName,
    PipelineOptions pipelineOptions,
    UnboundedSource<OutputT, CheckpointMarkT> source,
    int parallelism)
    throws Exception {
  this.stepName = stepName;
  this.serializedOptions = new SerializablePipelineOptions(pipelineOptions);
  this.isConvertedBoundedSource =
      source instanceof UnboundedReadFromBoundedSource.BoundedToUnboundedSourceAdapter;

  if (source.requiresDeduping()) {
    LOG.warn("Source {} requires deduping but Flink runner doesn't support this yet.", source);
  }

  Coder<CheckpointMarkT> checkpointMarkCoder = source.getCheckpointMarkCoder();
  if (checkpointMarkCoder == null) {
    LOG.info("No CheckpointMarkCoder specified for this source. Won't create snapshots.");
    checkpointCoder = null;
  } else {

    Coder<? extends UnboundedSource<OutputT, CheckpointMarkT>> sourceCoder =
        (Coder) SerializableCoder.of(new TypeDescriptor<UnboundedSource>() {});

    checkpointCoder = KvCoder.of(sourceCoder, checkpointMarkCoder);
  }

  // Get the splits early. We assume that the generated splits are stable;
  // this is necessary so that the mapping of state to source is correct
  // when restoring.
  splitSources = source.split(parallelism, pipelineOptions);

  FlinkPipelineOptions options = pipelineOptions.as(FlinkPipelineOptions.class);
  idleTimeoutMs = options.getShutdownSourcesAfterIdleMs();
}
 
Example 6
Source File: CustomSources.java    From beam with Apache License 2.0
public static com.google.api.services.dataflow.model.Source serializeToCloudSource(
    Source<?> source, PipelineOptions options) throws Exception {
  com.google.api.services.dataflow.model.Source cloudSource =
      new com.google.api.services.dataflow.model.Source();
  // We ourselves act as the SourceFormat.
  cloudSource.setSpec(CloudObject.forClass(CustomSources.class));
  addString(
      cloudSource.getSpec(), SERIALIZED_SOURCE, encodeBase64String(serializeToByteArray(source)));

  SourceMetadata metadata = new SourceMetadata();
  if (source instanceof BoundedSource) {
    BoundedSource<?> boundedSource = (BoundedSource<?>) source;

    // Size estimation is best effort so we continue even if it fails here.
    try {
      metadata.setEstimatedSizeBytes(boundedSource.getEstimatedSizeBytes(options));
    } catch (Exception e) {
      LOG.warn("Size estimation of the source failed: " + source, e);
    }
  } else if (source instanceof UnboundedSource) {
    UnboundedSource<?, ?> unboundedSource = (UnboundedSource<?, ?>) source;
    metadata.setInfinite(true);
    List<String> encodedSplits = new ArrayList<>();
    int desiredNumSplits =
        getDesiredNumUnboundedSourceSplits(options.as(DataflowPipelineOptions.class));
    for (UnboundedSource<?, ?> split : unboundedSource.split(desiredNumSplits, options)) {
      encodedSplits.add(encodeBase64String(serializeToByteArray(split)));
    }
    checkArgument(!encodedSplits.isEmpty(), "UnboundedSources must have at least one split");
    addStringList(cloudSource.getSpec(), SERIALIZED_SOURCE_SPLITS, encodedSplits);
  } else {
    throw new IllegalArgumentException("Unexpected source kind: " + source.getClass());
  }

  cloudSource.setMetadata(metadata);
  return cloudSource;
}