Java Code Examples for org.apache.beam.sdk.io.BoundedSource#split()

The following examples show how to use org.apache.beam.sdk.io.BoundedSource#split(). You can vote up the examples you like or vote down the ones you don't, and follow the links above each example to go to the original project or source file. Related API usage is listed in the sidebar.

Example 1

Source File: SourceTestUtilsTest.java From beam with Apache License 2.0

6 votes

/**
 * Checks the wrapper returned by {@code SourceTestUtils.toUnsplittableSource}: an initial split
 * returns the wrapper itself as the only element, a {@code splitAtFraction} request during the
 * read returns {@code null}, and the reader still yields the same 100 values as the base source.
 */
@Test
public void testToUnsplittableSource() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  BoundedSource<Long> baseSource = CountingSource.upTo(100);
  BoundedSource<Long> unsplittableSource = SourceTestUtils.toUnsplittableSource(baseSource);

  // Initial splitting must hand back exactly one split, equal to the wrapper itself.
  List<?> splits = unsplittableSource.split(1, options);
  assertEquals(1, splits.size());
  assertEquals(unsplittableSource, splits.get(0));

  // Readers are closeable resources; try-with-resources releases this one even when an
  // assertion below fails part-way through the read.
  try (BoundedReader<Long> unsplittableReader = unsplittableSource.createReader(options)) {
    assertEquals(0, unsplittableReader.getFractionConsumed(), 1e-15);

    Set<Long> expected = Sets.newHashSet(SourceTestUtils.readFromSource(baseSource, options));
    Set<Long> actual = Sets.newHashSet();
    actual.addAll(SourceTestUtils.readNItemsFromUnstartedReader(unsplittableReader, 40));
    // With 40 items consumed, a dynamic split request must come back as null.
    assertNull(unsplittableReader.splitAtFraction(0.5));
    actual.addAll(SourceTestUtils.readRemainingFromReader(unsplittableReader, true /* started */));
    assertEquals(1, unsplittableReader.getFractionConsumed(), 1e-15);

    assertEquals(100, actual.size());
    // Both operands are already sets, so they are compared directly without extra copies.
    assertEquals(expected, actual);
  }
}

Example 2

Source File: BoundedSourceSystem.java From beam with Apache License 2.0

6 votes

/**
 * Splits {@code source} according to the pipeline's maximum source parallelism.
 *
 * <p>When the parallelism is greater than one, the source is asked to split into bundles of
 * {@code ceil(estimatedSize / parallelism)} bytes. The original source is returned as a
 * single-element list when the parallelism is at most one or when the split comes back empty.
 */
private static <T> List<BoundedSource<T>> split(
    BoundedSource<T> source, SamzaPipelineOptions pipelineOptions) throws Exception {
  final int parallelism = pipelineOptions.getMaxSourceParallelism();
  if (parallelism <= 1) {
    return Collections.singletonList(source);
  }

  final long totalBytes = source.getEstimatedSizeBytes(pipelineOptions);
  // Ceiling division: every bundle is large enough that `parallelism` bundles cover the total.
  final long bytesPerSplit = (totalBytes + parallelism - 1) / parallelism;

  @SuppressWarnings("unchecked")
  final List<BoundedSource<T>> pieces =
      (List<BoundedSource<T>>) source.split(bytesPerSplit, pipelineOptions);

  // Samza copes poorly with an empty partition list, so fall back to the unsplit source.
  if (pieces.isEmpty()) {
    return Collections.singletonList(source);
  }
  return pieces;
}

Example 3

Source File: BoundedReadEvaluatorFactory.java From beam with Apache License 2.0

6 votes

/**
 * Splits the bounded source behind {@code transform} and commits one root bundle per split.
 *
 * <p>The bundle size requested from the source is its estimated size divided (integer division)
 * by {@code targetParallelism}. Each resulting split is wrapped in a {@code BoundedSourceShard},
 * placed in the global window, and committed at {@code BoundedWindow.TIMESTAMP_MAX_VALUE}.
 */
@Override
public Collection<CommittedBundle<BoundedSourceShard<T>>> getInitialInputs(
    AppliedPTransform<PBegin, PCollection<T>, PTransform<PBegin, PCollection<T>>> transform,
    int targetParallelism)
    throws Exception {
  BoundedSource<T> rootSource = ReadTranslation.boundedSourceFromTransform(transform);
  long desiredBundleBytes = rootSource.getEstimatedSizeBytes(options) / targetParallelism;
  List<? extends BoundedSource<T>> splitSources = rootSource.split(desiredBundleBytes, options);

  ImmutableList.Builder<CommittedBundle<BoundedSourceShard<T>>> committedShards =
      ImmutableList.builder();
  for (BoundedSource<T> splitSource : splitSources) {
    committedShards.add(
        evaluationContext
            .<BoundedSourceShard<T>>createRootBundle()
            .add(WindowedValue.valueInGlobalWindow(BoundedSourceShard.of(splitSource)))
            .commit(BoundedWindow.TIMESTAMP_MAX_VALUE));
  }
  return committedShards.build();
}

Example 4

Source File: SourceTestUtils.java From beam with Apache License 2.0

5 votes

/**
 * Splits {@code source} with the given desired bundle size and returns everything read from the
 * resulting splits, appended in the order the splits were returned.
 */
public static <T> List<T> readFromSplitsOfSource(
    BoundedSource<T> source, long desiredBundleSizeBytes, PipelineOptions options)
    throws Exception {
  List<T> records = Lists.newArrayList();
  List<? extends BoundedSource<T>> pieces = source.split(desiredBundleSizeBytes, options);
  for (BoundedSource<T> piece : pieces) {
    List<T> pieceRecords = readFromSource(piece, options);
    records.addAll(pieceRecords);
  }
  return records;
}

Example 5

Source File: BigQueryIOReadTest.java From beam with Apache License 2.0

4 votes

/**
 * Loads six rows into a fake table, reads them back through the splits of a table-backed source,
 * and then checks that two successive {@code split()} calls each return two sources while the
 * fake job service records only one extract job.
 */
@Test
public void testBigQueryTableSourceInitSplit() throws Exception {
  List<TableRow> rows =
      ImmutableList.of(
          new TableRow().set("name", "a").set("number", 1L),
          new TableRow().set("name", "b").set("number", 2L),
          new TableRow().set("name", "c").set("number", 3L),
          new TableRow().set("name", "d").set("number", 4L),
          new TableRow().set("name", "e").set("number", 5L),
          new TableRow().set("name", "f").set("number", 6L));

  // Register the dataset and table with the fake service, then populate the table.
  TableReference tableRef = BigQueryHelpers.parseTableSpec("project:data_set.table_name");
  fakeDatasetService.createDataset("project", "data_set", "", "", null);
  TableSchema schema =
      new TableSchema()
          .setFields(
              ImmutableList.of(
                  new TableFieldSchema().setName("name").setType("STRING"),
                  new TableFieldSchema().setName("number").setType("INTEGER")));
  fakeDatasetService.createTable(new Table().setTableReference(tableRef).setSchema(schema));
  fakeDatasetService.insertAll(tableRef, rows, null);

  String stepUuid = "testStepUuid";
  BoundedSource<TableRow> tableSource =
      BigQueryTableSourceDef.create(
              fakeBqServices, ValueProvider.StaticValueProvider.of(tableRef))
          .toSource(stepUuid, TableRowJsonCoder.of(), BigQueryIO.TableRowParser.INSTANCE);

  PipelineOptions options = PipelineOptionsFactory.create();
  options.setTempLocation(testFolder.getRoot().getAbsolutePath());
  BigQueryOptions bigQueryOptions = options.as(BigQueryOptions.class);
  bigQueryOptions.setProject("project");

  // Everything read through the splits must match the inserted rows, in any order.
  List<TableRow> readBack =
      convertStringsToLong(
          SourceTestUtils.readFromSplitsOfSource(tableSource, 0L /* ignored */, options));
  assertThat(readBack, containsInAnyOrder(Iterables.toArray(rows, TableRow.class)));

  final int expectedSplitCount = 2;
  List<? extends BoundedSource<TableRow>> firstSplit = tableSource.split(100, options);
  assertEquals(expectedSplitCount, firstSplit.size());
  // Call split() again, as a Dataflow worker will sometimes do.
  List<? extends BoundedSource<TableRow>> secondSplit = tableSource.split(200, options);
  assertEquals(expectedSplitCount, secondSplit.size());

  // The second split() call must not have triggered another extract job.
  assertEquals(1, fakeJobService.getNumExtractJobCalls());
}

Example 6

Source File: BigQueryIOReadTest.java From beam with Apache License 2.0

4 votes

/**
 * Exercises a query-backed BigQuery source against the fake services: reads the six expected rows
 * back through the source's splits and then checks that {@code split(100, options)} returns two
 * sources. The dry-run expectation registered here reports two referenced tables.
 */
@Test
public void testBigQueryQuerySourceInitSplit() throws Exception {

  PipelineOptions options = PipelineOptionsFactory.create();
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  bqOptions.setProject("project");

  TableReference sourceTableRef = BigQueryHelpers.parseTableSpec("project:dataset.table");

  // Register the dataset and the source table with the fake dataset service; both are created
  // with the location "asia-northeast1".
  fakeDatasetService.createDataset(
      sourceTableRef.getProjectId(),
      sourceTableRef.getDatasetId(),
      "asia-northeast1",
      "Fake plastic tree^H^H^H^Htables",
      null);

  fakeDatasetService.createTable(
      new Table().setTableReference(sourceTableRef).setLocation("asia-northeast1"));

  // Schema-only table describing the shape of the query result (name: STRING, number: INTEGER).
  Table queryResultTable =
      new Table()
          .setSchema(
              new TableSchema()
                  .setFields(
                      ImmutableList.of(
                          new TableFieldSchema().setName("name").setType("STRING"),
                          new TableFieldSchema().setName("number").setType("INTEGER"))));

  List<TableRow> expected =
      ImmutableList.of(
          new TableRow().set("name", "a").set("number", 1L),
          new TableRow().set("name", "b").set("number", 2L),
          new TableRow().set("name", "c").set("number", 3L),
          new TableRow().set("name", "d").set("number", 4L),
          new TableRow().set("name", "e").set("number", 5L),
          new TableRow().set("name", "f").set("number", 6L));

  // The "query" string is the result table and rows encoded by the fake services helper.
  // NOTE(review): presumably the fake job service decodes this to serve the rows - confirm
  // against FakeBigQueryServices.
  String encodedQuery = FakeBigQueryServices.encodeQueryResult(queryResultTable, expected);

  String stepUuid = "testStepUuid";

  // Temp table reference built from the project and a job-id token derived from the job name
  // and step UUID; it is listed below as one of the tables the dry run references.
  TableReference tempTableReference =
      createTempTableReference(
          bqOptions.getProject(),
          createJobIdToken(options.getJobName(), stepUuid),
          Optional.empty());

  // Register the dry-run expectation: 100 bytes processed, referencing the source table and the
  // temp table.
  fakeJobService.expectDryRunQuery(
      bqOptions.getProject(),
      encodedQuery,
      new JobStatistics()
          .setQuery(
              new JobStatistics2()
                  .setTotalBytesProcessed(100L)
                  .setReferencedTables(ImmutableList.of(sourceTableRef, tempTableReference))));

  BoundedSource<TableRow> bqSource =
      BigQueryQuerySourceDef.create(
              fakeBqServices,
              ValueProvider.StaticValueProvider.of(encodedQuery),
              true /* flattenResults */,
              true /* useLegacySql */,
              QueryPriority.BATCH,
              null,
              null,
              null)
          .toSource(stepUuid, TableRowJsonCoder.of(), BigQueryIO.TableRowParser.INSTANCE);

  // The temp location is set only after the source has been built, just before reading.
  options.setTempLocation(testFolder.getRoot().getAbsolutePath());

  // Everything read through the splits must match the expected rows, in any order.
  List<TableRow> read =
      convertStringsToLong(
          SourceTestUtils.readFromSplitsOfSource(bqSource, 0L /* ignored */, options));
  assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));

  List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
  assertEquals(2, sources.size());
}

Example 7

Source File: BigQueryIOReadTest.java From beam with Apache License 2.0

4 votes

/**
 * Covers a query whose SQL text does not itself name any table (for example
 * "SELECT 17 AS value"), so the dry run of the query reports an empty list of referenced tables.
 * The rows must still be readable through the source's splits, and {@code split(100, options)}
 * must return two sources.
 */
@Test
public void testBigQueryQuerySourceInitSplit_NoReferencedTables() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  BigQueryOptions bigQueryOptions = options.as(BigQueryOptions.class);
  bigQueryOptions.setProject("project");

  // Schema-only table describing the shape of the query result.
  TableSchema resultSchema =
      new TableSchema()
          .setFields(
              ImmutableList.of(
                  new TableFieldSchema().setName("name").setType("STRING"),
                  new TableFieldSchema().setName("number").setType("INTEGER")));
  Table resultTable = new Table().setSchema(resultSchema);

  List<TableRow> rows =
      ImmutableList.of(
          new TableRow().set("name", "a").set("number", 1L),
          new TableRow().set("name", "b").set("number", 2L),
          new TableRow().set("name", "c").set("number", 3L),
          new TableRow().set("name", "d").set("number", 4L),
          new TableRow().set("name", "e").set("number", 5L),
          new TableRow().set("name", "f").set("number", 6L));

  String query = FakeBigQueryServices.encodeQueryResult(resultTable, rows);
  String stepUuid = "testStepUuid";

  // Dry-run statistics: 100 bytes processed and no referenced tables.
  JobStatistics dryRunStatistics =
      new JobStatistics()
          .setQuery(
              new JobStatistics2()
                  .setTotalBytesProcessed(100L)
                  .setReferencedTables(ImmutableList.of()));
  fakeJobService.expectDryRunQuery(bigQueryOptions.getProject(), query, dryRunStatistics);

  BoundedSource<TableRow> querySource =
      BigQueryQuerySourceDef.create(
              fakeBqServices,
              ValueProvider.StaticValueProvider.of(query),
              true /* flattenResults */,
              true /* useLegacySql */,
              QueryPriority.BATCH,
              null,
              null,
              null)
          .toSource(stepUuid, TableRowJsonCoder.of(), BigQueryIO.TableRowParser.INSTANCE);

  options.setTempLocation(testFolder.getRoot().getAbsolutePath());

  // Everything read through the splits must match the expected rows, in any order.
  List<TableRow> readBack =
      convertStringsToLong(
          SourceTestUtils.readFromSplitsOfSource(querySource, 0L /* ignored */, options));
  assertThat(readBack, containsInAnyOrder(Iterables.toArray(rows, TableRow.class)));

  List<? extends BoundedSource<TableRow>> splitSources = querySource.split(100, options);
  assertEquals(2, splitSources.size());
}

Example 8

Source File: XmlSourceTest.java From beam with Apache License 2.0

4 votes

/**
 * Writes 100 random trains to an XML file, splits the resulting source into bundles of about a
 * third of the file size, and checks for each split which {@code splitAtFraction} requests are
 * rejected and which succeed consistently.
 */
@Test
public void testSplitAtFraction() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  String fileName = "temp.xml";
  List<Train> trains = generateRandomTrainList(100);
  File file = createRandomTrainXML(fileName, trains);

  BoundedSource<Train> fileSource =
      XmlIO.<Train>read()
          .from(file.toPath().toString())
          .withRootElement("trains")
          .withRecordElement("train")
          .withRecordClass(Train.class)
          .withMinBundleSize(10)
          .createSource();

  // Use the real options object rather than null, matching every other call in this test.
  List<? extends BoundedSource<Train>> splits = fileSource.split(file.length() / 3, options);
  for (BoundedSource<Train> splitSource : splits) {
    // Count the records in this split; the reader is closed as soon as the count is known.
    int numItems;
    try (BoundedSource.BoundedReader<Train> countingReader = splitSource.createReader(options)) {
      numItems = readEverythingFromReader(countingReader).size();
    }

    // Should not split while unstarted.
    assertSplitAtFractionFails(splitSource, 0, 0.7, options);
    assertSplitAtFractionSucceedsAndConsistent(splitSource, 1, 0.7, options);
    assertSplitAtFractionSucceedsAndConsistent(splitSource, 15, 0.7, options);
    assertSplitAtFractionFails(splitSource, 0, 0.0, options);
    assertSplitAtFractionFails(splitSource, 20, 0.3, options);
    assertSplitAtFractionFails(splitSource, numItems, 1.0, options);

    // After reading all numItems elements of the split we will be approximately at position
    // 0.99 * (endOffset - startOffset), hence trying to split at fraction 0.9 will be
    // unsuccessful.
    assertSplitAtFractionFails(splitSource, numItems, 0.9, options);

    // The following passes since we can always find a fraction that is extremely close to 1 such
    // that the position suggested by the fraction will be larger than the position the reader is
    // at after reading "numItems - 1" elements.
    // This also passes for "numItemsToReadBeforeSplit = numItems" if the position at the
    // suggested fraction is larger than the position the reader is at after reading all
    // "numItems" elements (i.e., the start position of the last element). This is true for most
    // cases but will not be true if the reader position is only one less than the end position
    // (i.e., the last element of the bundle starts at the last byte that belongs to the bundle).
    assertSplitAtFractionSucceedsAndConsistent(splitSource, numItems - 1, 0.999, options);
  }
}