Java Code Examples for org.apache.beam.sdk.io.BoundedSource#createReader()

The following examples show how to use org.apache.beam.sdk.io.BoundedSource#createReader(). You can go to the original project or source file by following the links above each example.
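
Before working through the examples, here is a minimal sketch of the canonical createReader() read loop. The CountingSource, element count, and class name are arbitrary choices for illustration; any BoundedSource follows the same pattern: createReader() returns a BoundedReader, start() positions it on the first element, advance() moves it forward, and getCurrent() returns the element at the current position.

import java.util.ArrayList;
import java.util.List;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.CountingSource;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class CreateReaderSketch {
  public static void main(String[] args) throws Exception {
    PipelineOptions options = PipelineOptionsFactory.create();
    BoundedSource<Long> source = CountingSource.upTo(10);
    List<Long> elements = new ArrayList<>();
    // BoundedReader implements AutoCloseable, so try-with-resources closes it.
    try (BoundedSource.BoundedReader<Long> reader = source.createReader(options)) {
      for (boolean more = reader.start(); more; more = reader.advance()) {
        elements.add(reader.getCurrent());
      }
    }
    System.out.println(elements); // prints [0, 1, ..., 9]
  }
}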
Example 1
Source File: SourceTestUtilsTest.java    From beam with Apache License 2.0
@Test
public void testToUnsplittableSource() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  BoundedSource<Long> baseSource = CountingSource.upTo(100);
  BoundedSource<Long> unsplittableSource = SourceTestUtils.toUnsplittableSource(baseSource);
  List<?> splits = unsplittableSource.split(1, options);
  assertEquals(1, splits.size());
  assertEquals(unsplittableSource, splits.get(0));

  BoundedReader<Long> unsplittableReader = unsplittableSource.createReader(options);
  assertEquals(0, unsplittableReader.getFractionConsumed(), 1e-15);

  Set<Long> expected = Sets.newHashSet(SourceTestUtils.readFromSource(baseSource, options));
  Set<Long> actual = Sets.newHashSet();
  actual.addAll(SourceTestUtils.readNItemsFromUnstartedReader(unsplittableReader, 40));
  assertNull(unsplittableReader.splitAtFraction(0.5));
  actual.addAll(SourceTestUtils.readRemainingFromReader(unsplittableReader, true /* started */));
  assertEquals(1, unsplittableReader.getFractionConsumed(), 1e-15);

  assertEquals(100, actual.size());
  assertEquals(expected, actual);
}
 
Example 2
Source File: HadoopFormatIOReadTest.java    From beam with Apache License 2.0
/**
 * This test verifies that the method {@link
 * HadoopInputFormatBoundedSource.HadoopInputFormatReader#getCurrentSource() getCurrentSource()}
 * returns the correct source object.
 */
@Test
public void testGetCurrentSourceFunction() throws Exception {
  SerializableSplit split = new SerializableSplit();
  BoundedSource<KV<Text, Employee>> source =
      new HadoopInputFormatBoundedSource<>(
          serConf,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class),
          null, // No key translation required.
          null, // No value translation required.
          split);
  BoundedReader<KV<Text, Employee>> hifReader = source.createReader(p.getOptions());
  BoundedSource<KV<Text, Employee>> hifSource = hifReader.getCurrentSource();
  assertEquals(hifSource, source);
}
 
Example 3
Source File: BoundedSourceP.java    From beam with Apache License 2.0
/**
 * Called when currentReader is null or drained. On return, currentReader holds a started reader
 * for the next shard, or null if all shards are exhausted.
 */
private void nextShard() throws IOException {
  for (;;) {
    if (currentReader != null) {
      currentReader.close();
      currentReader = null;
    }
    BoundedSource<T> shard = shardsTraverser.next();
    if (shard == null) {
      break; // all shards done
    }
    currentReader = shard.createReader(options);
    if (currentReader.start()) {
      break;
    }
  }
}
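
The same drain-and-advance pattern can be sketched as a small self-contained helper outside the BoundedSourceP class. This is illustrative only: the ShardDrainer name, the Iterator-based shard queue, and the Consumer downstream are assumptions, not part of the Beam example above.

import java.io.IOException;
import java.util.Iterator;
import java.util.function.Consumer;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.options.PipelineOptions;

class ShardDrainer<T> {
  private final Iterator<? extends BoundedSource<T>> shards;
  private final PipelineOptions options;
  private BoundedSource.BoundedReader<T> currentReader; // null once all shards are drained

  ShardDrainer(Iterator<? extends BoundedSource<T>> shards, PipelineOptions options) {
    this.shards = shards;
    this.options = options;
  }

  /** Emits every element of every shard to {@code downstream}. */
  void drainTo(Consumer<T> downstream) throws IOException {
    nextShard();
    while (currentReader != null) {
      downstream.accept(currentReader.getCurrent());
      if (!currentReader.advance()) {
        nextShard(); // current shard drained; move to the next non-empty one
      }
    }
  }

  /** Same loop as the Beam example: close the old reader, skip empty shards. */
  private void nextShard() throws IOException {
    for (;;) {
      if (currentReader != null) {
        currentReader.close();
        currentReader = null;
      }
      if (!shards.hasNext()) {
        break; // all shards done
      }
      currentReader = shards.next().createReader(options);
      if (currentReader.start()) {
        break;
      }
    }
  }
}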
 
Example 4
Source File: SourceTestUtils.java    From beam with Apache License 2.0
/** Reads all elements from the given {@link BoundedSource}. */
public static <T> List<T> readFromSource(BoundedSource<T> source, PipelineOptions options)
    throws IOException {
  try (BoundedSource.BoundedReader<T> reader = source.createReader(options)) {
    return readFromUnstartedReader(reader);
  }
}
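
A quick usage sketch of this helper; the CountingSource is an arbitrary illustrative choice:

PipelineOptions options = PipelineOptionsFactory.create();
List<Long> allElements = SourceTestUtils.readFromSource(CountingSource.upTo(5), options);
// allElements is [0, 1, 2, 3, 4]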
 
Example 5
Source File: XmlSourceTest.java    From beam with Apache License 2.0
@Test
public void testReadXMLInvalidRecordClassWithCustomEventHandler() throws IOException {
  File file = tempFolder.newFile("trainXMLSmall");
  Files.write(file.toPath(), trainXML.getBytes(StandardCharsets.UTF_8));

  ValidationEventHandler validationEventHandler =
      event -> {
        throw new RuntimeException("MyCustomValidationEventHandler failure mesage");
      };

  BoundedSource<WrongTrainType> source =
      XmlIO.<WrongTrainType>read()
          .from(file.toPath().toString())
          .withRootElement("trains")
          .withRecordElement("train")
          .withRecordClass(WrongTrainType.class)
          .withValidationEventHandler(validationEventHandler)
          .createSource();

  exception.expect(RuntimeException.class);

  // JAXB internationalizes the error message. So this is all we can match for.
  exception.expectMessage("MyCustomValidationEventHandler failure message");
  try (Reader<WrongTrainType> reader = source.createReader(null)) {

    List<WrongTrainType> results = new ArrayList<>();
    for (boolean available = reader.start(); available; available = reader.advance()) {
      WrongTrainType train = reader.getCurrent();
      results.add(train);
    }
  }
}
 
Example 6
Source File: DatasetSourceBatch.java    From beam with Apache License 2.0
DatasetPartitionReader(
    BoundedSource<T> source, SerializablePipelineOptions serializablePipelineOptions) {
  this.started = false;
  this.closed = false;
  this.source = source;
  // reader is not serializable so lazy initialize it
  try {
    reader = source.createReader(serializablePipelineOptions.get().as(PipelineOptions.class));
  } catch (IOException e) {
    throw new RuntimeException("Error creating BoundedReader ", e);
  }
}
 
Example 7
Source File: BoundedReadEvaluatorFactory.java    From beam with Apache License 2.0
@Override
public void processElement(WindowedValue<BoundedSourceShard<OutputT>> element)
    throws Exception {
  BoundedSource<OutputT> source = element.getValue().getSource();
  try (final BoundedReader<OutputT> reader = source.createReader(options)) {
    boolean contentsRemaining = reader.start();
    Future<BoundedSource<OutputT>> residualFuture = startDynamicSplitThread(source, reader);
    UncommittedBundle<OutputT> output = evaluationContext.createBundle(outputPCollection);
    while (contentsRemaining) {
      output.add(
          WindowedValue.timestampedValueInGlobalWindow(
              reader.getCurrent(), reader.getCurrentTimestamp()));
      contentsRemaining = reader.advance();
    }
    resultBuilder.addOutput(output);
    try {
      BoundedSource<OutputT> residual = residualFuture.get();
      if (residual != null) {
        resultBuilder.addUnprocessedElements(
            element.withValue(BoundedSourceShard.of(residual)));
      }
    } catch (ExecutionException exex) {
      // Unwrap the ExecutionException and rewrap the underlying split failure.
      throw UserCodeException.wrap(exex.getCause());
    }
  }
}
 
Example 8
Source File: SourceTestUtils.java    From beam with Apache License 2.0
private static <T> SourceTestUtils.SplitAtFractionResult assertSplitAtFractionBehaviorImpl(
    BoundedSource<T> source,
    List<T> expectedItems,
    int numItemsToReadBeforeSplit,
    double splitFraction,
    ExpectedSplitOutcome expectedOutcome,
    PipelineOptions options)
    throws Exception {
  try (BoundedSource.BoundedReader<T> reader = source.createReader(options)) {
    BoundedSource<T> originalSource = reader.getCurrentSource();
    List<T> currentItems = readNItemsFromUnstartedReader(reader, numItemsToReadBeforeSplit);
    BoundedSource<T> residual = reader.splitAtFraction(splitFraction);
    if (residual != null) {
      assertFalse(
          String.format(
              "Primary source didn't change after a successful split of %s at %f "
                  + "after reading %d items. "
                  + "Was the source object mutated instead of creating a new one? "
                  + "Source objects MUST be immutable.",
              source, splitFraction, numItemsToReadBeforeSplit),
          reader.getCurrentSource() == originalSource);
      assertFalse(
          String.format(
              "Residual source equal to original source after a successful split of %s at %f "
                  + "after reading %d items. "
                  + "Was the source object mutated instead of creating a new one? "
                  + "Source objects MUST be immutable.",
              source, splitFraction, numItemsToReadBeforeSplit),
          reader.getCurrentSource() == residual);
    }
    // Failure cases are: must succeed but fails; must fail but succeeds.
    switch (expectedOutcome) {
      case MUST_SUCCEED_AND_BE_CONSISTENT:
        assertNotNull(
            "Failed to split reader of source: "
                + source
                + " at "
                + splitFraction
                + " after reading "
                + numItemsToReadBeforeSplit
                + " items",
            residual);
        break;
      case MUST_FAIL:
        assertEquals(null, residual);
        break;
      case MUST_BE_CONSISTENT_IF_SUCCEEDS:
        // Nothing.
        break;
    }
    currentItems.addAll(readRemainingFromReader(reader, numItemsToReadBeforeSplit > 0));
    BoundedSource<T> primary = reader.getCurrentSource();
    return verifySingleSplitAtFractionResult(
        source,
        expectedItems,
        currentItems,
        primary,
        residual,
        numItemsToReadBeforeSplit,
        splitFraction,
        options);
  }
}
 
Example 9
Source File: SourceTestUtils.java    From beam with Apache License 2.0
private static <T> boolean assertSplitAtFractionConcurrent(
    ExecutorService executor,
    BoundedSource<T> source,
    List<T> expectedItems,
    final int numItemsToReadBeforeSplitting,
    final double fraction,
    PipelineOptions options)
    throws Exception {
  @SuppressWarnings("resource") // Closed in readerThread
  final BoundedSource.BoundedReader<T> reader = source.createReader(options);
  final CountDownLatch unblockSplitter = new CountDownLatch(1);
  Future<List<T>> readerThread =
      executor.submit(
          () -> {
            try {
              List<T> items =
                  readNItemsFromUnstartedReader(reader, numItemsToReadBeforeSplitting);
              unblockSplitter.countDown();
              items.addAll(readRemainingFromReader(reader, numItemsToReadBeforeSplitting > 0));
              return items;
            } finally {
              reader.close();
            }
          });
  Future<KV<BoundedSource<T>, BoundedSource<T>>> splitterThread =
      executor.submit(
          () -> {
            unblockSplitter.await();
            BoundedSource<T> residual = reader.splitAtFraction(fraction);
            if (residual == null) {
              return null;
            }
            return KV.of(reader.getCurrentSource(), residual);
          });
  List<T> currentItems = readerThread.get();
  KV<BoundedSource<T>, BoundedSource<T>> splitSources = splitterThread.get();
  if (splitSources == null) {
    return false;
  }
  SplitAtFractionResult res =
      verifySingleSplitAtFractionResult(
          source,
          expectedItems,
          currentItems,
          splitSources.getKey(),
          splitSources.getValue(),
          numItemsToReadBeforeSplitting,
          fraction,
          options);
  return (res.numResidualItems > 0);
}
 
Example 10
Source File: BigQueryIOStorageReadTest.java    From beam with Apache License 2.0
@Test
public void testStreamSourceSplitAtFractionSucceeds() throws Exception {
  Stream parentStream = Stream.newBuilder().setName("parent").build();

  List<ReadRowsResponse> parentResponses =
      Lists.newArrayList(
          createResponse(
              AVRO_SCHEMA,
              Lists.newArrayList(
                  createRecord("A", 1, AVRO_SCHEMA), createRecord("B", 2, AVRO_SCHEMA)),
              0.25),
          createResponse(
              AVRO_SCHEMA, Lists.newArrayList(createRecord("C", 3, AVRO_SCHEMA)), 0.50),
          createResponse(
              AVRO_SCHEMA,
              Lists.newArrayList(
                  createRecord("D", 4, AVRO_SCHEMA), createRecord("E", 5, AVRO_SCHEMA)),
              0.75));

  StorageClient fakeStorageClient = mock(StorageClient.class);
  when(fakeStorageClient.readRows(
          ReadRowsRequest.newBuilder()
              .setReadPosition(StreamPosition.newBuilder().setStream(parentStream))
              .build()))
      .thenReturn(new FakeBigQueryServerStream<>(parentResponses));

  // Mocks the split call.
  when(fakeStorageClient.splitReadStream(
          SplitReadStreamRequest.newBuilder()
              .setOriginalStream(parentStream)
              .setFraction(0.5f)
              .build()))
      .thenReturn(
          SplitReadStreamResponse.newBuilder()
              .setPrimaryStream(Stream.newBuilder().setName("primary"))
              .setRemainderStream(Stream.newBuilder().setName("residual"))
              .build());

  // Mocks the ReadRows calls expected on the primary and residual streams.
  when(fakeStorageClient.readRows(
          ReadRowsRequest.newBuilder()
              .setReadPosition(
                  StreamPosition.newBuilder()
                      .setStream(Stream.newBuilder().setName("primary"))
                      // This test will read rows 0 and 1 from the parent before calling split,
                      // so we expect the primary read to start at offset 2.
                      .setOffset(2))
              .build()))
      .thenReturn(new FakeBigQueryServerStream<>(parentResponses.subList(1, 2)));
  when(fakeStorageClient.readRows(
          ReadRowsRequest.newBuilder()
              .setReadPosition(
                  StreamPosition.newBuilder()
                      .setStream(Stream.newBuilder().setName("residual"))
                      .setOffset(0))
              .build()))
      .thenReturn(
          new FakeBigQueryServerStream<>(parentResponses.subList(2, parentResponses.size())));

  BigQueryStorageStreamSource<TableRow> streamSource =
      BigQueryStorageStreamSource.create(
          ReadSession.newBuilder()
              .setName("readSession")
              .setAvroSchema(AvroSchema.newBuilder().setSchema(AVRO_SCHEMA_STRING))
              .build(),
          parentStream,
          TABLE_SCHEMA,
          new TableRowParser(),
          TableRowJsonCoder.of(),
          new FakeBigQueryServices().withStorageClient(fakeStorageClient));

  // Read a few records from the parent stream and ensure that records are returned in the
  // prescribed order.
  BoundedReader<TableRow> parent = streamSource.createReader(options);
  assertTrue(parent.start());
  assertEquals("A", parent.getCurrent().get("name"));
  assertTrue(parent.advance());
  assertEquals("B", parent.getCurrent().get("name"));

  // Now split the stream, and ensure that the "parent" reader has been replaced with the
  // primary stream and that the returned source points to the residual stream.
  BoundedReader<TableRow> primary = parent;
  BoundedSource<TableRow> residualSource = parent.splitAtFraction(0.5);
  assertNotNull(residualSource);
  BoundedReader<TableRow> residual = residualSource.createReader(options);

  assertTrue(primary.advance());
  assertEquals("C", primary.getCurrent().get("name"));
  assertFalse(primary.advance());

  assertTrue(residual.start());
  assertEquals("D", residual.getCurrent().get("name"));
  assertTrue(residual.advance());
  assertEquals("E", residual.getCurrent().get("name"));
  assertFalse(residual.advance());
}
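
The contract this test exercises can be sketched in miniature; the CountingSource below is an illustrative stand-in for the BigQuery stream source:

BoundedSource<Long> source = CountingSource.upTo(100);
try (BoundedSource.BoundedReader<Long> primary = source.createReader(options)) {
  primary.start();
  // splitAtFraction() may return null if the reader cannot split at this point.
  BoundedSource<Long> residual = primary.splitAtFraction(0.5);
  if (residual != null) {
    // The original reader now covers only the primary range. Reading it to
    // completion, plus reading residual.createReader(options) to completion,
    // yields every element exactly once.
  }
}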
 
Example 11
Source File: HadoopFormatIOReadTest.java    From beam with Apache License 2.0
/**
 * This test validates the method getFractionConsumed(), which indicates the progress of the
 * read in the range 0 to 1.
 */
@Test
public void testReadersGetFractionConsumed() throws Exception {
  List<KV<Text, Employee>> referenceRecords = TestEmployeeDataSet.getEmployeeData();
  HadoopInputFormatBoundedSource<Text, Employee> hifSource =
      getTestHIFSource(
          EmployeeInputFormat.class,
          Text.class,
          Employee.class,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class));
  long estimatedSize = hifSource.getEstimatedSizeBytes(p.getOptions());
  // Validate that the estimated size equals the number of records.
  assertEquals(referenceRecords.size(), estimatedSize);
  List<BoundedSource<KV<Text, Employee>>> boundedSourceList = hifSource.split(0, p.getOptions());
  // Validate that split() produced the expected number of splits.
  assertEquals(TestEmployeeDataSet.NUMBER_OF_SPLITS, boundedSourceList.size());
  List<KV<Text, Employee>> bundleRecords = new ArrayList<>();
  for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) {
    List<KV<Text, Employee>> elements = new ArrayList<>();
    BoundedReader<KV<Text, Employee>> reader = source.createReader(p.getOptions());
    float recordsRead = 0;
    // When start is not called, getFractionConsumed() should return 0.
    assertEquals(Double.valueOf(0), reader.getFractionConsumed());
    boolean start = reader.start();
    assertTrue(start);
    if (start) {
      elements.add(reader.getCurrent());
      boolean advance = reader.advance();
      // Validate that getFractionConsumed() returns the correct fraction based on
      // the number of records read in the split.
      assertEquals(
          Double.valueOf(++recordsRead / TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
          reader.getFractionConsumed());
      assertTrue(advance);
      while (advance) {
        elements.add(reader.getCurrent());
        advance = reader.advance();
        assertEquals(
            Double.valueOf(++recordsRead / TestEmployeeDataSet.NUMBER_OF_RECORDS_IN_EACH_SPLIT),
            reader.getFractionConsumed());
      }
      bundleRecords.addAll(elements);
    }
    // Validate that getFractionConsumed() returns 1 after reading is complete.
    assertEquals(Double.valueOf(1), reader.getFractionConsumed());
    reader.close();
  }
  assertThat(bundleRecords, containsInAnyOrder(referenceRecords.toArray()));
}