org.apache.beam.sdk.io.BoundedSource Java Examples
The following examples show how to use
org.apache.beam.sdk.io.BoundedSource.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: HadoopFormatIOReadTest.java From beam with Apache License 2.0 | 6 votes |
/** * Test reading if InputFormat implements {@link org.apache.hadoop.conf.Configurable * Configurable}. */ @Test public void testReadingWithConfigurableInputFormat() throws Exception { List<BoundedSource<KV<Text, Employee>>> boundedSourceList = getBoundedSourceList( ConfigurableEmployeeInputFormat.class, Text.class, Employee.class, WritableCoder.of(Text.class), AvroCoder.of(Employee.class)); for (BoundedSource<KV<Text, Employee>> source : boundedSourceList) { // Cast to HadoopInputFormatBoundedSource to access getInputFormat(). HadoopInputFormatBoundedSource<Text, Employee> hifSource = (HadoopInputFormatBoundedSource<Text, Employee>) source; hifSource.createInputFormatInstance(); ConfigurableEmployeeInputFormat inputFormatObj = (ConfigurableEmployeeInputFormat) hifSource.getInputFormat(); assertTrue(inputFormatObj.isConfSet); } }
Example #2
Source File: AvroTableFileAsMutations.java From DataflowTemplates with Apache License 2.0 | 6 votes |
/**
 * Reads the byte range described by the incoming {@code FileShard} from its Avro file and emits
 * one {@code Mutation} per record.
 *
 * <p>The table definition is looked up in the DDL side input by the shard's table name and used
 * to build the record-to-mutation converter. The reader is opened in a try-with-resources
 * statement so that it is closed even when reading fails part-way through.
 *
 * @param c the process context supplying the shard element, the DDL side input, the pipeline
 *     options and the output receiver
 * @throws RuntimeException wrapping any {@link IOException} raised while reading or closing
 */
@ProcessElement
public void processElement(ProcessContext c) {
  FileShard f = c.element();
  Ddl ddl = c.sideInput(ddlView);
  Table table = ddl.table(f.getTableName());
  SerializableFunction<GenericRecord, Mutation> parseFn = new AvroRecordConverter(table);
  AvroSource<Mutation> source =
      AvroSource.from(f.getFile().getMetadata().resourceId().toString())
          .withParseFn(parseFn, SerializableCoder.of(Mutation.class));
  // The reader holds an open file handle; try-with-resources releases it on every exit path.
  try (BoundedSource.BoundedReader<Mutation> reader =
      source
          .createForSubrangeOfFile(
              f.getFile().getMetadata(), f.getRange().getFrom(), f.getRange().getTo())
          .createReader(c.getPipelineOptions())) {
    for (boolean more = reader.start(); more; more = reader.advance()) {
      c.output(reader.getCurrent());
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
Example #3
Source File: TextSourceTest.java From DataflowTemplates with Apache License 2.0 | 6 votes |
@ProcessElement public void processElement(ProcessContext c) { ReadableFile file = c.element(); // Create a TextSource, passing null as the delimiter to use the default // delimiters ('\n', '\r', or '\r\n'). TextSource textSource = new TextSource(file.getMetadata(), 0, file.getMetadata().sizeBytes(), null); String line; try { BoundedSource.BoundedReader<String> reader = textSource .createForSubrangeOfFile(file.getMetadata(), 0, file.getMetadata().sizeBytes()) .createReader(c.getPipelineOptions()); for (boolean more = reader.start(); more; more = reader.advance()) { c.output(reader.getCurrent()); } } catch (IOException e) { throw new RuntimeException( "Unable to readFile: " + file.getMetadata().resourceId().toString()); } }
Example #4
Source File: SourceTestUtils.java From beam with Apache License 2.0 | 6 votes |
/**
 * Checks the outcome of calling {@code splitAtFraction(fraction)} on the {@code source}'s reader
 * once {@code numItemsToReadBeforeSplit} items have been read: the split must either fail, or
 * succeed consistently as defined by {@link #assertSplitAtFractionSucceedsAndConsistent}.
 *
 * <p>Returns SplitAtFractionResult.
 */
public static <T> SplitAtFractionResult assertSplitAtFractionBehavior(
    BoundedSource<T> source,
    int numItemsToReadBeforeSplit,
    double splitFraction,
    ExpectedSplitOutcome expectedOutcome,
    PipelineOptions options)
    throws Exception {
  // Read the whole source up front; the implementation takes these items as its second argument.
  List<T> allItems = readFromSource(source, options);
  return assertSplitAtFractionBehaviorImpl(
      source, allItems, numItemsToReadBeforeSplit, splitFraction, expectedOutcome, options);
}
Example #5
Source File: UnboundedReadFromBoundedSourceTest.java From beam with Apache License 2.0 | 6 votes |
private <T> void testBoundedToUnboundedSourceAdapterCheckpoint( BoundedSource<T> boundedSource, List<T> expectedElements) throws Exception { BoundedToUnboundedSourceAdapter<T> unboundedSource = new BoundedToUnboundedSourceAdapter<>(boundedSource); PipelineOptions options = PipelineOptionsFactory.create(); BoundedToUnboundedSourceAdapter<T>.Reader reader = unboundedSource.createReader(options, null); List<T> actual = Lists.newArrayList(); for (boolean hasNext = reader.start(); hasNext; hasNext = reader.advance()) { actual.add(reader.getCurrent()); // checkpoint every 9 elements if (actual.size() % 9 == 0) { Checkpoint<T> checkpoint = reader.getCheckpointMark(); checkpoint.finalizeCheckpoint(); } } Checkpoint<T> checkpointDone = reader.getCheckpointMark(); assertTrue( checkpointDone.getResidualElements() == null || checkpointDone.getResidualElements().isEmpty()); assertEquals(expectedElements.size(), actual.size()); assertEquals(Sets.newHashSet(expectedElements), Sets.newHashSet(actual)); }
Example #6
Source File: SourceTestUtilsTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Wraps a 100-element counting source with {@code SourceTestUtils.toUnsplittableSource} and
 * checks that it splits into exactly itself, refuses {@code splitAtFraction}, and still yields
 * the same elements as the base source.
 */
@Test
public void testToUnsplittableSource() throws Exception {
  PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
  BoundedSource<Long> countingSource = CountingSource.upTo(100);
  BoundedSource<Long> wrapped = SourceTestUtils.toUnsplittableSource(countingSource);

  // Splitting must hand back the wrapped source itself as the only bundle.
  List<?> bundles = wrapped.split(1, pipelineOptions);
  assertEquals(1, bundles.size());
  assertEquals(wrapped, bundles.get(0));

  BoundedReader<Long> wrappedReader = wrapped.createReader(pipelineOptions);
  assertEquals(0, wrappedReader.getFractionConsumed(), 1e-15);

  Set<Long> expected =
      Sets.newHashSet(SourceTestUtils.readFromSource(countingSource, pipelineOptions));

  // Read the first 40 items, attempt a split that must be rejected, then drain the rest.
  Set<Long> actual =
      Sets.newHashSet(SourceTestUtils.readNItemsFromUnstartedReader(wrappedReader, 40));
  assertNull(wrappedReader.splitAtFraction(0.5));
  actual.addAll(SourceTestUtils.readRemainingFromReader(wrappedReader, true /* started */));

  assertEquals(1, wrappedReader.getFractionConsumed(), 1e-15);
  assertEquals(100, actual.size());
  assertEquals(Sets.newHashSet(expected), Sets.newHashSet(actual));
}
Example #7
Source File: BoundedSourceRunnerTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Runs the read loop over two counting sources in turn and checks that both registered consumers
 * receive the elements of the first source followed by those of the second.
 */
@Test
public void testRunReadLoopWithMultipleSources() throws Exception {
  List<WindowedValue<Long>> firstReceived = new ArrayList<>();
  List<WindowedValue<Long>> secondReceived = new ArrayList<>();
  Collection<FnDataReceiver<WindowedValue<Long>>> receivers =
      ImmutableList.of(firstReceived::add, secondReceived::add);

  BoundedSourceRunner<BoundedSource<Long>, Long> sourceRunner =
      new BoundedSourceRunner<>(
          PipelineOptionsFactory.create(),
          RunnerApi.FunctionSpec.getDefaultInstance(),
          receivers);

  // upTo(2) contributes 0 and 1; upTo(1) then contributes another 0.
  sourceRunner.runReadLoop(valueInGlobalWindow(CountingSource.upTo(2)));
  sourceRunner.runReadLoop(valueInGlobalWindow(CountingSource.upTo(1)));

  assertThat(
      firstReceived,
      contains(valueInGlobalWindow(0L), valueInGlobalWindow(1L), valueInGlobalWindow(0L)));
  assertThat(
      secondReceived,
      contains(valueInGlobalWindow(0L), valueInGlobalWindow(1L), valueInGlobalWindow(0L)));
}
Example #8
Source File: HadoopFormatIOReadTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Validates that the records emitted into the PCollection are immutable when the InputFormat's
 * {@link org.apache.hadoop.mapreduce.RecordReader RecordReader} returns a different object (i.e.
 * a different location in memory) for each record.
 *
 * <p>All splits are read and the combined records are compared, in any order, against the
 * reference employee data set.
 */
@Test
public void testImmutablityOfOutputOfReadIfRecordReaderObjectsAreImmutable() throws Exception {
  List<BoundedSource<KV<Text, Employee>>> splits =
      getBoundedSourceList(
          EmployeeInputFormat.class,
          Text.class,
          Employee.class,
          WritableCoder.of(Text.class),
          AvroCoder.of(Employee.class));

  List<KV<Text, Employee>> recordsRead = new ArrayList<>();
  for (BoundedSource<KV<Text, Employee>> split : splits) {
    recordsRead.addAll(SourceTestUtils.readFromSource(split, p.getOptions()));
  }

  List<KV<Text, Employee>> expectedRecords = TestEmployeeDataSet.getEmployeeData();
  assertThat(recordsRead, containsInAnyOrder(expectedRecords.toArray()));
}
Example #9
Source File: XmlSourceTest.java From beam with Apache License 2.0 | 6 votes |
/**
 * Writes the tiny XML fixture to a temporary file, reads it back through an {@code XmlIO} source
 * with a minimum bundle size of 1024, and checks that the three expected trains come out in any
 * order.
 */
@Test
public void testReadXMLTiny() throws IOException {
  File xmlFile = tempFolder.newFile("trainXMLTiny");
  byte[] xmlBytes = tinyXML.getBytes(StandardCharsets.UTF_8);
  Files.write(xmlFile.toPath(), xmlBytes);
  String xmlLocation = xmlFile.toPath().toString();

  BoundedSource<Train> trainSource =
      XmlIO.<Train>read()
          .from(xmlLocation)
          .withRootElement("trains")
          .withRecordElement("train")
          .withRecordClass(Train.class)
          .withMinBundleSize(1024)
          .createSource();

  List<Train> expectedTrains =
      ImmutableList.of(
          new Train("Thomas", Train.TRAIN_NUMBER_UNDEFINED, null, null),
          new Train("Henry", Train.TRAIN_NUMBER_UNDEFINED, null, null),
          new Train("James", Train.TRAIN_NUMBER_UNDEFINED, null, null));

  assertThat(
      trainsToStrings(expectedTrains),
      containsInAnyOrder(
          trainsToStrings(readEverythingFromReader(trainSource.createReader(null))).toArray()));
}
Example #10
Source File: FileSourceBase.java From components with Apache License 2.0 | 6 votes |
@Override protected List<? extends BoundedSource<KV<K, V>>> doAsSplitIntoBundles(long desiredBundleSizeBytes, PipelineOptions options) throws Exception { // Re-implementation of the base class method to use the factory methods. long splitSize = limit >= 0 ? Math.max(desiredBundleSizeBytes, 10 * 1024 * 1024) : desiredBundleSizeBytes; if (serializableSplit == null) { return Lists.transform(computeSplits(splitSize), new Function<InputSplit, BoundedSource<KV<K, V>>>() { @Override public BoundedSource<KV<K, V>> apply(@Nullable InputSplit inputSplit) { return createSourceForSplit(new SerializableSplit(inputSplit)); } }); } else { return ImmutableList.of(this); } }
Example #11
Source File: UnboundedReadFromBoundedSource.java From beam with Apache License 2.0 | 5 votes |
/**
 * Creates a reader over the given residual elements and residual source.
 *
 * @param residualElementsList residual elements passed on to {@code init}; annotated nullable
 * @param residualSource residual source passed on to {@code init}; annotated nullable
 * @param options pipeline options; checked with {@code checkNotNull} before being stored
 */
Reader(
    @Nullable List<TimestampedValue<T>> residualElementsList,
    @Nullable BoundedSource<T> residualSource,
    PipelineOptions options) {
  // NOTE(review): init receives options before the null check below runs — confirm that init
  // tolerates a null value, or whether the check should come first.
  init(residualElementsList, residualSource, options);
  this.options = checkNotNull(options, "options");
  // A freshly constructed reader starts with done cleared.
  this.done = false;
}
Example #12
Source File: TCompBoundedSourceAdapter.java From components with Apache License 2.0 | 5 votes |
/**
 * Splits the wrapped Talend source into bundles and wraps each resulting Talend source in a
 * {@code TCompBoundedSourceAdapter}.
 *
 * <p>The {@code options} argument is not forwarded: {@code null} is passed as the second argument
 * of the wrapped source's {@code splitIntoBundles}.
 *
 * @param desiredBundleSizeBytes bundle size hint forwarded to the wrapped source
 * @param options pipeline options (unused by this method)
 * @return one adapter per source returned by the wrapped source, in the same order
 * @throws Exception if the wrapped source fails to split
 */
@Override
public List<? extends BoundedSource<IndexedRecord>> split(
    long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
  List<? extends org.talend.components.api.component.runtime.BoundedSource> boundedSources =
      tCompSource.splitIntoBundles(desiredBundleSizeBytes, null);
  // Parameterized (not raw) list, presized because the final element count is already known.
  List<TCompBoundedSourceAdapter> sources = new ArrayList<>(boundedSources.size());
  for (org.talend.components.api.component.runtime.BoundedSource boundedSource : boundedSources) {
    sources.add(new TCompBoundedSourceAdapter(boundedSource));
  }
  return sources;
}
Example #13
Source File: CreateTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Splits a {@code CreateSource} of five {@code null} values with a desired bundle size of 3 and
 * checks that the splits together are equal to the unsplit reference source.
 */
@Test
public void testSourceSplitVoid() throws Exception {
  PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
  CreateSource<Void> voidSource =
      CreateSource.fromIterable(
          Lists.newArrayList(null, null, null, null, null), VoidCoder.of());

  List<? extends BoundedSource<Void>> bundles = voidSource.split(3, pipelineOptions);

  SourceTestUtils.assertSourcesEqualReferenceSource(voidSource, bundles, pipelineOptions);
}
Example #14
Source File: CreateTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Splits a {@code CreateSource} of the integers 1 to 8 with a desired bundle size of 12 and
 * checks that three splits result, together equal to the unsplit reference source.
 */
@Test
public void testSourceSplit() throws Exception {
  PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
  CreateSource<Integer> intSource =
      CreateSource.fromIterable(
          ImmutableList.of(1, 2, 3, 4, 5, 6, 7, 8), BigEndianIntegerCoder.of());

  List<? extends BoundedSource<Integer>> bundles = intSource.split(12, pipelineOptions);

  assertThat(bundles, hasSize(3));
  SourceTestUtils.assertSourcesEqualReferenceSource(intSource, bundles, pipelineOptions);
}
Example #15
Source File: ReadSourceTranslatorBatch.java From beam with Apache License 2.0 | 5 votes |
/**
 * Translates a bounded read: wraps the transform's source in a {@code Twister2BoundedSource},
 * creates a source TSet from it with the parallelism taken from the context options, and
 * registers that TSet as the data set of the transform's output.
 */
@Override
public void translateNode(Read.Bounded<T> transform, Twister2BatchTranslationContext context) {
  BoundedSource<T> beamSource = transform.getSource();
  Twister2BoundedSource<T> wrappedSource =
      new Twister2BoundedSource<T>(beamSource, context, context.getOptions());

  // The context's environment is used as a batch environment to create the source TSet.
  final BatchTSetEnvironment batchEnv = (BatchTSetEnvironment) context.getEnvironment();
  SourceTSet<WindowedValue<T>> sourceData =
      batchEnv.createSource(wrappedSource, context.getOptions().getParallelism());

  PCollection<T> outputCollection = context.getOutput(transform);
  context.setOutputDataSet(outputCollection, sourceData);
}
Example #16
Source File: SourceTestUtils.java From beam with Apache License 2.0 | 5 votes |
/**
 * Asserts that {@code splitAtFraction(fraction)} on the {@code source}'s reader fails once
 * {@code numItemsToReadBeforeSplit} items have been read.
 *
 * <p>Delegates to {@code assertSplitAtFractionBehavior} with {@code
 * ExpectedSplitOutcome.MUST_FAIL}.
 */
public static <T> void assertSplitAtFractionFails(
    BoundedSource<T> source,
    int numItemsToReadBeforeSplit,
    double splitFraction,
    PipelineOptions options)
    throws Exception {
  final ExpectedSplitOutcome requiredOutcome = ExpectedSplitOutcome.MUST_FAIL;
  assertSplitAtFractionBehavior(
      source, numItemsToReadBeforeSplit, splitFraction, requiredOutcome, options);
}
Example #17
Source File: HadoopFormatIOReadTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Builds a test {@code HadoopInputFormatBoundedSource} for the given InputFormat, key/value
 * classes and coders, and returns the result of splitting it with a desired bundle size of 0
 * under the test pipeline's options.
 */
private <K, V> List<BoundedSource<KV<K, V>>> getBoundedSourceList(
    Class<?> inputFormatClass,
    Class<K> inputFormatKeyClass,
    Class<V> inputFormatValueClass,
    Coder<K> keyCoder,
    Coder<V> valueCoder)
    throws Exception {
  HadoopInputFormatBoundedSource<K, V> unsplitSource =
      getTestHIFSource(
          inputFormatClass,
          inputFormatKeyClass,
          inputFormatValueClass,
          keyCoder,
          valueCoder);
  List<BoundedSource<KV<K, V>>> splits = unsplitSource.split(0, p.getOptions());
  return splits;
}
Example #18
Source File: WorkerCustomSourcesTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Returns two splits: "goodBundle", built with a {@code null} error message, and "badBundle",
 * built with the error message "intentionally invalid".
 *
 * <p>Fails the state check if this source itself carries an error message.
 */
@Override
public List<? extends BoundedSource<Integer>> split(
    long desiredBundleSizeBytes, PipelineOptions options) throws Exception {
  Preconditions.checkState(errorMessage == null, "Unexpected invalid source");

  SourceProducingInvalidSplits validSplit =
      new SourceProducingInvalidSplits("goodBundle", null);
  SourceProducingInvalidSplits invalidSplit =
      new SourceProducingInvalidSplits("badBundle", "intentionally invalid");
  return Arrays.asList(validSplit, invalidSplit);
}
Example #19
Source File: WorkItemStatusClientTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Populates a work item status from a {@code BoundedSourceSplit} and checks that the dynamic
 * source split matches {@code WorkerCustomSources.toSourceSplit(split)} and that no stop
 * position is set.
 */
@Test
public void populateSplitResultCustomReader() throws Exception {
  WorkItemStatus workItemStatus = new WorkItemStatus();
  statusClient.setWorker(worker, executionContext);

  BoundedSource<Integer> primarySource = new DummyBoundedSource(5);
  BoundedSource<Integer> residualSource = new DummyBoundedSource(10);
  BoundedSourceSplit<Integer> sourceSplit =
      new BoundedSourceSplit<>(primarySource, residualSource);

  statusClient.populateSplitResult(workItemStatus, sourceSplit);

  assertThat(
      workItemStatus.getDynamicSourceSplit(),
      equalTo(WorkerCustomSources.toSourceSplit(sourceSplit)));
  assertThat(workItemStatus.getStopPosition(), nullValue());
}
Example #20
Source File: XmlSourceTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Writes the single-byte all-features XML fixture to a temporary file and runs the exhaustive
 * split-at-fraction assertion against an {@code XmlIO} source reading it.
 */
@Test
public void testSplitAtFractionExhaustiveSingleByte() throws Exception {
  PipelineOptions pipelineOptions = PipelineOptionsFactory.create();

  File xmlFile = tempFolder.newFile("trainXMLSmall");
  byte[] xmlBytes = trainXMLWithAllFeaturesSingleByte.getBytes(StandardCharsets.UTF_8);
  Files.write(xmlFile.toPath(), xmlBytes);
  String xmlLocation = xmlFile.toPath().toString();

  BoundedSource<Train> trainSource =
      XmlIO.<Train>read()
          .from(xmlLocation)
          .withRootElement("trains")
          .withRecordElement("train")
          .withRecordClass(Train.class)
          .createSource();

  assertSplitAtFractionExhaustive(trainSource, pipelineOptions);
}
Example #21
Source File: BigQueryIOStorageReadTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Checks that splitting a storage table source yields no sources when the storage client answers
 * the expected create-read-session request with an empty read session.
 */
@Test
public void testTableSourceInitialSplit_EmptyTable() throws Exception {
  // Register the dataset and a table with an empty schema in the fake dataset service.
  fakeDatasetService.createDataset("foo.com:project", "dataset", "", "", null);
  TableReference tableReference =
      BigQueryHelpers.parseTableSpec("foo.com:project:dataset.table");
  Table fakeTable =
      new Table()
          .setTableReference(tableReference)
          .setNumBytes(1024L * 1024L)
          .setSchema(new TableSchema());
  fakeDatasetService.createTable(fakeTable);

  // The mocked storage client returns an empty session for exactly this request.
  CreateReadSessionRequest sessionRequest =
      CreateReadSessionRequest.newBuilder()
          .setParent("projects/project-id")
          .setTableReference(BigQueryHelpers.toTableRefProto(tableReference))
          .setRequestedStreams(1024)
          .setShardingStrategy(ShardingStrategy.BALANCED)
          .build();
  ReadSession sessionWithoutStreams = ReadSession.newBuilder().build();
  StorageClient storageClientMock = mock(StorageClient.class);
  when(storageClientMock.createReadSession(sessionRequest)).thenReturn(sessionWithoutStreams);

  BigQueryStorageTableSource<TableRow> storageSource =
      BigQueryStorageTableSource.create(
          ValueProvider.StaticValueProvider.of(tableReference),
          null,
          null,
          null,
          new TableRowParser(),
          TableRowJsonCoder.of(),
          new FakeBigQueryServices()
              .withDatasetService(fakeDatasetService)
              .withStorageClient(storageClientMock));

  List<? extends BoundedSource<TableRow>> splits = storageSource.split(1024L, options);
  assertTrue(splits.isEmpty());
}
Example #22
Source File: BoundedReadEvaluatorFactory.java From beam with Apache License 2.0 | 5 votes |
@Override public void processElement(WindowedValue<BoundedSourceShard<OutputT>> element) throws Exception { BoundedSource<OutputT> source = element.getValue().getSource(); try (final BoundedReader<OutputT> reader = source.createReader(options)) { boolean contentsRemaining = reader.start(); Future<BoundedSource<OutputT>> residualFuture = startDynamicSplitThread(source, reader); UncommittedBundle<OutputT> output = evaluationContext.createBundle(outputPCollection); while (contentsRemaining) { output.add( WindowedValue.timestampedValueInGlobalWindow( reader.getCurrent(), reader.getCurrentTimestamp())); contentsRemaining = reader.advance(); } resultBuilder.addOutput(output); try { BoundedSource<OutputT> residual = residualFuture.get(); if (residual != null) { resultBuilder.addUnprocessedElements( element.withValue(BoundedSourceShard.of(residual))); } } catch (ExecutionException exex) { // Un-and-rewrap the exception thrown by attempting to split throw UserCodeException.wrap(exex.getCause()); } } }
Example #23
Source File: UnboundedReadFromBoundedSource.java From beam with Apache License 2.0 | 5 votes |
Checkpoint<T> getCheckpointMark() { if (reader == null) { // Reader hasn't started, checkpoint the residualSource. return new Checkpoint<>(null /* residualElements */, residualSource); } else { // Part of residualSource are consumed. // Splits the residualSource and tracks the new residualElements in current source. BoundedSource<T> residualSplit = null; Double fractionConsumed = reader.getFractionConsumed(); if (fractionConsumed != null && 0 <= fractionConsumed && fractionConsumed <= 1) { double fractionRest = 1 - fractionConsumed; int splitAttempts = 8; for (int i = 0; i < 8 && residualSplit == null; ++i) { double fractionToSplit = fractionConsumed + fractionRest * i / splitAttempts; residualSplit = reader.splitAtFraction(fractionToSplit); } } List<TimestampedValue<T>> newResidualElements = Lists.newArrayList(); try { while (advance()) { newResidualElements.add( TimestampedValue.of(reader.getCurrent(), reader.getCurrentTimestamp())); } } catch (IOException e) { throw new RuntimeException("Failed to read elements from the bounded reader.", e); } return new Checkpoint<>(newResidualElements, residualSplit); } }
Example #24
Source File: UnboundedReadFromBoundedSource.java From beam with Apache License 2.0 | 5 votes |
/**
 * Creates a residual source wrapper with no reader yet and with its closed and reader-done flags
 * cleared.
 *
 * @param residualSource the remaining bounded source; rejected by {@code checkNotNull} if null
 * @param options the pipeline options; rejected by {@code checkNotNull} if null
 */
public ResidualSource(BoundedSource<T> residualSource, PipelineOptions options) {
  // Initial state: nothing opened, nothing read, nothing closed.
  this.reader = null;
  this.readerDone = false;
  this.closed = false;

  this.residualSource = checkNotNull(residualSource, "residualSource");
  this.options = checkNotNull(options, "options");
}
Example #25
Source File: ReadTranslation.java From beam with Apache License 2.0 | 5 votes |
/**
 * Deserializes the {@code BoundedSource} carried in the payload's source bytes.
 *
 * @param payload the read payload; its bounded-ness must equal {@code IsBounded.Enum.BOUNDED}
 * @return the deserialized source
 * @throws InvalidProtocolBufferException declared by the signature
 */
public static BoundedSource<?> boundedSourceFromProto(ReadPayload payload)
    throws InvalidProtocolBufferException {
  checkArgument(payload.getIsBounded().equals(IsBounded.Enum.BOUNDED));

  byte[] serializedSource = payload.getSource().getPayload().toByteArray();
  Object deserialized =
      SerializableUtils.deserializeFromByteArray(serializedSource, "BoundedSource");
  return (BoundedSource<?>) deserialized;
}
Example #26
Source File: XmlSourceTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Reads an XML fixture whose root and record element names contain multi-byte characters and
 * checks that the three expected trains come out in any order. The test is currently ignored.
 */
@Test
@Ignore(
    "Multi-byte characters in XML are not supported because the parser "
        + "currently does not correctly report byte offsets")
public void testReadXMLWithMultiByteElementName() throws IOException {
  File xmlFile = tempFolder.newFile("trainXMLTiny");
  byte[] xmlBytes = xmlWithMultiByteElementName.getBytes(StandardCharsets.UTF_8);
  Files.write(xmlFile.toPath(), xmlBytes);
  String xmlLocation = xmlFile.toPath().toString();

  BoundedSource<Train> trainSource =
      XmlIO.<Train>read()
          .from(xmlLocation)
          .withRootElement("දුම්රියන්")
          .withRecordElement("දුම්රිය")
          .withRecordClass(Train.class)
          .withMinBundleSize(1024)
          .createSource();

  List<Train> expectedTrains =
      ImmutableList.of(
          new Train("Thomas", Train.TRAIN_NUMBER_UNDEFINED, null, null),
          new Train("Henry", Train.TRAIN_NUMBER_UNDEFINED, null, null),
          new Train("James", Train.TRAIN_NUMBER_UNDEFINED, null, null));

  assertThat(
      trainsToStrings(expectedTrains),
      containsInAnyOrder(
          trainsToStrings(readEverythingFromReader(trainSource.createReader(null))).toArray()));
}
Example #27
Source File: DirectRunnerTest.java From beam with Apache License 2.0 | 5 votes |
@Override public List<? extends BoundedSource<T>> split( long desiredBundleSizeBytes, PipelineOptions options) throws Exception { // Must have more than checkState( desiredBundleSizeBytes < getEstimatedSizeBytes(options), "Must split into more than one source"); return underlying.split(desiredBundleSizeBytes, options); }
Example #28
Source File: XmlSourceTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Reads the train XML fixture into {@code TinyTrain} records through an {@code XmlIO} source and
 * checks that the six expected names come out in any order.
 */
@Test
public void testReadXmlWithAdditionalFieldsShouldNotThrowException() throws IOException {
  File xmlFile = tempFolder.newFile("trainXMLSmall");
  byte[] xmlBytes = trainXML.getBytes(StandardCharsets.UTF_8);
  Files.write(xmlFile.toPath(), xmlBytes);
  String xmlLocation = xmlFile.toPath().toString();

  BoundedSource<TinyTrain> tinyTrainSource =
      XmlIO.<TinyTrain>read()
          .from(xmlLocation)
          .withRootElement("trains")
          .withRecordElement("train")
          .withRecordClass(TinyTrain.class)
          .createSource();

  List<TinyTrain> expectedTrains =
      ImmutableList.of(
          new TinyTrain("Thomas"),
          new TinyTrain("Henry"),
          new TinyTrain("Toby"),
          new TinyTrain("Gordon"),
          new TinyTrain("Emily"),
          new TinyTrain("Percy"));

  assertThat(
      tinyTrainsToStrings(expectedTrains),
      containsInAnyOrder(
          tinyTrainsToStrings(readEverythingFromReader(tinyTrainSource.createReader(null)))
              .toArray()));
}
Example #29
Source File: XmlSourceTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Reads the train XML fixture through an {@code XmlIO} source built without a minimum bundle
 * size and checks that the six expected trains come out in any order.
 */
@Test
public void testReadXMLNoBundleSize() throws IOException {
  File xmlFile = tempFolder.newFile("trainXMLSmall");
  byte[] xmlBytes = trainXML.getBytes(StandardCharsets.UTF_8);
  Files.write(xmlFile.toPath(), xmlBytes);
  String xmlLocation = xmlFile.toPath().toString();

  BoundedSource<Train> trainSource =
      XmlIO.<Train>read()
          .from(xmlLocation)
          .withRootElement("trains")
          .withRecordElement("train")
          .withRecordClass(Train.class)
          .createSource();

  List<Train> expectedTrains =
      ImmutableList.of(
          new Train("Thomas", 1, "blue", null),
          new Train("Henry", 3, "green", null),
          new Train("Toby", 7, "brown", null),
          new Train("Gordon", 4, "blue", null),
          new Train("Emily", -1, "red", null),
          new Train("Percy", 6, "green", null));

  assertThat(
      trainsToStrings(expectedTrains),
      containsInAnyOrder(
          trainsToStrings(readEverythingFromReader(trainSource.createReader(null))).toArray()));
}
Example #30
Source File: UnboundedReadFromBoundedSourceTest.java From beam with Apache License 2.0 | 5 votes |
/**
 * Runs the bounded-to-unbounded adapter checkpoint check against a counting source of 100
 * elements, expecting the values 0 through 99.
 */
@Test
public void testCountingSourceToUnboundedCheckpoint() throws Exception {
  long numElements = 100;
  BoundedSource<Long> countingSource = CountingSource.upTo(numElements);

  // Expected output is every value from 0 up to, but not including, numElements.
  List<Long> expectedValues = Lists.newArrayList();
  long next = 0;
  while (next < numElements) {
    expectedValues.add(next);
    ++next;
  }

  testBoundedToUnboundedSourceAdapterCheckpoint(countingSource, expectedValues);
}