Java Code Examples for org.apache.flink.streaming.api.environment.StreamExecutionEnvironment#addSource()

The following examples show how to use org.apache.flink.streaming.api.environment.StreamExecutionEnvironment#addSource(). The originating project and source file are noted above each example.
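Before the project-specific examples, here is a minimal, self-contained sketch of the pattern they all share: implement a SourceFunction, register it with addSource(), and consume the returned DataStream. The class and names below are illustrative only and are not taken from any of the listed projects.

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

public class AddSourceSketch {

    // A tiny source that emits a fixed number of strings and then finishes.
    public static class GreetingSource implements SourceFunction<String> {
        private volatile boolean running = true;

        @Override
        public void run(SourceContext<String> ctx) throws Exception {
            for (int i = 0; i < 10 && running; i++) {
                ctx.collect("greeting-" + i);
            }
        }

        @Override
        public void cancel() {
            running = false;
        }
    }

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // addSource() turns the SourceFunction into a DataStream<String>.
        DataStream<String> greetings = env.addSource(new GreetingSource());

        greetings.print();
        env.execute("addSource() sketch");
    }
}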
Example 1
Source File: FlinkPravegaWriterITCase.java    From flink-connectors with Apache License 2.0
@Test
public void testEventTimeOrderedWriter() throws Exception {
    StreamExecutionEnvironment execEnv = StreamExecutionEnvironment.createLocalEnvironment();

    Stream stream = Stream.of(SETUP_UTILS.getScope(), "testEventTimeOrderedWriter");
    SETUP_UTILS.createTestStream(stream.getStreamName(), 1);

    DataStreamSource<Integer> dataStream = execEnv
            .addSource(new IntegerGeneratingSource(false, EVENT_COUNT_PER_SOURCE));

    FlinkPravegaWriter<Integer> pravegaSink = FlinkPravegaWriter.<Integer>builder()
            .withPravegaConfig(SETUP_UTILS.getPravegaConfig())
            .forStream(stream)
            .withSerializationSchema(new IntSerializer())
            .withEventRouter(event -> "fixedkey")
            .build();

    FlinkPravegaUtils.writeToPravegaInEventTimeOrder(dataStream, pravegaSink, 1);
    Assert.assertNotNull(execEnv.getExecutionPlan());
}
 
Example 2
Source File: ElasticsearchSinkTestBase.java    From flink with Apache License 2.0
private void runElasticSearchSinkTest(String index, Function<String, ElasticsearchSinkFunction<Tuple2<Integer, String>>> functionFactory) throws Exception {
	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

	DataStreamSource<Tuple2<Integer, String>> source = env.addSource(new SourceSinkDataTestKit.TestDataSourceFunction());

	source.addSink(createElasticsearchSinkForEmbeddedNode(
			1,
			CLUSTER_NAME,
			functionFactory.apply(index)));

	env.execute("Elasticsearch Sink Test");

	// verify the results
	Client client = elasticsearchResource.getClient();
	SourceSinkDataTestKit.verifyProducedSinkData(client, index);

	client.close();
}
 
Example 3
Source File: OrcBulkWriterITCase.java    From flink with Apache License 2.0
@Test
public void testOrcBulkWriter() throws Exception {
	final File outDir = TEMPORARY_FOLDER.newFolder();
	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	final Properties writerProps = new Properties();
	writerProps.setProperty("orc.compress", "LZ4");

	final OrcBulkWriterFactory<Record> factory = new OrcBulkWriterFactory<>(
		new RecordVectorizer(schema), writerProps, new Configuration());

	env.setParallelism(1);
	env.enableCheckpointing(100);

	DataStream<Record> stream = env.addSource(new FiniteTestSource<>(testData), TypeInformation.of(Record.class));
	stream.map(str -> str)
		.addSink(StreamingFileSink
			.forBulkFormat(new Path(outDir.toURI()), factory)
			.build());

	env.execute();

	OrcBulkWriterTestUtil.validate(outDir, testData);
}
 
Example 4
Source File: WriteIntoKafka.java    From kafka-example with Apache License 2.0
public static void main(String[] args) throws Exception {
	// create execution environment
	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

	// parse user parameters
	ParameterTool parameterTool = ParameterTool.fromArgs(args);

	// add a simple source which is writing some strings
	DataStream<String> messageStream = env.addSource(new SimpleStringGenerator());

	// write stream to Kafka
	messageStream.addSink(new KafkaSink<>(parameterTool.getRequired("bootstrap.servers"),
			parameterTool.getRequired("topic"),
			new SimpleStringSchema()));

	env.execute();
}
 
Example 5
Source File: IncrementalLearningSkeleton.java    From flink-learning with Apache License 2.0
public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        DataStream<Integer> trainingData = env.addSource(new FiniteTrainingDataSource());
        DataStream<Integer> newData = env.addSource(new FiniteNewDataSource());

        DataStream<Double[]> model = trainingData
                .assignTimestampsAndWatermarks(new LinearTimestamp())
                .timeWindowAll(Time.of(5000, TimeUnit.MILLISECONDS))
                .apply(new PartialModelBuilder());

        newData.connect(model).map(new Predictor()).print();

        env.execute("Streaming Incremental Learning");
    }
 
Example 6
Source File: TwoInputBenchmark.java    From flink-benchmarks with Apache License 2.0
@Benchmark
@OperationsPerInvocation(value = TwoInputBenchmark.RECORDS_PER_INVOCATION)
public void twoInputMapSink(FlinkEnvironmentContext context) throws Exception {

	StreamExecutionEnvironment env = context.env;

	env.enableCheckpointing(CHECKPOINT_INTERVAL_MS);
	env.setParallelism(1);

	// Setting buffer timeout to 1 is an attempt to improve twoInputMapSink benchmark stability.
	// Without 1ms buffer timeout, some JVM forks are much slower than others, making results
	// unstable and unreliable.
	env.setBufferTimeout(1);

	long numRecordsPerInput = RECORDS_PER_INVOCATION / 2;
	DataStreamSource<Long> source1 = env.addSource(new LongSource(numRecordsPerInput));
	DataStreamSource<Long> source2 = env.addSource(new LongSource(numRecordsPerInput));

	source1
		.connect(source2)
		.transform("custom operator", TypeInformation.of(Long.class), new MultiplyByTwoCoStreamMap())
		.addSink(new DiscardingSink<>());

	env.execute();
}
 
Example 7
Source File: ConsumeFromKinesis.java    From Flink-CEPplus with Apache License 2.0
public static void main(String[] args) throws Exception {
	ParameterTool pt = ParameterTool.fromArgs(args);

	StreamExecutionEnvironment see = StreamExecutionEnvironment.getExecutionEnvironment();
	see.setParallelism(1);

	Properties kinesisConsumerConfig = new Properties();
	kinesisConsumerConfig.setProperty(ConsumerConfigConstants.AWS_REGION, pt.getRequired("region"));
	kinesisConsumerConfig.setProperty(ConsumerConfigConstants.AWS_ACCESS_KEY_ID, pt.getRequired("accesskey"));
	kinesisConsumerConfig.setProperty(ConsumerConfigConstants.AWS_SECRET_ACCESS_KEY, pt.getRequired("secretkey"));

	DataStream<String> kinesis = see.addSource(new FlinkKinesisConsumer<>(
		"flink-test",
		new SimpleStringSchema(),
		kinesisConsumerConfig));

	kinesis.print();

	see.execute();
}
 
Example 8
Source File: TwoInputBenchmark.java    From flink-benchmarks with Apache License 2.0
@Benchmark
@OperationsPerInvocation(value = TwoInputBenchmark.ONE_IDLE_RECORDS_PER_INVOCATION)
public void twoInputOneIdleMapSink(FlinkEnvironmentContext context) throws Exception {

	StreamExecutionEnvironment env = context.env;
	env.enableCheckpointing(CHECKPOINT_INTERVAL_MS);
	env.setParallelism(1);

	QueuingLongSource.reset();
	DataStreamSource<Long> source1 = env.addSource(new QueuingLongSource(1, ONE_IDLE_RECORDS_PER_INVOCATION - 1));
	DataStreamSource<Long> source2 = env.addSource(new QueuingLongSource(2, 1));

	source1
			.connect(source2)
			.transform("custom operator", TypeInformation.of(Long.class), new MultiplyByTwoCoStreamMap())
			.addSink(new DiscardingSink<>());

	env.execute();
}
 
Example 9
Source File: FlinkKafkaSinkExample.java    From huaweicloud-cs-sdk with Apache License 2.0
public void writeKafka() {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    DataStream<String> messageStream = env.addSource(new KafkaSourceGenerator());
    messageStream.addSink(new FlinkKafkaProducer010<String>(topic,
            new SimpleStringSchema(),
            properties));
    try {
        env.execute();
    } catch (Exception e) {
        System.out.println(e.getMessage());
    }
}
 
Example 10
Source File: CheckpointedStreamingProgram.java    From Flink-CEPplus with Apache License 2.0
public static void main(String[] args) throws Exception {
	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

	env.getConfig().disableSysoutLogging();
	env.enableCheckpointing(CHECKPOINT_INTERVALL);
	env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 100L));
	env.disableOperatorChaining();

	DataStream<String> text = env.addSource(new SimpleStringGenerator());
	text.map(new StatefulMapper()).addSink(new NoOpSink());
	env.setParallelism(1);
	env.execute("Checkpointed Streaming Program");
}
 
Example 11
Source File: SequenceStreamingFileSinkITCase.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testWriteSequenceFile() throws Exception {
	final File folder = TEMPORARY_FOLDER.newFolder();
	final Path testPath = Path.fromLocalFile(folder);

	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.enableCheckpointing(100);

	DataStream<Tuple2<Long, String>> stream = env.addSource(
			new FiniteTestSource<>(testData),
			TypeInformation.of(new TypeHint<Tuple2<Long, String>>() {}));

	stream.map(new MapFunction<Tuple2<Long, String>, Tuple2<LongWritable, Text>>() {
		@Override
		public Tuple2<LongWritable, Text> map(Tuple2<Long, String> value) throws Exception {
			return new Tuple2<>(new LongWritable(value.f0), new Text(value.f1));
		}
	}).addSink(
		StreamingFileSink.forBulkFormat(
			testPath,
			new SequenceFileWriterFactory<>(configuration, LongWritable.class, Text.class, "BZip2")
		).build());

	env.execute();

	validateResults(folder, testData);
}
 
Example 12
Source File: FlinkKafkaSourceExample.java    From huaweicloud-cs-sdk with Apache License 2.0
public void readKafka() {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    DataStream<String> messageStream = env.addSource(new FlinkKafkaConsumer010<String>(topic,
            new SimpleStringSchema(),
            properties));
    messageStream.rebalance().print();
    try {
        env.execute();
    } catch (Exception e) {
        System.out.println(e.getMessage());
    }
}
 
Example 13
Source File: StateCheckpointedITCase.java    From flink with Apache License 2.0
/**
 * Runs the following program.
 * <pre>
 *     [ (source)->(filter)] -> [ (map) -> (map) ] -> [ (groupBy/reduce)->(sink) ]
 * </pre>
 */
@Override
public void testProgram(StreamExecutionEnvironment env) {
	assertTrue("Broken test setup", NUM_STRINGS % 40 == 0);

	final long failurePosMin = (long) (0.4 * NUM_STRINGS / PARALLELISM);
	final long failurePosMax = (long) (0.7 * NUM_STRINGS / PARALLELISM);

	final long failurePos = (new Random().nextLong() % (failurePosMax - failurePosMin)) + failurePosMin;

	env.enableCheckpointing(200);

	DataStream<String> stream = env.addSource(new StringGeneratingSourceFunction(NUM_STRINGS));

	stream
			// first vertex, chained to the source
			// this filter throttles the flow until at least one checkpoint
			// is complete, to make sure this program does not run without any completed checkpoints
			.filter(new StringRichFilterFunction())

					// -------------- second vertex - one-to-one connected ----------------
			.map(new StringPrefixCountRichMapFunction())
			.startNewChain()
			.map(new StatefulCounterFunction())

					// -------------- third vertex - reducer and the sink ----------------
			.keyBy("prefix")
			.flatMap(new OnceFailingAggregator(failurePos))
			.addSink(new ValidatingSink());
}
 
Example 14
Source File: SyntheticSources.java    From da-streamingledger with Apache License 2.0
/**
 * Creates and adds two synthetic sources for {@link DepositEvent} and {@link TransactionEvent}.
 *
 * @param env              the streaming environment to add the sources to.
 * @param recordsPerSecond the number of {@link TransactionEvent} per second to generate.
 * @return a {@link DataStream} for each event type generated.
 */
public static SyntheticSources create(StreamExecutionEnvironment env, int recordsPerSecond) {

    final DataStreamSource<Either<DepositEvent, TransactionEvent>> depositsAndTransactions = env.addSource(
            new DepositsThenTransactionsSource(recordsPerSecond));

    final OutputTag<TransactionEvent> transactionsSideOutput = new OutputTag<>(
            "transactions side output",
            TypeInformation.of(TransactionEvent.class));

    final SingleOutputStreamOperator<DepositEvent> deposits = depositsAndTransactions.process(
            new ProcessFunction<Either<DepositEvent, TransactionEvent>, DepositEvent>() {

                @Override
                public void processElement(
                        Either<DepositEvent, TransactionEvent> depositOrTransaction,
                        Context context,
                        Collector<DepositEvent> out) {

                    if (depositOrTransaction.isLeft()) {
                        out.collect(depositOrTransaction.left());
                    }
                    else {
                        context.output(transactionsSideOutput, depositOrTransaction.right());
                    }
                }
            });

    final DataStream<TransactionEvent> transactions = deposits.getSideOutput(transactionsSideOutput);

    return new SyntheticSources(deposits, transactions);
}
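For orientation, a brief usage sketch of the factory shown above: only the create(StreamExecutionEnvironment, int) signature is taken from the example, and the record rate is an arbitrary illustrative value, so treat this as a hedged sketch rather than the project's documented usage.

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

// Build both synthetic streams; 1000 TransactionEvents per second is an assumed, illustrative rate.
SyntheticSources sources = SyntheticSources.create(env, 1000);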
 
Example 15
Source File: StreamTaskSelectiveReadingITCase.java    From flink with Apache License 2.0
@Test
public void testSequentialReading() throws Exception {

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);

	DataStream<String> source0 = env.addSource(
		new TestStringSource("Source0",
			new String[] {
				"Hello-1", "Hello-2", "Hello-3", "Hello-4", "Hello-5", "Hello-6"
		}));
	DataStream<Integer> source1 = env.addSource(
		new TestIntegerSource("Source1",
			new Integer[] {
				1, 2, 3
			}))
		.setParallelism(2);
	TestListResultSink<String> resultSink = new TestListResultSink<>();

	TestSequentialReadingStreamOperator twoInputStreamOperator = new TestSequentialReadingStreamOperator("Operator0");
	twoInputStreamOperator.setChainingStrategy(ChainingStrategy.NEVER);

	source0.connect(source1)
		.transform(
			"Custom Operator",
			BasicTypeInfo.STRING_TYPE_INFO,
			twoInputStreamOperator
		)
		.addSink(resultSink);

	env.execute("Selective reading test");

	List<String> result = resultSink.getResult();

	List<String> expected1 = Arrays.asList(
		"[Operator0-1]: [Source0-0]: Hello-1",
		"[Operator0-1]: [Source0-0]: Hello-2",
		"[Operator0-1]: [Source0-0]: Hello-3",
		"[Operator0-1]: [Source0-0]: Hello-4",
		"[Operator0-1]: [Source0-0]: Hello-5",
		"[Operator0-1]: [Source0-0]: Hello-6"
	);

	List<String> expected2 = Arrays.asList(
		"[Operator0-2]: 1",
		"[Operator0-2]: 2",
		"[Operator0-2]: 3",
		"[Operator0-2]: 2",
		"[Operator0-2]: 4",
		"[Operator0-2]: 6"
	);
	Collections.sort(expected2);

	assertEquals(expected1.size() + expected2.size(), result.size());
	assertEquals(expected1, result.subList(0, expected1.size()));

	List<String> result2 = result.subList(expected1.size(), expected1.size() + expected2.size());
	Collections.sort(result2);
	assertEquals(expected2, result2);
}
 
Example 16
Source File: ExactlyOnceValidatingConsumerThread.java    From Flink-CEPplus with Apache License 2.0
public static Thread create(final int totalEventCount,
							final int failAtRecordCount,
							final int parallelism,
							final int checkpointInterval,
							final long restartDelay,
							final String awsAccessKey,
							final String awsSecretKey,
							final String awsRegion,
							final String kinesisStreamName,
							final AtomicReference<Throwable> errorHandler,
							final int flinkPort,
							final Configuration flinkConfig) {
	Runnable exactlyOnceValidationConsumer = new Runnable() {
		@Override
		public void run() {
			try {
				StreamExecutionEnvironment see = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort, flinkConfig);
				see.setParallelism(parallelism);
				see.enableCheckpointing(checkpointInterval);
				// we restart two times
				see.setRestartStrategy(RestartStrategies.fixedDelayRestart(2, restartDelay));

				// consuming topology
				Properties consumerProps = new Properties();
				consumerProps.setProperty(ConsumerConfigConstants.AWS_ACCESS_KEY_ID, awsAccessKey);
				consumerProps.setProperty(ConsumerConfigConstants.AWS_SECRET_ACCESS_KEY, awsSecretKey);
				consumerProps.setProperty(ConsumerConfigConstants.AWS_REGION, awsRegion);
				// start reading from beginning
				consumerProps.setProperty(ConsumerConfigConstants.STREAM_INITIAL_POSITION, ConsumerConfigConstants.InitialPosition.TRIM_HORIZON.name());
				DataStream<String> consuming = see.addSource(new FlinkKinesisConsumer<>(kinesisStreamName, new SimpleStringSchema(), consumerProps));
				consuming
					.flatMap(new ArtificialFailOnceFlatMapper(failAtRecordCount))
					// validate consumed records for correctness (use only 1 instance to validate all consumed records)
					.flatMap(new ExactlyOnceValidatingMapper(totalEventCount)).setParallelism(1);

				LOG.info("Starting consuming topology");
				tryExecute(see, "Consuming topo");
				LOG.info("Consuming topo finished");
			} catch (Exception e) {
				LOG.warn("Error while running consuming topology", e);
				errorHandler.set(e);
			}
		}
	};

	return new Thread(exactlyOnceValidationConsumer);
}
 
Example 17
Source File: CassandraPojoSink.java    From blog_demos with Apache License 2.0
public static void main(String[] args) throws Exception {
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    // set the parallelism
    env.setParallelism(1);

    // properties used to connect to Kafka
    Properties properties = new Properties();
    // broker address
    properties.setProperty("bootstrap.servers", "192.168.50.43:9092");
    // ZooKeeper address
    properties.setProperty("zookeeper.connect", "192.168.50.43:2181");
    // consumer group id
    properties.setProperty("group.id", "flink-connector");
    // instantiate the Kafka consumer
    FlinkKafkaConsumer<String> flinkKafkaConsumer = new FlinkKafkaConsumer<>(
            "test001",
            new SimpleStringSchema(),
            properties
    );

    // start consuming from the latest offset, i.e. ignore historical messages
    flinkKafkaConsumer.setStartFromLatest();

    // obtain a DataStream via addSource
    DataStream<String> dataStream = env.addSource(flinkKafkaConsumer);

    DataStream<WordCount> result = dataStream
            .flatMap(new FlatMapFunction<String, WordCount>() {
                @Override
                public void flatMap(String s, Collector<WordCount> collector) throws Exception {
                    String[] words = s.toLowerCase().split("\\s");

                    for (String word : words) {
                        if (!word.isEmpty()) {
                            // in the Cassandra table each word is a primary key, so it must not be empty
                            collector.collect(new WordCount(word, 1L));
                        }
                    }
                }
            })
            .keyBy("word")
            .timeWindow(Time.seconds(5))
            .reduce(new ReduceFunction<WordCount>() {
                @Override
                public WordCount reduce(WordCount wordCount, WordCount t1) throws Exception {
                    return new WordCount(wordCount.getWord(), wordCount.getCount() + t1.getCount());
                }
            });

    result.addSink(new PrintSinkFunction<>())
            .name("print Sink")
            .disableChaining();

    CassandraSink.addSink(result)
            .setHost("192.168.133.168")
            .setMapperOptions(() -> new Mapper.Option[] { Mapper.Option.saveNullFields(true) })
            .build()
            .name("cassandra Sink")
            .disableChaining();

    env.execute("kafka-2.4 source, cassandra-3.11.6 sink, pojo");
}
 
Example 18
Source File: TimestampITCase.java    From flink with Apache License 2.0
/**
 * This test checks whether custom timestamp emission works at sources and whether timestamps
 * arrive at operators throughout a topology.
 *
 * <p>This also checks whether watermarks keep propagating if a source closes early.
 *
 * <p>This only uses map to test the workings of watermarks in a complete, running topology. All
 * tasks and stream operators have dedicated tests that test the watermark propagation
 * behaviour.
 */
@Test
public void testWatermarkPropagation() throws Exception {
	final int numWatermarks = 10;

	long initialTime = 0L;

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
	env.setParallelism(PARALLELISM);

	DataStream<Integer> source1 = env.addSource(new MyTimestampSource(initialTime, numWatermarks));
	DataStream<Integer> source2 = env.addSource(new MyTimestampSource(initialTime, numWatermarks / 2));

	source1.union(source2)
			.map(new IdentityMap())
			.connect(source2).map(new IdentityCoMap())
			.transform("Custom Operator", BasicTypeInfo.INT_TYPE_INFO, new CustomOperator(true))
			.addSink(new DiscardingSink<Integer>());

	env.execute();

	// verify that all the watermarks arrived at the final custom operator
	for (int i = 0; i < PARALLELISM; i++) {
		// we are only guaranteed to see NUM_WATERMARKS / 2 watermarks because the
		// other source stops emitting after that
		for (int j = 0; j < numWatermarks / 2; j++) {
			if (!CustomOperator.finalWatermarks[i].get(j).equals(new Watermark(initialTime + j))) {
				System.err.println("All Watermarks: ");
				for (int k = 0; k <= numWatermarks / 2; k++) {
					System.err.println(CustomOperator.finalWatermarks[i].get(k));
				}

				fail("Wrong watermark.");
			}
		}

		assertEquals(Watermark.MAX_WATERMARK,
				CustomOperator.finalWatermarks[i].get(CustomOperator.finalWatermarks[i].size() - 1));
	}
}
 
Example 19
Source File: ContinuousFileProcessingITCase.java    From flink with Apache License 2.0
@Test
public void testProgram() throws Exception {

	/*
	 * This test checks the interplay between the monitor and the reader
	 * and also the failExternally() functionality. To test the latter we
	 * set the parallelism to 1 so that we have the chaining between the sink,
	 * which throws the SuccessException to signal the end of the test, and the
	 * reader.
	 */

	TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
	format.setFilePath(hdfsURI);
	format.setFilesFilter(FilePathFilter.createDefaultFilter());

	// create the stream execution environment with a parallelism > 1 to test multiple parallel readers
	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(PARALLELISM);

	ContinuousFileMonitoringFunction<String> monitoringFunction =
		new ContinuousFileMonitoringFunction<>(format,
			FileProcessingMode.PROCESS_CONTINUOUSLY,
			env.getParallelism(), INTERVAL);

	// the monitor has always DOP 1
	DataStream<TimestampedFileInputSplit> splits = env.addSource(monitoringFunction);
	Assert.assertEquals(1, splits.getParallelism());

	TypeInformation<String> typeInfo = TypeExtractor.getInputFormatTypes(format);

	// the readers can be multiple
	DataStream<String> content = splits.transform("FileSplitReader", typeInfo, new ContinuousFileReaderOperatorFactory<>(format));
	Assert.assertEquals(PARALLELISM, content.getParallelism());

	// finally for the sink we set the parallelism to 1 so that we can verify the output
	TestingSinkFunction sink = new TestingSinkFunction();
	content.addSink(sink).setParallelism(1);

	CompletableFuture<Void> jobFuture = new CompletableFuture<>();
	new Thread(() -> {
		try {
			env.execute("ContinuousFileProcessingITCase Job.");
			jobFuture.complete(null);
		} catch (Exception e) {
			if (ExceptionUtils.findThrowable(e, SuccessException.class).isPresent()) {
				jobFuture.complete(null);
			} else {
				jobFuture.completeExceptionally(e);
			}
		}
	}).start();

	// The modification time of the last created file.
	long lastCreatedModTime = Long.MIN_VALUE;

	// create the files to be read
	for (int i = 0; i < NO_OF_FILES; i++) {
		Tuple2<org.apache.hadoop.fs.Path, String> tmpFile;
		long modTime;
		do {

			// give it some time so that the files have
			// different modification timestamps.
			Thread.sleep(50);

			tmpFile = fillWithData(hdfsURI, "file", i, "This is test line.");

			modTime = hdfs.getFileStatus(tmpFile.f0).getModificationTime();
			if (modTime <= lastCreatedModTime) {
				// delete the last created file to recreate it with a different timestamp
				hdfs.delete(tmpFile.f0, false);
			}
		} while (modTime <= lastCreatedModTime);
		lastCreatedModTime = modTime;

		// put the contents in the expected results list before the reader picks them
		// this is to guarantee that they are in before the reader finishes (avoid race conditions)
		expectedContents.put(i, tmpFile.f1);

		org.apache.hadoop.fs.Path file =
			new org.apache.hadoop.fs.Path(hdfsURI + "/file" + i);
		hdfs.rename(tmpFile.f0, file);
		Assert.assertTrue(hdfs.exists(file));
	}

	jobFuture.get();
}
 
Example 20
Source File: ExactlyOnceValidatingConsumerThread.java    From flink with Apache License 2.0
public static Thread create(final int totalEventCount,
							final int failAtRecordCount,
							final int parallelism,
							final int checkpointInterval,
							final long restartDelay,
							final String awsAccessKey,
							final String awsSecretKey,
							final String awsRegion,
							final String kinesisStreamName,
							final AtomicReference<Throwable> errorHandler,
							final int flinkPort,
							final Configuration flinkConfig) {
	Runnable exactlyOnceValidationConsumer = new Runnable() {
		@Override
		public void run() {
			try {
				StreamExecutionEnvironment see = StreamExecutionEnvironment.createRemoteEnvironment("localhost", flinkPort, flinkConfig);
				see.setParallelism(parallelism);
				see.enableCheckpointing(checkpointInterval);
				// we restart two times
				see.setRestartStrategy(RestartStrategies.fixedDelayRestart(2, restartDelay));

				// consuming topology
				Properties consumerProps = new Properties();
				consumerProps.setProperty(ConsumerConfigConstants.AWS_ACCESS_KEY_ID, awsAccessKey);
				consumerProps.setProperty(ConsumerConfigConstants.AWS_SECRET_ACCESS_KEY, awsSecretKey);
				consumerProps.setProperty(ConsumerConfigConstants.AWS_REGION, awsRegion);
				// start reading from beginning
				consumerProps.setProperty(ConsumerConfigConstants.STREAM_INITIAL_POSITION, ConsumerConfigConstants.InitialPosition.TRIM_HORIZON.name());
				DataStream<String> consuming = see.addSource(new FlinkKinesisConsumer<>(kinesisStreamName, new SimpleStringSchema(), consumerProps));
				consuming
					.flatMap(new ArtificialFailOnceFlatMapper(failAtRecordCount))
					// validate consumed records for correctness (use only 1 instance to validate all consumed records)
					.flatMap(new ExactlyOnceValidatingMapper(totalEventCount)).setParallelism(1);

				LOG.info("Starting consuming topology");
				tryExecute(see, "Consuming topo");
				LOG.info("Consuming topo finished");
			} catch (Exception e) {
				LOG.warn("Error while running consuming topology", e);
				errorHandler.set(e);
			}
		}
	};

	return new Thread(exactlyOnceValidationConsumer);
}