org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction Java Examples
The following examples show how to use
org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction.
The source file, originating project, and license are noted above each example.
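In the Flink versions these examples come from, ContinuousFileMonitoringFunction is the monitoring half of the continuous file source: it runs with parallelism 1, watches a path, and emits TimestampedFileInputSplit records that a parallel reader operator turns back into records of the input format. The sketch below shows how the pieces are typically wired together, following the StreamExecutionEnvironment and integration-test examples further down. It is a minimal sketch, not code from the examples: the class name, input path, and monitoring interval are placeholder assumptions, and in application code this wiring is normally reached through StreamExecutionEnvironment#readFile rather than built by hand.

// Minimal wiring sketch. Assumptions: the path "/tmp/input" and the 100 ms interval are
// placeholders; adjust imports to the Flink version you are actually using.
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.ContinuousFileMonitoringFunction;
import org.apache.flink.streaming.api.functions.source.ContinuousFileReaderOperatorFactory;
import org.apache.flink.streaming.api.functions.source.FileProcessingMode;
import org.apache.flink.streaming.api.functions.source.TimestampedFileInputSplit;

public class ContinuousFileMonitoringSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        TextInputFormat format = new TextInputFormat(new Path("/tmp/input"));

        // constructor arguments: input format, processing mode, reader parallelism, interval in ms
        ContinuousFileMonitoringFunction<String> monitor =
            new ContinuousFileMonitoringFunction<>(
                format, FileProcessingMode.PROCESS_CONTINUOUSLY, env.getParallelism(), 100L);

        // the monitoring source itself always runs with parallelism 1
        DataStream<TimestampedFileInputSplit> splits = env.addSource(monitor, "FileMonitor");

        // the reader operator consumes the splits in parallel
        TypeInformation<String> typeInfo = TypeExtractor.getInputFormatTypes(format);
        DataStream<String> lines = splits
            .transform("FileReader", typeInfo, new ContinuousFileReaderOperatorFactory<>(format));

        lines.print();
        env.execute("ContinuousFileMonitoringFunction sketch");
    }
}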
Example #1
Source File: ContinuousFileProcessingTest.java From Flink-CEPplus with Apache License 2.0
@Test
public void testInvalidPathSpecification() throws Exception {
    String invalidPath = "hdfs://" + hdfsCluster.getURI().getHost() + ":" + hdfsCluster.getNameNodePort() + "/invalid/";
    TextInputFormat format = new TextInputFormat(new Path(invalidPath));

    ContinuousFileMonitoringFunction<String> monitoringFunction =
        new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);

    try {
        monitoringFunction.run(new DummySourceContext() {
            @Override
            public void collect(TimestampedFileInputSplit element) {
                // we should never arrive here with an invalid path
                Assert.fail("Test passes with an invalid path.");
            }
        });

        // we should never arrive here with an invalid path
        Assert.fail("Test passed with an invalid path.");
    } catch (FileNotFoundException e) {
        Assert.assertEquals("The provided file path " + format.getFilePath() + " does not exist.", e.getMessage());
    }
}
Example #2
Source File: StreamExecutionEnvironment.java From Flink-CEPplus with Apache License 2.0
private <OUT> DataStreamSource<OUT> createFileInput(
        FileInputFormat<OUT> inputFormat,
        TypeInformation<OUT> typeInfo,
        String sourceName,
        FileProcessingMode monitoringMode,
        long interval) {

    Preconditions.checkNotNull(inputFormat, "Unspecified file input format.");
    Preconditions.checkNotNull(typeInfo, "Unspecified output type information.");
    Preconditions.checkNotNull(sourceName, "Unspecified name for the source.");
    Preconditions.checkNotNull(monitoringMode, "Unspecified monitoring mode.");

    Preconditions.checkArgument(
        monitoringMode.equals(FileProcessingMode.PROCESS_ONCE) ||
            interval >= ContinuousFileMonitoringFunction.MIN_MONITORING_INTERVAL,
        "The path monitoring interval cannot be less than " +
            ContinuousFileMonitoringFunction.MIN_MONITORING_INTERVAL + " ms.");

    ContinuousFileMonitoringFunction<OUT> monitoringFunction =
        new ContinuousFileMonitoringFunction<>(inputFormat, monitoringMode, getParallelism(), interval);

    ContinuousFileReaderOperator<OUT> reader = new ContinuousFileReaderOperator<>(inputFormat);

    SingleOutputStreamOperator<OUT> source = addSource(monitoringFunction, sourceName)
        .transform("Split Reader: " + sourceName, typeInfo, reader);

    return new DataStreamSource<>(source);
}
Example #3
Source File: ContinuousFileProcessingTest.java From flink with Apache License 2.0
/**
 * Create continuous monitoring function with 1 reader-parallelism and interval: {@link #INTERVAL}.
 */
private <OUT> ContinuousFileMonitoringFunction<OUT> createTestContinuousFileMonitoringFunction(
        FileInputFormat<OUT> format, FileProcessingMode fileProcessingMode) {
    ContinuousFileMonitoringFunction<OUT> monitoringFunction =
        new ContinuousFileMonitoringFunction<>(format, fileProcessingMode, 1, INTERVAL);
    monitoringFunction.setRuntimeContext(Mockito.mock(RuntimeContext.class));
    return monitoringFunction;
}
Example #4
Source File: ContinuousFileProcessingTest.java From flink with Apache License 2.0
FileVerifyingSourceContext(
        OneShotLatch latch,
        ContinuousFileMonitoringFunction src,
        int elementsBeforeNotifying,
        int elementsBeforeCanceling) {
    this.latch = latch;
    this.seenFiles = new TreeSet<>();
    this.src = src;
    this.elementsBeforeNotifying = elementsBeforeNotifying;
    this.elementsBeforeCanceling = elementsBeforeCanceling;
}
Example #5
Source File: HiveTableSource.java From flink with Apache License 2.0
private DataStream<RowData> createStreamSourceForNonPartitionTable(
        StreamExecutionEnvironment execEnv,
        TypeInformation<RowData> typeInfo,
        HiveTableInputFormat inputFormat,
        HiveTablePartition hiveTable) {
    HiveTableFileInputFormat fileInputFormat = new HiveTableFileInputFormat(inputFormat, hiveTable);

    Configuration configuration = new Configuration();
    catalogTable.getOptions().forEach(configuration::setString);
    String consumeOrderStr = configuration.get(STREAMING_SOURCE_CONSUME_ORDER);
    ConsumeOrder consumeOrder = ConsumeOrder.getConsumeOrder(consumeOrderStr);
    if (consumeOrder != ConsumeOrder.CREATE_TIME_ORDER) {
        throw new UnsupportedOperationException(
            "Only " + ConsumeOrder.CREATE_TIME_ORDER + " is supported for non partition table.");
    }

    String consumeOffset = configuration.get(STREAMING_SOURCE_CONSUME_START_OFFSET);
    // to Local zone mills instead of UTC mills
    long currentReadTime = TimestampData.fromLocalDateTime(toLocalDateTime(consumeOffset))
        .toTimestamp().getTime();

    Duration monitorInterval = configuration.get(STREAMING_SOURCE_MONITOR_INTERVAL);

    ContinuousFileMonitoringFunction<RowData> monitoringFunction =
        new ContinuousFileMonitoringFunction<>(
            fileInputFormat,
            FileProcessingMode.PROCESS_CONTINUOUSLY,
            execEnv.getParallelism(),
            monitorInterval.toMillis(),
            currentReadTime);

    ContinuousFileReaderOperatorFactory<RowData, TimestampedFileInputSplit> factory =
        new ContinuousFileReaderOperatorFactory<>(fileInputFormat);

    String sourceName = "HiveFileMonitoringFunction";
    SingleOutputStreamOperator<RowData> source = execEnv.addSource(monitoringFunction, sourceName)
        .transform("Split Reader: " + sourceName, typeInfo, factory);

    return new DataStreamSource<>(source);
}
Example #6
Source File: ContinuousFileProcessingTest.java From flink with Apache License 2.0
@Test
public void testSortingOnModTime() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";

    final long[] modTimes = new long[NO_OF_FILES];
    final org.apache.hadoop.fs.Path[] filesCreated = new org.apache.hadoop.fs.Path[NO_OF_FILES];

    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file =
            createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        Thread.sleep(400);

        filesCreated[i] = file.f0;
        modTimes[i] = hdfs.getFileStatus(file.f0).getModificationTime();
    }

    TextInputFormat format = new TextInputFormat(new Path(testBasePath));
    format.setFilesFilter(FilePathFilter.createDefaultFilter());

    // this is just to verify that all splits have been forwarded later.
    FileInputSplit[] splits = format.createInputSplits(1);

    ContinuousFileMonitoringFunction<String> monitoringFunction =
        createTestContinuousFileMonitoringFunction(format, FileProcessingMode.PROCESS_ONCE);

    ModTimeVerifyingSourceContext context = new ModTimeVerifyingSourceContext(modTimes);

    monitoringFunction.open(new Configuration());
    monitoringFunction.run(context);
    Assert.assertEquals(splits.length, context.getCounter());

    // delete the created files.
    for (int i = 0; i < NO_OF_FILES; i++) {
        hdfs.delete(filesCreated[i], false);
    }
}
Example #7
Source File: StreamExecutionEnvironment.java From flink with Apache License 2.0
private <OUT> DataStreamSource<OUT> createFileInput(
        FileInputFormat<OUT> inputFormat,
        TypeInformation<OUT> typeInfo,
        String sourceName,
        FileProcessingMode monitoringMode,
        long interval) {

    Preconditions.checkNotNull(inputFormat, "Unspecified file input format.");
    Preconditions.checkNotNull(typeInfo, "Unspecified output type information.");
    Preconditions.checkNotNull(sourceName, "Unspecified name for the source.");
    Preconditions.checkNotNull(monitoringMode, "Unspecified monitoring mode.");

    Preconditions.checkArgument(
        monitoringMode.equals(FileProcessingMode.PROCESS_ONCE) ||
            interval >= ContinuousFileMonitoringFunction.MIN_MONITORING_INTERVAL,
        "The path monitoring interval cannot be less than " +
            ContinuousFileMonitoringFunction.MIN_MONITORING_INTERVAL + " ms.");

    ContinuousFileMonitoringFunction<OUT> monitoringFunction =
        new ContinuousFileMonitoringFunction<>(inputFormat, monitoringMode, getParallelism(), interval);

    ContinuousFileReaderOperatorFactory<OUT, TimestampedFileInputSplit> factory =
        new ContinuousFileReaderOperatorFactory<>(inputFormat);

    SingleOutputStreamOperator<OUT> source = addSource(monitoringFunction, sourceName)
        .transform("Split Reader: " + sourceName, typeInfo, factory);

    return new DataStreamSource<>(source);
}
Example #8
Source File: ContinuousFileProcessingTest.java From flink with Apache License 2.0
@Test
public void testFunctionRestore() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";

    org.apache.hadoop.fs.Path path = null;
    long fileModTime = Long.MIN_VALUE;
    for (int i = 0; i < 1; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file =
            createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        path = file.f0;
        fileModTime = hdfs.getFileStatus(file.f0).getModificationTime();
    }

    TextInputFormat format = new TextInputFormat(new Path(testBasePath));

    final ContinuousFileMonitoringFunction<String> monitoringFunction =
        createTestContinuousFileMonitoringFunction(format, FileProcessingMode.PROCESS_CONTINUOUSLY);

    StreamSource<TimestampedFileInputSplit, ContinuousFileMonitoringFunction<String>> src =
        new StreamSource<>(monitoringFunction);

    final AbstractStreamOperatorTestHarness<TimestampedFileInputSplit> testHarness =
        new AbstractStreamOperatorTestHarness<>(src, 1, 1, 0);
    testHarness.open();

    final Throwable[] error = new Throwable[1];

    final OneShotLatch latch = new OneShotLatch();

    final DummySourceContext sourceContext = new DummySourceContext() {
        @Override
        public void collect(TimestampedFileInputSplit element) {
            latch.trigger();
        }
    };

    // run the source asynchronously
    Thread runner = new Thread() {
        @Override
        public void run() {
            try {
                monitoringFunction.run(sourceContext);
            } catch (Throwable t) {
                t.printStackTrace();
                error[0] = t;
            }
        }
    };
    runner.start();

    // first condition for the source to have updated its state: emit at least one element
    if (!latch.isTriggered()) {
        latch.await();
    }

    // second condition for the source to have updated its state: it's not on the lock anymore,
    // this means it has processed all the splits and updated its state.
    synchronized (sourceContext.getCheckpointLock()) {}

    OperatorSubtaskState snapshot = testHarness.snapshot(0, 0);
    monitoringFunction.cancel();
    runner.join();

    testHarness.close();

    final ContinuousFileMonitoringFunction<String> monitoringFunctionCopy =
        createTestContinuousFileMonitoringFunction(format, FileProcessingMode.PROCESS_CONTINUOUSLY);

    StreamSource<TimestampedFileInputSplit, ContinuousFileMonitoringFunction<String>> srcCopy =
        new StreamSource<>(monitoringFunctionCopy);

    AbstractStreamOperatorTestHarness<TimestampedFileInputSplit> testHarnessCopy =
        new AbstractStreamOperatorTestHarness<>(srcCopy, 1, 1, 0);
    testHarnessCopy.initializeState(snapshot);
    testHarnessCopy.open();

    Assert.assertNull(error[0]);
    Assert.assertEquals(fileModTime, monitoringFunctionCopy.getGlobalModificationTime());

    hdfs.delete(path, false);
}
Example #9
Source File: ContinuousFileProcessingTest.java From flink with Apache License 2.0
@Test
public void testProcessOnce() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";

    final OneShotLatch latch = new OneShotLatch();

    // create a single file in the directory
    Tuple2<org.apache.hadoop.fs.Path, String> bootstrap =
        createFileAndFillWithData(testBasePath, "file", NO_OF_FILES + 1, "This is test line.");
    Assert.assertTrue(hdfs.exists(bootstrap.f0));

    // the source is supposed to read only this file.
    final Set<String> filesToBeRead = new TreeSet<>();
    filesToBeRead.add(bootstrap.f0.getName());

    TextInputFormat format = new TextInputFormat(new Path(testBasePath));
    format.setFilesFilter(FilePathFilter.createDefaultFilter());

    final ContinuousFileMonitoringFunction<String> monitoringFunction =
        createTestContinuousFileMonitoringFunction(format, FileProcessingMode.PROCESS_ONCE);

    final FileVerifyingSourceContext context = new FileVerifyingSourceContext(latch, monitoringFunction);

    final Thread t = new Thread() {
        @Override
        public void run() {
            try {
                monitoringFunction.open(new Configuration());
                monitoringFunction.run(context);

                // we would never arrive here if we were in
                // PROCESS_CONTINUOUSLY mode.

                // this will trigger the latch
                context.close();

            } catch (Exception e) {
                Assert.fail(e.getMessage());
            }
        }
    };
    t.start();

    if (!latch.isTriggered()) {
        latch.await();
    }

    // create some additional files that should be processed in the case of PROCESS_CONTINUOUSLY
    final org.apache.hadoop.fs.Path[] filesCreated = new org.apache.hadoop.fs.Path[NO_OF_FILES];
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> ignoredFile =
            createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        filesCreated[i] = ignoredFile.f0;
    }

    // wait until the monitoring thread exits
    t.join();

    Assert.assertArrayEquals(filesToBeRead.toArray(), context.getSeenFiles().toArray());

    // finally delete the files created for the test.
    hdfs.delete(bootstrap.f0, false);
    for (org.apache.hadoop.fs.Path path: filesCreated) {
        hdfs.delete(path, false);
    }
}
Example #10
Source File: ContinuousFileProcessingTest.java From flink with Apache License 2.0
@Test
public void testProcessContinuously() throws Exception {
    String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";

    final OneShotLatch latch = new OneShotLatch();

    // create a single file in the directory
    Tuple2<org.apache.hadoop.fs.Path, String> bootstrap =
        createFileAndFillWithData(testBasePath, "file", NO_OF_FILES + 1, "This is test line.");
    Assert.assertTrue(hdfs.exists(bootstrap.f0));

    final Set<String> filesToBeRead = new TreeSet<>();
    filesToBeRead.add(bootstrap.f0.getName());

    TextInputFormat format = new TextInputFormat(new Path(testBasePath));
    format.setFilesFilter(FilePathFilter.createDefaultFilter());

    final ContinuousFileMonitoringFunction<String> monitoringFunction =
        createTestContinuousFileMonitoringFunction(format, FileProcessingMode.PROCESS_CONTINUOUSLY);

    final int totalNoOfFilesToBeRead = NO_OF_FILES + 1; // 1 for the bootstrap + NO_OF_FILES

    final FileVerifyingSourceContext context =
        new FileVerifyingSourceContext(latch, monitoringFunction, 1, totalNoOfFilesToBeRead);

    final Thread t = new Thread() {
        @Override
        public void run() {
            try {
                monitoringFunction.open(new Configuration());
                monitoringFunction.run(context);
            } catch (Exception e) {
                Assert.fail(e.getMessage());
            }
        }
    };
    t.start();

    if (!latch.isTriggered()) {
        latch.await();
    }

    // create some additional files that will be processed in the case of PROCESS_CONTINUOUSLY
    final org.apache.hadoop.fs.Path[] filesCreated = new org.apache.hadoop.fs.Path[NO_OF_FILES];
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> file =
            createFileAndFillWithData(testBasePath, "file", i, "This is test line.");
        filesCreated[i] = file.f0;
        filesToBeRead.add(file.f0.getName());
    }

    // wait until the monitoring thread exits
    t.join();

    Assert.assertArrayEquals(filesToBeRead.toArray(), context.getSeenFiles().toArray());

    // finally delete the files created for the test.
    hdfs.delete(bootstrap.f0, false);
    for (org.apache.hadoop.fs.Path path: filesCreated) {
        hdfs.delete(path, false);
    }
}
Example #11
Source File: ContinuousFileProcessingTest.java From flink with Apache License 2.0
FileVerifyingSourceContext(OneShotLatch latch, ContinuousFileMonitoringFunction src) {
    this(latch, src, -1, -1);
}
Example #12
Source File: ContinuousFileProcessingMigrationTest.java From flink with Apache License 2.0
/**
 * Manually run this to write binary snapshot data. Remove @Ignore to run.
 */
@Ignore
@Test
public void writeMonitoringSourceSnapshot() throws Exception {

    File testFolder = tempFolder.newFolder();

    long fileModTime = Long.MIN_VALUE;
    for (int i = 0; i < 1; i++) {
        Tuple2<File, String> file = createFileAndFillWithData(testFolder, "file", i, "This is test line.");
        fileModTime = file.f0.lastModified();
    }

    TextInputFormat format = new TextInputFormat(new Path(testFolder.getAbsolutePath()));

    final ContinuousFileMonitoringFunction<String> monitoringFunction =
        new ContinuousFileMonitoringFunction<>(format, FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL);

    StreamSource<TimestampedFileInputSplit, ContinuousFileMonitoringFunction<String>> src =
        new StreamSource<>(monitoringFunction);

    final AbstractStreamOperatorTestHarness<TimestampedFileInputSplit> testHarness =
        new AbstractStreamOperatorTestHarness<>(src, 1, 1, 0);

    testHarness.open();

    final Throwable[] error = new Throwable[1];

    final OneShotLatch latch = new OneShotLatch();

    // run the source asynchronously
    Thread runner = new Thread() {
        @Override
        public void run() {
            try {
                monitoringFunction.run(new DummySourceContext() {
                    @Override
                    public void collect(TimestampedFileInputSplit element) {
                        latch.trigger();
                    }

                    @Override
                    public void markAsTemporarilyIdle() {
                    }
                });
            } catch (Throwable t) {
                t.printStackTrace();
                error[0] = t;
            }
        }
    };
    runner.start();

    if (!latch.isTriggered()) {
        latch.await();
    }

    final OperatorSubtaskState snapshot;
    synchronized (testHarness.getCheckpointLock()) {
        snapshot = testHarness.snapshot(0L, 0L);
    }

    OperatorSnapshotUtil.writeStateHandle(
        snapshot,
        "src/test/resources/monitoring-function-migration-test-" + fileModTime + "-flink" +
            flinkGenerateSavepointVersion + "-snapshot");

    monitoringFunction.cancel();
    runner.join();

    testHarness.close();
}
Example #13
Source File: ContinuousFileProcessingITCase.java From flink with Apache License 2.0
@Test
public void testProgram() throws Exception {

    /*
     * This test checks the interplay between the monitor and the reader
     * and also the failExternally() functionality. To test the latter we
     * set the parallelism to 1 so that we have the chaining between the sink,
     * which throws the SuccessException to signal the end of the test, and the
     * reader.
     */

    TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
    format.setFilePath(hdfsURI);
    format.setFilesFilter(FilePathFilter.createDefaultFilter());

    // create the stream execution environment with a parallelism > 1 to test
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);

    ContinuousFileMonitoringFunction<String> monitoringFunction =
        new ContinuousFileMonitoringFunction<>(format,
            FileProcessingMode.PROCESS_CONTINUOUSLY,
            env.getParallelism(), INTERVAL);

    // the monitor has always DOP 1
    DataStream<TimestampedFileInputSplit> splits = env.addSource(monitoringFunction);
    Assert.assertEquals(1, splits.getParallelism());

    TypeInformation<String> typeInfo = TypeExtractor.getInputFormatTypes(format);

    // the readers can be multiple
    DataStream<String> content =
        splits.transform("FileSplitReader", typeInfo, new ContinuousFileReaderOperatorFactory<>(format));
    Assert.assertEquals(PARALLELISM, content.getParallelism());

    // finally for the sink we set the parallelism to 1 so that we can verify the output
    TestingSinkFunction sink = new TestingSinkFunction();
    content.addSink(sink).setParallelism(1);

    CompletableFuture<Void> jobFuture = new CompletableFuture<>();
    new Thread(() -> {
        try {
            env.execute("ContinuousFileProcessingITCase Job.");
            jobFuture.complete(null);
        } catch (Exception e) {
            if (ExceptionUtils.findThrowable(e, SuccessException.class).isPresent()) {
                jobFuture.complete(null);
            } else {
                jobFuture.completeExceptionally(e);
            }
        }
    }).start();

    // The modification time of the last created file.
    long lastCreatedModTime = Long.MIN_VALUE;

    // create the files to be read
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> tmpFile;
        long modTime;
        do {

            // give it some time so that the files have
            // different modification timestamps.
            Thread.sleep(50);

            tmpFile = fillWithData(hdfsURI, "file", i, "This is test line.");

            modTime = hdfs.getFileStatus(tmpFile.f0).getModificationTime();
            if (modTime <= lastCreatedModTime) {
                // delete the last created file to recreate it with a different timestamp
                hdfs.delete(tmpFile.f0, false);
            }
        } while (modTime <= lastCreatedModTime);
        lastCreatedModTime = modTime;

        // put the contents in the expected results list before the reader picks them
        // this is to guarantee that they are in before the reader finishes (avoid race conditions)
        expectedContents.put(i, tmpFile.f1);

        org.apache.hadoop.fs.Path file = new org.apache.hadoop.fs.Path(hdfsURI + "/file" + i);
        hdfs.rename(tmpFile.f0, file);
        Assert.assertTrue(hdfs.exists(file));
    }

    jobFuture.get();
}
Example #14
Source File: ContinuousFileProcessingITCase.java From Flink-CEPplus with Apache License 2.0
@Test
public void testProgram() throws Exception {

    /*
     * This test checks the interplay between the monitor and the reader
     * and also the failExternally() functionality. To test the latter we
     * set the parallelism to 1 so that we have the chaining between the sink,
     * which throws the SuccessException to signal the end of the test, and the
     * reader.
     */

    TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
    format.setFilePath(hdfsURI);
    format.setFilesFilter(FilePathFilter.createDefaultFilter());

    // create the stream execution environment with a parallelism > 1 to test
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(PARALLELISM);

    ContinuousFileMonitoringFunction<String> monitoringFunction =
        new ContinuousFileMonitoringFunction<>(format,
            FileProcessingMode.PROCESS_CONTINUOUSLY,
            env.getParallelism(), INTERVAL);

    // the monitor has always DOP 1
    DataStream<TimestampedFileInputSplit> splits = env.addSource(monitoringFunction);
    Assert.assertEquals(1, splits.getParallelism());

    ContinuousFileReaderOperator<String> reader = new ContinuousFileReaderOperator<>(format);
    TypeInformation<String> typeInfo = TypeExtractor.getInputFormatTypes(format);

    // the readers can be multiple
    DataStream<String> content = splits.transform("FileSplitReader", typeInfo, reader);
    Assert.assertEquals(PARALLELISM, content.getParallelism());

    // finally for the sink we set the parallelism to 1 so that we can verify the output
    TestingSinkFunction sink = new TestingSinkFunction();
    content.addSink(sink).setParallelism(1);

    Thread job = new Thread() {
        @Override
        public void run() {
            try {
                env.execute("ContinuousFileProcessingITCase Job.");
            } catch (Exception e) {
                Throwable th = e;
                for (int depth = 0; depth < 20; depth++) {
                    if (th instanceof SuccessException) {
                        return;
                    } else if (th.getCause() != null) {
                        th = th.getCause();
                    } else {
                        break;
                    }
                }
                e.printStackTrace();
                Assert.fail(e.getMessage());
            }
        }
    };
    job.start();

    // The modification time of the last created file.
    long lastCreatedModTime = Long.MIN_VALUE;

    // create the files to be read
    for (int i = 0; i < NO_OF_FILES; i++) {
        Tuple2<org.apache.hadoop.fs.Path, String> tmpFile;
        long modTime;
        do {

            // give it some time so that the files have
            // different modification timestamps.
            Thread.sleep(50);

            tmpFile = fillWithData(hdfsURI, "file", i, "This is test line.");

            modTime = hdfs.getFileStatus(tmpFile.f0).getModificationTime();
            if (modTime <= lastCreatedModTime) {
                // delete the last created file to recreate it with a different timestamp
                hdfs.delete(tmpFile.f0, false);
            }
        } while (modTime <= lastCreatedModTime);
        lastCreatedModTime = modTime;

        // put the contents in the expected results list before the reader picks them
        // this is to guarantee that they are in before the reader finishes (avoid race conditions)
        expectedContents.put(i, tmpFile.f1);

        org.apache.hadoop.fs.Path file = new org.apache.hadoop.fs.Path(hdfsURI + "/file" + i);
        hdfs.rename(tmpFile.f0, file);
        Assert.assertTrue(hdfs.exists(file));
    }

    // wait for the job to finish.
    job.join();
}