Java Code Examples for org.apache.flink.api.java.typeutils.TypeExtractor#getInputFormatTypes()

The following examples show how to use org.apache.flink.api.java.typeutils.TypeExtractor#getInputFormatTypes(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
/**
 * Checks that the type information extracted directly from an {@link AvroInputFormat} and the
 * type information of the {@link DataSet} created from that format are both
 * {@link PojoTypeInfo} instances describing {@code MyAvroType}.
 *
 * @throws Exception if type extraction or input creation fails. The exception is propagated
 * 		to the test framework so that it is reported together with its full stack trace.
 */
@Test
public void testTypeExtraction() throws Exception {
	InputFormat<MyAvroType, ?> format = new AvroInputFormat<MyAvroType>(new Path("file:///ignore/this/file"), MyAvroType.class);

	// type information obtained straight from the input format
	TypeInformation<?> typeInfoDirect = TypeExtractor.getInputFormatTypes(format);

	// type information obtained through the data set created from the same format
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<MyAvroType> input = env.createInput(format);
	TypeInformation<?> typeInfoDataSet = input.getType();

	Assert.assertTrue(typeInfoDirect instanceof PojoTypeInfo);
	Assert.assertTrue(typeInfoDataSet instanceof PojoTypeInfo);

	Assert.assertEquals(MyAvroType.class, typeInfoDirect.getTypeClass());
	Assert.assertEquals(MyAvroType.class, typeInfoDataSet.getTypeClass());
}
 
Example 2
/**
 * Checks that the type information extracted directly from an {@link AvroInputFormat} and the
 * type information of the {@link DataSet} created from that format are both
 * {@link PojoTypeInfo} instances describing {@code MyAvroType}.
 *
 * @throws Exception if type extraction or input creation fails. The exception is propagated
 * 		to the test framework so that it is reported together with its full stack trace.
 */
@Test
public void testTypeExtraction() throws Exception {
	InputFormat<MyAvroType, ?> format = new AvroInputFormat<MyAvroType>(new Path("file:///ignore/this/file"), MyAvroType.class);

	// type information obtained straight from the input format
	TypeInformation<?> typeInfoDirect = TypeExtractor.getInputFormatTypes(format);

	// type information obtained through the data set created from the same format
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<MyAvroType> input = env.createInput(format);
	TypeInformation<?> typeInfoDataSet = input.getType();

	Assert.assertTrue(typeInfoDirect instanceof PojoTypeInfo);
	Assert.assertTrue(typeInfoDataSet instanceof PojoTypeInfo);

	Assert.assertEquals(MyAvroType.class, typeInfoDirect.getTypeClass());
	Assert.assertEquals(MyAvroType.class, typeInfoDataSet.getTypeClass());
}
 
Example 3
/**
 * Checks that the type information extracted directly from an {@link AvroInputFormat} and the
 * type information of the {@link DataSet} created from that format are both
 * {@link PojoTypeInfo} instances describing {@code MyAvroType}.
 *
 * @throws Exception if type extraction or input creation fails. The exception is propagated
 * 		to the test framework so that it is reported together with its full stack trace.
 */
@Test
public void testTypeExtraction() throws Exception {
	InputFormat<MyAvroType, ?> format = new AvroInputFormat<MyAvroType>(new Path("file:///ignore/this/file"), MyAvroType.class);

	// type information obtained straight from the input format
	TypeInformation<?> typeInfoDirect = TypeExtractor.getInputFormatTypes(format);

	// type information obtained through the data set created from the same format
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<MyAvroType> input = env.createInput(format);
	TypeInformation<?> typeInfoDataSet = input.getType();

	Assert.assertTrue(typeInfoDirect instanceof PojoTypeInfo);
	Assert.assertTrue(typeInfoDataSet instanceof PojoTypeInfo);

	Assert.assertEquals(MyAvroType.class, typeInfoDirect.getTypeClass());
	Assert.assertEquals(MyAvroType.class, typeInfoDataSet.getTypeClass());
}
 
Example 4
/**
 * Creates a test harness around a {@link ContinuousFileReaderOperatorFactory} built for the
 * given format and switches the harness to event time.
 *
 * @param format
 * 		The input format handed to the reader operator factory
 * @param noOfTasks
 * 		Passed to the harness together with {@code maxParallelism} and {@code taskIdx}
 * @param taskIdx
 * 		Passed to the harness as its last constructor argument
 * @return The harness, with its time characteristic set to event time
 */
private OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, String> getTestHarness(
	BlockingFileInputFormat format,
	int noOfTasks,
	int taskIdx) throws Exception {
	final OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, String> harness =
		new OneInputStreamOperatorTestHarness<>(
			new ContinuousFileReaderOperatorFactory<>(
				format,
				TypeExtractor.getInputFormatTypes(format),
				new ExecutionConfig()),
			maxParallelism, noOfTasks, taskIdx);

	harness.setTimeCharacteristic(TimeCharacteristic.EventTime);

	return harness;
}
 
Example 5
/**
 * Runs a job consisting of a file monitoring source, a continuous file reader and a testing
 * sink, while the test thread creates the files to be read.
 *
 * <p>The job runs on a separate thread. A failure of the job is recorded and re-thrown on the
 * test thread after the job thread has finished, so that it actually fails the test.
 *
 * @throws Exception if creating the input files fails or the test thread is interrupted
 */
@Test
public void testProgram() throws Exception {

	/*
	* This test checks the interplay between the monitor and the reader
	* and also the failExternally() functionality. To test the latter we
	* set the parallelism to 1 so that we have the chaining between the sink,
	* which throws the SuccessException to signal the end of the test, and the
	* reader.
	* */

	TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
	format.setFilePath(hdfsURI);
	format.setFilesFilter(FilePathFilter.createDefaultFilter());

	// create the stream execution environment with a parallelism > 1 to test
	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(PARALLELISM);

	ContinuousFileMonitoringFunction<String> monitoringFunction =
		new ContinuousFileMonitoringFunction<>(format,
			FileProcessingMode.PROCESS_CONTINUOUSLY,
			env.getParallelism(), INTERVAL);

	// the monitor has always DOP 1
	DataStream<TimestampedFileInputSplit> splits = env.addSource(monitoringFunction);
	Assert.assertEquals(1, splits.getParallelism());

	ContinuousFileReaderOperator<String> reader = new ContinuousFileReaderOperator<>(format);
	TypeInformation<String> typeInfo = TypeExtractor.getInputFormatTypes(format);

	// the readers can be multiple
	DataStream<String> content = splits.transform("FileSplitReader", typeInfo, reader);
	Assert.assertEquals(PARALLELISM, content.getParallelism());

	// finally for the sink we set the parallelism to 1 so that we can verify the output
	TestingSinkFunction sink = new TestingSinkFunction();
	content.addSink(sink).setParallelism(1);

	// Holds the failure of the job, if any. An Assert.fail() on the job thread would only
	// terminate that thread and the test would still be reported as successful, so the
	// failure is handed over to the test thread instead.
	final java.util.concurrent.atomic.AtomicReference<Throwable> jobFailure =
		new java.util.concurrent.atomic.AtomicReference<>();

	Thread job = new Thread() {

		@Override
		public void run() {
			try {
				env.execute("ContinuousFileProcessingITCase Job.");
			} catch (Exception e) {
				// a SuccessException somewhere in the cause chain is the expected way
				// for the job to end; the search is limited to 20 levels
				Throwable th = e;
				for (int depth = 0; depth < 20; depth++) {
					if (th instanceof SuccessException) {
						return;
					} else if (th.getCause() != null) {
						th = th.getCause();
					} else {
						break;
					}
				}
				jobFailure.set(e);
			}
		}
	};
	job.start();

	// The modification time of the last created file.
	long lastCreatedModTime = Long.MIN_VALUE;

	// create the files to be read
	for (int i = 0; i < NO_OF_FILES; i++) {
		Tuple2<org.apache.hadoop.fs.Path, String> tmpFile;
		long modTime;
		do {

			// give it some time so that the files have
			// different modification timestamps.
			Thread.sleep(50);

			tmpFile = fillWithData(hdfsURI, "file", i, "This is test line.");

			modTime = hdfs.getFileStatus(tmpFile.f0).getModificationTime();
			if (modTime <= lastCreatedModTime) {
				// delete the last created file to recreate it with a different timestamp
				hdfs.delete(tmpFile.f0, false);
			}
		} while (modTime <= lastCreatedModTime);
		lastCreatedModTime = modTime;

		// put the contents in the expected results list before the reader picks them
		// this is to guarantee that they are in before the reader finishes (avoid race conditions)
		expectedContents.put(i, tmpFile.f1);

		org.apache.hadoop.fs.Path file =
			new org.apache.hadoop.fs.Path(hdfsURI + "/file" + i);
		hdfs.rename(tmpFile.f0, file);
		Assert.assertTrue(hdfs.exists(file));
	}

	// wait for the job to finish.
	job.join();

	// fail the test on the test thread if the job ended with anything but a SuccessException
	Throwable failure = jobFailure.get();
	if (failure != null) {
		throw new AssertionError("The job failed: " + failure.getMessage(), failure);
	}
}
 
Example 6
/**
 * Initializes a {@link ContinuousFileReaderOperator} from the snapshot resource of the Flink
 * version under test, lets the reader run, and verifies that its output contains the four
 * expected splits.
 */
@Test
public void testReaderRestore() throws Exception {
	final File scratchDir = tempFolder.newFolder();
	final OneShotLatch readGate = new OneShotLatch();

	final BlockingFileInputFormat inputFormat =
		new BlockingFileInputFormat(readGate, new Path(scratchDir.getAbsolutePath()));
	final TypeInformation<FileInputSplit> outputType = TypeExtractor.getInputFormatTypes(inputFormat);

	final ContinuousFileReaderOperator<FileInputSplit> readerOperator =
		new ContinuousFileReaderOperator<>(inputFormat);
	readerOperator.setOutputType(outputType, new ExecutionConfig());

	final OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, FileInputSplit> harness =
		new OneInputStreamOperatorTestHarness<>(readerOperator);
	harness.setTimeCharacteristic(TimeCharacteristic.EventTime);

	// set up, load the stored snapshot, then open the operator
	harness.setup();
	harness.initializeState(
		OperatorSnapshotUtil.getResourceFilename(
			"reader-migration-test-flink" + testMigrateVersion + "-snapshot"));
	harness.open();

	// release the blocking input format
	readGate.trigger();

	// close the operator while holding the checkpoint lock
	synchronized (harness.getCheckpointLock()) {
		harness.close();
	}

	// the splits that have to show up in the output
	final TimestampedFileInputSplit[] expectedSplits = {
		new TimestampedFileInputSplit(0, 3, new Path("test/test1"), 0, 100, null),
		new TimestampedFileInputSplit(10, 2, new Path("test/test2"), 101, 200, null),
		new TimestampedFileInputSplit(10, 1, new Path("test/test2"), 0, 100, null),
		new TimestampedFileInputSplit(11, 0, new Path("test/test3"), 0, 100, null)
	};

	for (TimestampedFileInputSplit expected : expectedSplits) {
		Assert.assertTrue(harness.getOutput().contains(new StreamRecord<>(expected)));
	}
}
 
Example 7
/**
 * Feeds four splits into a reader operator, snapshots it, restores a second reader from that
 * snapshot, and verifies that both readers produce the same output containing the four splits.
 */
@Test
public void testReaderSnapshotRestore() throws Exception {
	final String basePath = hdfsURI + "/" + UUID.randomUUID() + "/";

	final TimestampedFileInputSplit[] pendingSplits = {
		new TimestampedFileInputSplit(0, 3, new Path("test/test1"), 0, 100, null),
		new TimestampedFileInputSplit(10, 2, new Path("test/test2"), 101, 200, null),
		new TimestampedFileInputSplit(10, 1, new Path("test/test2"), 0, 100, null),
		new TimestampedFileInputSplit(11, 0, new Path("test/test3"), 0, 100, null)
	};

	final OneShotLatch readGate = new OneShotLatch();

	final BlockingFileInputFormat inputFormat = new BlockingFileInputFormat(readGate, new Path(basePath));
	final TypeInformation<FileInputSplit> outputType = TypeExtractor.getInputFormatTypes(inputFormat);

	final ContinuousFileReaderOperator<FileInputSplit> originalReader =
		new ContinuousFileReaderOperator<>(inputFormat);
	originalReader.setOutputType(outputType, new ExecutionConfig());

	final OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, FileInputSplit> originalHarness =
		new OneInputStreamOperatorTestHarness<>(originalReader);
	originalHarness.setTimeCharacteristic(TimeCharacteristic.EventTime);
	originalHarness.open();

	// hand the splits to the first reader so that it has state to snapshot
	for (TimestampedFileInputSplit split : pendingSplits) {
		originalHarness.processElement(new StreamRecord<>(split));
	}

	// snapshot the first reader; the second reader is initialized from this
	// snapshot and the outputs of the two are compared at the end
	final OperatorSubtaskState snapshot;
	synchronized (originalHarness.getCheckpointLock()) {
		snapshot = originalHarness.snapshot(0L, 0L);
	}

	final ContinuousFileReaderOperator<FileInputSplit> restoredReader =
		new ContinuousFileReaderOperator<>(new BlockingFileInputFormat(readGate, new Path(basePath)));
	restoredReader.setOutputType(outputType, new ExecutionConfig());

	final OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, FileInputSplit> restoredHarness =
		new OneInputStreamOperatorTestHarness<>(restoredReader);
	restoredHarness.setTimeCharacteristic(TimeCharacteristic.EventTime);

	restoredHarness.initializeState(snapshot);
	restoredHarness.open();

	// release both blocking input formats
	readGate.trigger();

	// close both operators, each under its own checkpoint lock
	synchronized (originalHarness.getCheckpointLock()) {
		originalHarness.close();
	}

	synchronized (restoredHarness.getCheckpointLock()) {
		restoredHarness.close();
	}

	// convert all splits first, then check that each one is part of the first reader's output
	final FileInputSplit[] expectedSplits = new FileInputSplit[pendingSplits.length];
	for (int i = 0; i < pendingSplits.length; i++) {
		expectedSplits[i] = createSplitFromTimestampedSplit(pendingSplits[i]);
	}

	for (FileInputSplit expected : expectedSplits) {
		Assert.assertTrue(originalHarness.getOutput().contains(new StreamRecord<>(expected)));
	}

	// both readers must have emitted exactly the same elements
	Assert.assertArrayEquals(
		originalHarness.getOutput().toArray(),
		restoredHarness.getOutput().toArray()
	);
}
 
Example 8
/**
 * Reads the contents of the user-specified {@code filePath} based on the given {@link FileInputFormat}.
 * Depending on the provided {@link FileProcessingMode}, the source either monitors the path and reacts
 * to new data, or processes the path once and exits. The given {@code filter} is set on the
 * {@code inputFormat} before the stream is created.
 *
 * <p>See {@link #readFile(FileInputFormat, String, FileProcessingMode, long)}
 *
 * @param inputFormat
 * 		The input format used to create the data stream
 * @param filePath
 * 		The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path")
 * @param watchType
 * 		The mode in which the source should operate, i.e. monitor path and react to new data, or process once and exit
 * @param interval
 * 		In the case of periodic path monitoring, this specifies the interval (in millis) between consecutive path scans
 * @param filter
 * 		The files to be excluded from the processing
 * @param <OUT>
 * 		The type of the returned data stream
 * @return The data stream that represents the data read from the given file
 * @throws InvalidProgramException
 * 		If the type produced by the input format cannot be determined; the original failure is attached as the cause
 *
 * @deprecated Use {@link FileInputFormat#setFilesFilter(FilePathFilter)} to set a filter and
 * 		{@link StreamExecutionEnvironment#readFile(FileInputFormat, String, FileProcessingMode, long)}
 *
 */
@PublicEvolving
@Deprecated
public <OUT> DataStreamSource<OUT> readFile(FileInputFormat<OUT> inputFormat,
											String filePath,
											FileProcessingMode watchType,
											long interval,
											FilePathFilter filter) {
	inputFormat.setFilesFilter(filter);

	TypeInformation<OUT> typeInformation;
	try {
		typeInformation = TypeExtractor.getInputFormatTypes(inputFormat);
	} catch (Exception e) {
		// keep the original exception as the cause so the reason for the failed extraction is not lost
		throw new InvalidProgramException("The type returned by the input format could not be " +
				"automatically determined. Please specify the TypeInformation of the produced type " +
				"explicitly by using the 'createInput(InputFormat, TypeInformation)' method instead.", e);
	}
	return readFile(inputFormat, filePath, watchType, interval, typeInformation);
}
 
Example 9
/**
 * Reads the contents of the user-specified {@code filePath} based on the given {@link FileInputFormat}. Depending
 * on the provided {@link FileProcessingMode}, the source may periodically monitor (every {@code interval} ms) the path
 * for new data ({@link FileProcessingMode#PROCESS_CONTINUOUSLY}), or process once the data currently in the path and
 * exit ({@link FileProcessingMode#PROCESS_ONCE}). In addition, if the path contains files not to be processed, the user
 * can specify a custom {@link FilePathFilter}. As a default implementation you can use
 * {@link FilePathFilter#createDefaultFilter()}.
 *
 * <p>Since all data streams need specific information about their types, this method needs to determine the
 * type of the data produced by the input format. It will attempt to determine the data type by reflection,
 * unless the input format implements the {@link org.apache.flink.api.java.typeutils.ResultTypeQueryable} interface.
 * In the latter case, this method will invoke the
 * {@link org.apache.flink.api.java.typeutils.ResultTypeQueryable#getProducedType()} method to determine data
 * type produced by the input format.
 *
 * <p><b>NOTES ON CHECKPOINTING: </b> If the {@code watchType} is set to {@link FileProcessingMode#PROCESS_ONCE},
 * the source monitors the path <b>once</b>, creates the {@link org.apache.flink.core.fs.FileInputSplit FileInputSplits}
 * to be processed, forwards them to the downstream readers to read the actual data,
 * and exits, without waiting for the readers to finish reading. This implies that no more checkpoint barriers
 * are going to be forwarded after the source exits, thus having no checkpoints after that point.
 *
 * @param inputFormat
 * 		The input format used to create the data stream
 * @param filePath
 * 		The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path")
 * @param watchType
 * 		The mode in which the source should operate, i.e. monitor path and react to new data, or process once and exit
 * @param interval
 * 		In the case of periodic path monitoring, this specifies the interval (in millis) between consecutive path scans
 * @param <OUT>
 * 		The type of the returned data stream
 * @return The data stream that represents the data read from the given file
 * @throws InvalidProgramException
 * 		If the type produced by the input format cannot be determined; the original failure is attached as the cause
 */
@PublicEvolving
public <OUT> DataStreamSource<OUT> readFile(FileInputFormat<OUT> inputFormat,
											String filePath,
											FileProcessingMode watchType,
											long interval) {

	TypeInformation<OUT> typeInformation;
	try {
		typeInformation = TypeExtractor.getInputFormatTypes(inputFormat);
	} catch (Exception e) {
		// keep the original exception as the cause so the reason for the failed extraction is not lost
		throw new InvalidProgramException("The type returned by the input format could not be " +
				"automatically determined. Please specify the TypeInformation of the produced type " +
				"explicitly by using the 'createInput(InputFormat, TypeInformation)' method instead.", e);
	}
	return readFile(inputFormat, filePath, watchType, interval, typeInformation);
}
 
Example 10
/**
 * Runs a job consisting of a file monitoring source, a continuous file reader and a testing
 * sink, while the test thread creates the files to be read.
 *
 * <p>The job runs on a separate thread. A failure of the job is recorded and re-thrown on the
 * test thread after the job thread has finished, so that it actually fails the test.
 *
 * @throws Exception if creating the input files fails or the test thread is interrupted
 */
@Test
public void testProgram() throws Exception {

	/*
	* This test checks the interplay between the monitor and the reader
	* and also the failExternally() functionality. To test the latter we
	* set the parallelism to 1 so that we have the chaining between the sink,
	* which throws the SuccessException to signal the end of the test, and the
	* reader.
	* */

	TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
	format.setFilePath(hdfsURI);
	format.setFilesFilter(FilePathFilter.createDefaultFilter());

	// create the stream execution environment with a parallelism > 1 to test
	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(PARALLELISM);

	ContinuousFileMonitoringFunction<String> monitoringFunction =
		new ContinuousFileMonitoringFunction<>(format,
			FileProcessingMode.PROCESS_CONTINUOUSLY,
			env.getParallelism(), INTERVAL);

	// the monitor has always DOP 1
	DataStream<TimestampedFileInputSplit> splits = env.addSource(monitoringFunction);
	Assert.assertEquals(1, splits.getParallelism());

	ContinuousFileReaderOperator<String> reader = new ContinuousFileReaderOperator<>(format);
	TypeInformation<String> typeInfo = TypeExtractor.getInputFormatTypes(format);

	// the readers can be multiple
	DataStream<String> content = splits.transform("FileSplitReader", typeInfo, reader);
	Assert.assertEquals(PARALLELISM, content.getParallelism());

	// finally for the sink we set the parallelism to 1 so that we can verify the output
	TestingSinkFunction sink = new TestingSinkFunction();
	content.addSink(sink).setParallelism(1);

	// Holds the failure of the job, if any. An Assert.fail() on the job thread would only
	// terminate that thread and the test would still be reported as successful, so the
	// failure is handed over to the test thread instead.
	final java.util.concurrent.atomic.AtomicReference<Throwable> jobFailure =
		new java.util.concurrent.atomic.AtomicReference<>();

	Thread job = new Thread() {

		@Override
		public void run() {
			try {
				env.execute("ContinuousFileProcessingITCase Job.");
			} catch (Exception e) {
				// a SuccessException somewhere in the cause chain is the expected way
				// for the job to end; the search is limited to 20 levels
				Throwable th = e;
				for (int depth = 0; depth < 20; depth++) {
					if (th instanceof SuccessException) {
						return;
					} else if (th.getCause() != null) {
						th = th.getCause();
					} else {
						break;
					}
				}
				jobFailure.set(e);
			}
		}
	};
	job.start();

	// The modification time of the last created file.
	long lastCreatedModTime = Long.MIN_VALUE;

	// create the files to be read
	for (int i = 0; i < NO_OF_FILES; i++) {
		Tuple2<org.apache.hadoop.fs.Path, String> tmpFile;
		long modTime;
		do {

			// give it some time so that the files have
			// different modification timestamps.
			Thread.sleep(50);

			tmpFile = fillWithData(hdfsURI, "file", i, "This is test line.");

			modTime = hdfs.getFileStatus(tmpFile.f0).getModificationTime();
			if (modTime <= lastCreatedModTime) {
				// delete the last created file to recreate it with a different timestamp
				hdfs.delete(tmpFile.f0, false);
			}
		} while (modTime <= lastCreatedModTime);
		lastCreatedModTime = modTime;

		// put the contents in the expected results list before the reader picks them
		// this is to guarantee that they are in before the reader finishes (avoid race conditions)
		expectedContents.put(i, tmpFile.f1);

		org.apache.hadoop.fs.Path file =
			new org.apache.hadoop.fs.Path(hdfsURI + "/file" + i);
		hdfs.rename(tmpFile.f0, file);
		Assert.assertTrue(hdfs.exists(file));
	}

	// wait for the job to finish.
	job.join();

	// fail the test on the test thread if the job ended with anything but a SuccessException
	Throwable failure = jobFailure.get();
	if (failure != null) {
		throw new AssertionError("The job failed: " + failure.getMessage(), failure);
	}
}
 
Example 11
/**
 * Writes the binary snapshot of a reader operator that holds four unprocessed splits.
 * Meant to be run manually: remove @Ignore to run.
 */
@Ignore
@Test
public void writeReaderSnapshot() throws Exception {

	final File scratchDir = tempFolder.newFolder();

	final TimestampedFileInputSplit[] pendingSplits = {
		new TimestampedFileInputSplit(0, 3, new Path("test/test1"), 0, 100, null),
		new TimestampedFileInputSplit(10, 2, new Path("test/test2"), 101, 200, null),
		new TimestampedFileInputSplit(10, 1, new Path("test/test2"), 0, 100, null),
		new TimestampedFileInputSplit(11, 0, new Path("test/test3"), 0, 100, null)
	};

	// the latch is never triggered in this method, so the reader does not do any actual
	// processing and the state for the four splits is kept
	final OneShotLatch neverTriggered = new OneShotLatch();
	final BlockingFileInputFormat inputFormat =
		new BlockingFileInputFormat(neverTriggered, new Path(scratchDir.getAbsolutePath()));

	final TypeInformation<FileInputSplit> outputType = TypeExtractor.getInputFormatTypes(inputFormat);
	final ContinuousFileReaderOperator<FileInputSplit> readerOperator =
		new ContinuousFileReaderOperator<>(inputFormat);
	readerOperator.setOutputType(outputType, new ExecutionConfig());

	final OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, FileInputSplit> harness =
		new OneInputStreamOperatorTestHarness<>(readerOperator);
	harness.setTimeCharacteristic(TimeCharacteristic.EventTime);
	harness.open();

	// hand the splits to the reader so that it has state to snapshot
	for (TimestampedFileInputSplit split : pendingSplits) {
		harness.processElement(new StreamRecord<>(split));
	}

	// snapshot the operator state while holding the checkpoint lock
	final OperatorSubtaskState snapshot;
	synchronized (harness.getCheckpointLock()) {
		snapshot = harness.snapshot(0L, 0L);
	}

	// persist the snapshot under the test resources, named after the generating Flink version
	OperatorSnapshotUtil.writeStateHandle(
		snapshot,
		"src/test/resources/reader-migration-test-flink" + flinkGenerateSavepointVersion + "-snapshot");
}
 
Example 12
/**
 * Initializes a {@link ContinuousFileReaderOperator} from the snapshot resource of the Flink
 * version under test, lets the reader run, and verifies that its output contains the four
 * expected splits.
 */
@Test
public void testReaderRestore() throws Exception {
	final File scratchDir = tempFolder.newFolder();
	final OneShotLatch readGate = new OneShotLatch();

	final BlockingFileInputFormat inputFormat =
		new BlockingFileInputFormat(readGate, new Path(scratchDir.getAbsolutePath()));
	final TypeInformation<FileInputSplit> outputType = TypeExtractor.getInputFormatTypes(inputFormat);

	final ContinuousFileReaderOperator<FileInputSplit> readerOperator =
		new ContinuousFileReaderOperator<>(inputFormat);
	readerOperator.setOutputType(outputType, new ExecutionConfig());

	final OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, FileInputSplit> harness =
		new OneInputStreamOperatorTestHarness<>(readerOperator);
	harness.setTimeCharacteristic(TimeCharacteristic.EventTime);

	// set up, load the stored snapshot, then open the operator
	harness.setup();
	harness.initializeState(
		OperatorSnapshotUtil.getResourceFilename(
			"reader-migration-test-flink" + testMigrateVersion + "-snapshot"));
	harness.open();

	// release the blocking input format
	readGate.trigger();

	// close the operator while holding the checkpoint lock
	synchronized (harness.getCheckpointLock()) {
		harness.close();
	}

	// the splits that have to show up in the output
	final TimestampedFileInputSplit[] expectedSplits = {
		new TimestampedFileInputSplit(0, 3, new Path("test/test1"), 0, 100, null),
		new TimestampedFileInputSplit(10, 2, new Path("test/test2"), 101, 200, null),
		new TimestampedFileInputSplit(10, 1, new Path("test/test2"), 0, 100, null),
		new TimestampedFileInputSplit(11, 0, new Path("test/test3"), 0, 100, null)
	};

	for (TimestampedFileInputSplit expected : expectedSplits) {
		Assert.assertTrue(harness.getOutput().contains(new StreamRecord<>(expected)));
	}
}
 
Example 13
/**
 * Feeds four splits into a reader operator, snapshots it, restores a second reader from that
 * snapshot, and verifies that both readers produce the same output containing the four splits.
 */
@Test
public void testReaderSnapshotRestore() throws Exception {
	final String basePath = hdfsURI + "/" + UUID.randomUUID() + "/";

	final TimestampedFileInputSplit[] pendingSplits = {
		new TimestampedFileInputSplit(0, 3, new Path("test/test1"), 0, 100, null),
		new TimestampedFileInputSplit(10, 2, new Path("test/test2"), 101, 200, null),
		new TimestampedFileInputSplit(10, 1, new Path("test/test2"), 0, 100, null),
		new TimestampedFileInputSplit(11, 0, new Path("test/test3"), 0, 100, null)
	};

	final OneShotLatch readGate = new OneShotLatch();

	final BlockingFileInputFormat inputFormat = new BlockingFileInputFormat(readGate, new Path(basePath));
	final TypeInformation<FileInputSplit> outputType = TypeExtractor.getInputFormatTypes(inputFormat);

	final ContinuousFileReaderOperator<FileInputSplit> originalReader =
		new ContinuousFileReaderOperator<>(inputFormat);
	originalReader.setOutputType(outputType, new ExecutionConfig());

	final OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, FileInputSplit> originalHarness =
		new OneInputStreamOperatorTestHarness<>(originalReader);
	originalHarness.setTimeCharacteristic(TimeCharacteristic.EventTime);
	originalHarness.open();

	// hand the splits to the first reader so that it has state to snapshot
	for (TimestampedFileInputSplit split : pendingSplits) {
		originalHarness.processElement(new StreamRecord<>(split));
	}

	// snapshot the first reader; the second reader is initialized from this
	// snapshot and the outputs of the two are compared at the end
	final OperatorSubtaskState snapshot;
	synchronized (originalHarness.getCheckpointLock()) {
		snapshot = originalHarness.snapshot(0L, 0L);
	}

	final ContinuousFileReaderOperator<FileInputSplit> restoredReader =
		new ContinuousFileReaderOperator<>(new BlockingFileInputFormat(readGate, new Path(basePath)));
	restoredReader.setOutputType(outputType, new ExecutionConfig());

	final OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, FileInputSplit> restoredHarness =
		new OneInputStreamOperatorTestHarness<>(restoredReader);
	restoredHarness.setTimeCharacteristic(TimeCharacteristic.EventTime);

	restoredHarness.initializeState(snapshot);
	restoredHarness.open();

	// release both blocking input formats
	readGate.trigger();

	// close both operators, each under its own checkpoint lock
	synchronized (originalHarness.getCheckpointLock()) {
		originalHarness.close();
	}

	synchronized (restoredHarness.getCheckpointLock()) {
		restoredHarness.close();
	}

	// convert all splits first, then check that each one is part of the first reader's output
	final FileInputSplit[] expectedSplits = new FileInputSplit[pendingSplits.length];
	for (int i = 0; i < pendingSplits.length; i++) {
		expectedSplits[i] = createSplitFromTimestampedSplit(pendingSplits[i]);
	}

	for (FileInputSplit expected : expectedSplits) {
		Assert.assertTrue(originalHarness.getOutput().contains(new StreamRecord<>(expected)));
	}

	// both readers must have emitted exactly the same elements
	Assert.assertArrayEquals(
		originalHarness.getOutput().toArray(),
		restoredHarness.getOutput().toArray()
	);
}
 
Example 14
/**
 * Reads the contents of the user-specified {@code filePath} based on the given {@link FileInputFormat}.
 * Depending on the provided {@link FileProcessingMode}, the source either monitors the path and reacts
 * to new data, or processes the path once and exits. The given {@code filter} is set on the
 * {@code inputFormat} before the stream is created.
 *
 * <p>See {@link #readFile(FileInputFormat, String, FileProcessingMode, long)}
 *
 * @param inputFormat
 * 		The input format used to create the data stream
 * @param filePath
 * 		The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path")
 * @param watchType
 * 		The mode in which the source should operate, i.e. monitor path and react to new data, or process once and exit
 * @param interval
 * 		In the case of periodic path monitoring, this specifies the interval (in millis) between consecutive path scans
 * @param filter
 * 		The files to be excluded from the processing
 * @param <OUT>
 * 		The type of the returned data stream
 * @return The data stream that represents the data read from the given file
 * @throws InvalidProgramException
 * 		If the type produced by the input format cannot be determined; the original failure is attached as the cause
 *
 * @deprecated Use {@link FileInputFormat#setFilesFilter(FilePathFilter)} to set a filter and
 * 		{@link StreamExecutionEnvironment#readFile(FileInputFormat, String, FileProcessingMode, long)}
 *
 */
@PublicEvolving
@Deprecated
public <OUT> DataStreamSource<OUT> readFile(FileInputFormat<OUT> inputFormat,
											String filePath,
											FileProcessingMode watchType,
											long interval,
											FilePathFilter filter) {
	inputFormat.setFilesFilter(filter);

	TypeInformation<OUT> typeInformation;
	try {
		typeInformation = TypeExtractor.getInputFormatTypes(inputFormat);
	} catch (Exception e) {
		// keep the original exception as the cause so the reason for the failed extraction is not lost
		throw new InvalidProgramException("The type returned by the input format could not be " +
				"automatically determined. Please specify the TypeInformation of the produced type " +
				"explicitly by using the 'createInput(InputFormat, TypeInformation)' method instead.", e);
	}
	return readFile(inputFormat, filePath, watchType, interval, typeInformation);
}
 
Example 15
/**
 * Creates a test harness around a {@link ContinuousFileReaderOperatorFactory} for the given format.
 * The same {@link ExecutionConfig} instance is used for the factory and for creating the
 * serializer of the {@link TimestampedFileInputSplit} input type.
 *
 * @param format
 * 		The input format handed to the reader operator factory
 * @param <T>
 * 		The type produced by the input format
 * @return The newly created harness
 */
private <T> OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, T> createHarness(FileInputFormat<T> format) throws Exception {
	final ExecutionConfig executionConfig = new ExecutionConfig();
	final TypeInformation<T> producedType = TypeExtractor.getInputFormatTypes(format);

	return new OneInputStreamOperatorTestHarness<>(
		new ContinuousFileReaderOperatorFactory<>(format, producedType, executionConfig),
		TypeExtractor.getForClass(TimestampedFileInputSplit.class).createSerializer(executionConfig));
}
 
Example 16
/**
 * Exercises the file monitor together with the split readers in a running job.
 *
 * <p>The job is executed on a background thread while this thread creates {@code NO_OF_FILES}
 * files, one after the other, each with a strictly newer modification time than the previous
 * one. The test ends when the job either returns normally or fails with an exception chain
 * containing a {@code SuccessException}; any other failure is rethrown through the future.
 *
 * <p>As noted by the original author, this also covers the failExternally() functionality:
 * the sink runs with parallelism 1 so that it is chained to the reader and can signal the end
 * of the test by throwing the {@code SuccessException}.
 */
@Test
public void testProgram() throws Exception {

	final TextInputFormat textFormat = new TextInputFormat(new Path(hdfsURI));
	textFormat.setFilePath(hdfsURI);
	textFormat.setFilesFilter(FilePathFilter.createDefaultFilter());

	// job-wide parallelism; the monitoring source is asserted below to still run with parallelism 1
	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(PARALLELISM);

	ContinuousFileMonitoringFunction<String> monitor = new ContinuousFileMonitoringFunction<>(
		textFormat,
		FileProcessingMode.PROCESS_CONTINUOUSLY,
		env.getParallelism(),
		INTERVAL);

	DataStream<TimestampedFileInputSplit> splitStream = env.addSource(monitor);
	Assert.assertEquals(1, splitStream.getParallelism());

	TypeInformation<String> lineType = TypeExtractor.getInputFormatTypes(textFormat);

	// unlike the monitor, the readers are expected to run with the configured parallelism
	DataStream<String> lines = splitStream.transform(
		"FileSplitReader",
		lineType,
		new ContinuousFileReaderOperatorFactory<>(textFormat));
	Assert.assertEquals(PARALLELISM, lines.getParallelism());

	// a single sink instance, so that the output can be verified in one place
	TestingSinkFunction testSink = new TestingSinkFunction();
	lines.addSink(testSink).setParallelism(1);

	// run the job in the background; the future reports how it ended
	final CompletableFuture<Void> jobFuture = new CompletableFuture<>();
	Thread jobRunner = new Thread(() -> {
		try {
			env.execute("ContinuousFileProcessingITCase Job.");
			jobFuture.complete(null);
		} catch (Exception e) {
			boolean endedBySuccessSignal =
				ExceptionUtils.findThrowable(e, SuccessException.class).isPresent();
			if (!endedBySuccessSignal) {
				jobFuture.completeExceptionally(e);
				return;
			}
			jobFuture.complete(null);
		}
	});
	jobRunner.start();

	// modification time of the most recently accepted file
	long newestModTime = Long.MIN_VALUE;

	for (int fileIdx = 0; fileIdx < NO_OF_FILES; fileIdx++) {
		Tuple2<org.apache.hadoop.fs.Path, String> created;
		long createdModTime;
		while (true) {
			// wait a little so that consecutive files get different modification timestamps
			Thread.sleep(50);

			created = fillWithData(hdfsURI, "file", fileIdx, "This is test line.");
			createdModTime = hdfs.getFileStatus(created.f0).getModificationTime();

			if (createdModTime > newestModTime) {
				break;
			}
			// timestamp did not advance: drop the file and create it again
			hdfs.delete(created.f0, false);
		}
		newestModTime = createdModTime;

		// register the expected content before the file is renamed into place, so that it is
		// already recorded when the reader picks the file up (avoids a race condition)
		expectedContents.put(fileIdx, created.f1);

		org.apache.hadoop.fs.Path target = new org.apache.hadoop.fs.Path(hdfsURI + "/file" + fileIdx);
		hdfs.rename(created.f0, target);
		Assert.assertTrue(hdfs.exists(target));
	}

	jobFuture.get();
}
 
Example 17
/**
 * Restores a reader operator from the stored snapshot resource
 * {@code "reader-migration-test-flink" + testMigrateVersion + "-snapshot"} and verifies
 * that, after the blocking format is released and the operator is closed, the harness output
 * contains the four expected splits.
 */
@Test
public void testReaderRestore() throws Exception {
	File testFolder = tempFolder.newFolder();

	final OneShotLatch latch = new OneShotLatch();

	// the produced type is extracted inside createHarness, so no separate TypeInformation
	// local is needed here
	BlockingFileInputFormat format = new BlockingFileInputFormat(latch, new Path(testFolder.getAbsolutePath()));

	OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, FileInputSplit> testHarness = createHarness(format);
	testHarness.setTimeCharacteristic(TimeCharacteristic.EventTime);

	testHarness.setup();

	testHarness.initializeState(
		OperatorSnapshotUtil.getResourceFilename(
			"reader-migration-test-flink" + testMigrateVersion + "-snapshot"));

	testHarness.open();

	// release the blocking input format
	latch.trigger();

	// ... and wait for the operators to close gracefully

	synchronized (testHarness.getCheckpointLock()) {
		testHarness.close();
	}

	TimestampedFileInputSplit split1 =
			new TimestampedFileInputSplit(0, 3, new Path("test/test1"), 0, 100, null);

	TimestampedFileInputSplit split2 =
			new TimestampedFileInputSplit(10, 2, new Path("test/test2"), 101, 200, null);

	TimestampedFileInputSplit split3 =
			new TimestampedFileInputSplit(10, 1, new Path("test/test2"), 0, 100, null);

	TimestampedFileInputSplit split4 =
			new TimestampedFileInputSplit(11, 0, new Path("test/test3"), 0, 100, null);

	// check that the restored operator emitted every expected split

	Assert.assertTrue(testHarness.getOutput().contains(new StreamRecord<>(split1)));
	Assert.assertTrue(testHarness.getOutput().contains(new StreamRecord<>(split2)));
	Assert.assertTrue(testHarness.getOutput().contains(new StreamRecord<>(split3)));
	Assert.assertTrue(testHarness.getOutput().contains(new StreamRecord<>(split4)));
}
 
Example 18
/**
 * Creates a one-input test harness wrapping a {@link ContinuousFileReaderOperatorFactory}
 * for the given blocking format.
 *
 * <p>The produced type is extracted from the format via
 * {@code TypeExtractor.getInputFormatTypes}, and the harness input serializer is created for
 * {@code TimestampedFileInputSplit}; both use one fresh {@code ExecutionConfig}.
 *
 * @param format the blocking file input format the reader operator factory is built with
 * @return the newly constructed test harness
 * @throws Exception declared by this method; presumably propagated from harness construction
 */
private OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, FileInputSplit> createHarness(BlockingFileInputFormat format) throws Exception {
	ExecutionConfig config = new ExecutionConfig();
	// diamond instead of a raw type, so the factory's type arguments are inferred and checked
	return new OneInputStreamOperatorTestHarness<>(
		new ContinuousFileReaderOperatorFactory<>(format, TypeExtractor.getInputFormatTypes(format), config),
		TypeExtractor.getForClass(TimestampedFileInputSplit.class).createSerializer(config));
}
 
Example 19
/**
 * Creates a one-input test harness wrapping a {@link ContinuousFileReaderOperatorFactory}
 * for the given format.
 *
 * <p>The produced type is extracted from the format via
 * {@code TypeExtractor.getInputFormatTypes}, and the harness input serializer is created for
 * {@code TimestampedFileInputSplit}; both use one fresh {@code ExecutionConfig}.
 *
 * @param format the file input format the reader operator factory is built with
 * @param <T> the element type produced by the format
 * @return the newly constructed test harness
 * @throws Exception declared by this method; presumably propagated from harness construction
 */
private <T> OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, T> createHarness(FileInputFormat<T> format) throws Exception {
	ExecutionConfig config = new ExecutionConfig();
	// diamond instead of a raw type, so the factory's type arguments are inferred and checked
	return new OneInputStreamOperatorTestHarness<>(
		new ContinuousFileReaderOperatorFactory<>(format, TypeExtractor.getInputFormatTypes(format), config),
		TypeExtractor.getForClass(TimestampedFileInputSplit.class).createSerializer(config));
}
 
Example 20
/**
 * Feeds four splits into a reader operator, snapshots it, restores a second operator from
 * that snapshot, and verifies that both operators emit the expected splits and that their
 * outputs are equal element by element.
 */
@Test
public void testReaderSnapshotRestore() throws Exception {
	String testBasePath = hdfsURI + "/" + UUID.randomUUID() + "/";

	TimestampedFileInputSplit split1 =
		new TimestampedFileInputSplit(0, 3, new Path("test/test1"), 0, 100, null);

	TimestampedFileInputSplit split2 =
		new TimestampedFileInputSplit(10, 2, new Path("test/test2"), 101, 200, null);

	TimestampedFileInputSplit split3 =
		new TimestampedFileInputSplit(10, 1, new Path("test/test2"), 0, 100, null);

	TimestampedFileInputSplit split4 =
		new TimestampedFileInputSplit(11, 0, new Path("test/test3"), 0, 100, null);

	final OneShotLatch latch = new OneShotLatch();

	// the produced type is extracted inside createHarness, so no separate TypeInformation
	// local is needed here
	BlockingFileInputFormat format = new BlockingFileInputFormat(latch, new Path(testBasePath));

	OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, FileInputSplit> initTestInstance = createHarness(format);
	initTestInstance.setTimeCharacteristic(TimeCharacteristic.EventTime);
	initTestInstance.open();

	// create some state in the reader
	initTestInstance.processElement(new StreamRecord<>(split1));
	initTestInstance.processElement(new StreamRecord<>(split2));
	initTestInstance.processElement(new StreamRecord<>(split3));
	initTestInstance.processElement(new StreamRecord<>(split4));

	// take a snapshot of the operator's state. This will be used
	// to initialize another reader and compare the results of the
	// two operators.

	final OperatorSubtaskState snapshot;
	synchronized (initTestInstance.getCheckpointLock()) {
		snapshot = initTestInstance.snapshot(0L, 0L);
	}

	// the second operator gets its own format instance, sharing the same latch
	OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, FileInputSplit> restoredTestInstance =
		createHarness(new BlockingFileInputFormat(latch, new Path(testBasePath)));
	restoredTestInstance.setTimeCharacteristic(TimeCharacteristic.EventTime);

	restoredTestInstance.initializeState(snapshot);
	restoredTestInstance.open();

	// now let computation start
	latch.trigger();

	// ... and wait for the operators to close gracefully

	synchronized (initTestInstance.getCheckpointLock()) {
		initTestInstance.close();
	}

	synchronized (restoredTestInstance.getCheckpointLock()) {
		restoredTestInstance.close();
	}

	FileInputSplit fsSplit1 = createSplitFromTimestampedSplit(split1);
	FileInputSplit fsSplit2 = createSplitFromTimestampedSplit(split2);
	FileInputSplit fsSplit3 = createSplitFromTimestampedSplit(split3);
	FileInputSplit fsSplit4 = createSplitFromTimestampedSplit(split4);

	// compare if the results contain what they should contain and also if
	// they are the same, as they should.

	Assert.assertTrue(initTestInstance.getOutput().contains(new StreamRecord<>(fsSplit1)));
	Assert.assertTrue(initTestInstance.getOutput().contains(new StreamRecord<>(fsSplit2)));
	Assert.assertTrue(initTestInstance.getOutput().contains(new StreamRecord<>(fsSplit3)));
	Assert.assertTrue(initTestInstance.getOutput().contains(new StreamRecord<>(fsSplit4)));

	Assert.assertArrayEquals(
		initTestInstance.getOutput().toArray(),
		restoredTestInstance.getOutput().toArray()
	);
}