org.apache.flink.api.common.io.FileInputFormat Java Examples

The following examples show how to use org.apache.flink.api.common.io.FileInputFormat. They are extracted from open source projects; the source file and originating project are noted above each example.
Example #1
Source File: ExecutionEnvironment.java    From Flink-CEPplus with Apache License 2.0
public <X> DataSource<X> readFile(FileInputFormat<X> inputFormat, String filePath) {
	if (inputFormat == null) {
		throw new IllegalArgumentException("InputFormat must not be null.");
	}
	if (filePath == null) {
		throw new IllegalArgumentException("The file path must not be null.");
	}

	inputFormat.setFilePath(new Path(filePath));
	try {
		return createInput(inputFormat, TypeExtractor.getInputFormatTypes(inputFormat));
	}
	catch (Exception e) {
		throw new InvalidProgramException("The type returned by the input format could not be automatically determined. " +
				"Please specify the TypeInformation of the produced type explicitly by using the " +
				"'createInput(InputFormat, TypeInformation)' method instead.");
	}
}
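
For context, here is a minimal usage sketch of this batch readFile overload. The wrapper class name and the local file path are placeholders of ours; TextInputFormat is simply one concrete FileInputFormat that ships with Flink.

import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.core.fs.Path;

public class ReadFileBatchSketch {
	public static void main(String[] args) throws Exception {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		// Any FileInputFormat subclass works here; the element type of the DataSource is
		// inferred from the format via TypeExtractor.getInputFormatTypes(...), as shown above.
		TextInputFormat format = new TextInputFormat(new Path("file:///tmp/input.txt"));

		DataSource<String> lines = env.readFile(format, "file:///tmp/input.txt");
		lines.print();  // triggers execution and prints each line of the file
	}
}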
 
Example #2
Source File: ContinuousFileMonitoringFunction.java    From flink with Apache License 2.0
public ContinuousFileMonitoringFunction(
	FileInputFormat<OUT> format,
	FileProcessingMode watchType,
	int readerParallelism,
	long interval,
	long globalModificationTime) {

	Preconditions.checkArgument(
		watchType == FileProcessingMode.PROCESS_ONCE || interval >= MIN_MONITORING_INTERVAL,
		"The specified monitoring interval (" + interval + " ms) is smaller than the minimum " +
			"allowed one (" + MIN_MONITORING_INTERVAL + " ms)."
	);

	Preconditions.checkArgument(
		format.getFilePaths().length == 1,
		"FileInputFormats with multiple paths are not supported yet.");

	this.format = Preconditions.checkNotNull(format, "Unspecified File Input Format.");
	this.path = Preconditions.checkNotNull(format.getFilePaths()[0].toString(), "Unspecified Path.");

	this.interval = interval;
	this.watchType = watchType;
	this.readerParallelism = Math.max(readerParallelism, 1);
	this.globalModificationTime = globalModificationTime;
}
 
Example #3
Source File: ExecutionEnvironment.java    From flink with Apache License 2.0
public <X> DataSource<X> readFile(FileInputFormat<X> inputFormat, String filePath) {
	if (inputFormat == null) {
		throw new IllegalArgumentException("InputFormat must not be null.");
	}
	if (filePath == null) {
		throw new IllegalArgumentException("The file path must not be null.");
	}

	inputFormat.setFilePath(new Path(filePath));
	try {
		return createInput(inputFormat, TypeExtractor.getInputFormatTypes(inputFormat));
	}
	catch (Exception e) {
		throw new InvalidProgramException("The type returned by the input format could not be automatically determined. " +
				"Please specify the TypeInformation of the produced type explicitly by using the " +
				"'createInput(InputFormat, TypeInformation)' method instead.");
	}
}
 
Example #4
Source File: ContinuousFileReaderOperator.java    From Flink-CEPplus with Apache License 2.0
private SplitReader(FileInputFormat<OT> format,
			TypeSerializer<OT> serializer,
			SourceFunction.SourceContext<OT> readerContext,
			Object checkpointLock,
			List<TimestampedFileInputSplit> restoredState) {

	this.format = checkNotNull(format, "Unspecified FileInputFormat.");
	this.serializer = checkNotNull(serializer, "Unspecified Serializer.");
	this.readerContext = checkNotNull(readerContext, "Unspecified Reader Context.");
	this.checkpointLock = checkNotNull(checkpointLock, "Unspecified checkpoint lock.");

	this.shouldClose = false;
	this.isRunning = true;

	this.pendingSplits = new PriorityQueue<>();

	// this is the case where a task recovers from a previous failed attempt
	if (restoredState != null) {
		this.pendingSplits.addAll(restoredState);
	}
}
 
Example #5
Source File: ContinuousFileMonitoringFunction.java    From Flink-CEPplus with Apache License 2.0
public ContinuousFileMonitoringFunction(
	FileInputFormat<OUT> format,
	FileProcessingMode watchType,
	int readerParallelism,
	long interval) {

	Preconditions.checkArgument(
		watchType == FileProcessingMode.PROCESS_ONCE || interval >= MIN_MONITORING_INTERVAL,
		"The specified monitoring interval (" + interval + " ms) is smaller than the minimum " +
			"allowed one (" + MIN_MONITORING_INTERVAL + " ms)."
	);

	Preconditions.checkArgument(
		format.getFilePaths().length == 1,
		"FileInputFormats with multiple paths are not supported yet.");

	this.format = Preconditions.checkNotNull(format, "Unspecified File Input Format.");
	this.path = Preconditions.checkNotNull(format.getFilePaths()[0].toString(), "Unspecified Path.");

	this.interval = interval;
	this.watchType = watchType;
	this.readerParallelism = Math.max(readerParallelism, 1);
	this.globalModificationTime = Long.MIN_VALUE;
}
 
Example #6
Source File: ExecutionEnvironment.java    From flink with Apache License 2.0
public <X> DataSource<X> readFile(FileInputFormat<X> inputFormat, String filePath) {
	if (inputFormat == null) {
		throw new IllegalArgumentException("InputFormat must not be null.");
	}
	if (filePath == null) {
		throw new IllegalArgumentException("The file path must not be null.");
	}

	inputFormat.setFilePath(new Path(filePath));
	try {
		return createInput(inputFormat, TypeExtractor.getInputFormatTypes(inputFormat));
	}
	catch (Exception e) {
		throw new InvalidProgramException("The type returned by the input format could not be automatically determined. " +
				"Please specify the TypeInformation of the produced type explicitly by using the " +
				"'createInput(InputFormat, TypeInformation)' method instead.");
	}
}
 
Example #7
Source File: ContinuousFileMonitoringFunction.java    From flink with Apache License 2.0
public ContinuousFileMonitoringFunction(
	FileInputFormat<OUT> format,
	FileProcessingMode watchType,
	int readerParallelism,
	long interval) {

	Preconditions.checkArgument(
		watchType == FileProcessingMode.PROCESS_ONCE || interval >= MIN_MONITORING_INTERVAL,
		"The specified monitoring interval (" + interval + " ms) is smaller than the minimum " +
			"allowed one (" + MIN_MONITORING_INTERVAL + " ms)."
	);

	Preconditions.checkArgument(
		format.getFilePaths().length == 1,
		"FileInputFormats with multiple paths are not supported yet.");

	this.format = Preconditions.checkNotNull(format, "Unspecified File Input Format.");
	this.path = Preconditions.checkNotNull(format.getFilePaths()[0].toString(), "Unspecified Path.");

	this.interval = interval;
	this.watchType = watchType;
	this.readerParallelism = Math.max(readerParallelism, 1);
	this.globalModificationTime = Long.MIN_VALUE;
}
 
Example #8
Source File: ContinuousFileReaderOperator.java    From flink with Apache License 2.0
private SplitReader(FileInputFormat<OT> format,
			TypeSerializer<OT> serializer,
			SourceFunction.SourceContext<OT> readerContext,
			Object checkpointLock,
			List<TimestampedFileInputSplit> restoredState) {

	this.format = checkNotNull(format, "Unspecified FileInputFormat.");
	this.serializer = checkNotNull(serializer, "Unspecified Serializer.");
	this.readerContext = checkNotNull(readerContext, "Unspecified Reader Context.");
	this.checkpointLock = checkNotNull(checkpointLock, "Unspecified checkpoint lock.");

	this.shouldClose = false;
	this.isRunning = true;

	this.pendingSplits = new PriorityQueue<>();

	// this is the case where a task recovers from a previous failed attempt
	if (restoredState != null) {
		this.pendingSplits.addAll(restoredState);
	}
}
 
Example #9
Source File: ContinuousFileProcessingTest.java    From Flink-CEPplus with Apache License 2.0
/**
 * Create continuous monitoring function with 1 reader-parallelism and interval: {@link #INTERVAL}.
 */
private <OUT> ContinuousFileMonitoringFunction<OUT> createTestContinuousFileMonitoringFunction(FileInputFormat<OUT> format, FileProcessingMode fileProcessingMode) {
	ContinuousFileMonitoringFunction<OUT> monitoringFunction =
		new ContinuousFileMonitoringFunction<>(format, fileProcessingMode, 1, INTERVAL);
	monitoringFunction.setRuntimeContext(Mockito.mock(RuntimeContext.class));
	return monitoringFunction;
}
 
Example #10
Source File: ContinuousFileReaderOperatorTest.java    From flink with Apache License 2.0
private FileInputFormat<String> failingFormat() {
	return new FileInputFormat<String>() {
		@Override
		public boolean reachedEnd() {
			return false;
		}

		@Override
		public String nextRecord(String reuse) {
			throw new ExpectedTestException();
		}

		@Override
		public void open(FileInputSplit fileSplit) {
			throw new ExpectedTestException();
		}

		@Override
		public void close() {
			throw new ExpectedTestException();
		}

		@Override
		public void configure(Configuration parameters) {
		}
	};
}
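
For contrast with the failing format above, a hypothetical minimal FileInputFormat subclass (not taken from the Flink test sources; the class name is ours) might emit one synthetic record per split without ever touching the file system:

import org.apache.flink.api.common.io.FileInputFormat;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.FileInputSplit;

public class OneRecordPerSplitFormat extends FileInputFormat<String> {
	private transient FileInputSplit split;
	private transient boolean emitted;

	@Override
	public void open(FileInputSplit fileSplit) {
		// Deliberately does not call super.open(): no stream is needed for a synthetic record.
		this.split = fileSplit;
		this.emitted = false;
	}

	@Override
	public boolean reachedEnd() {
		return emitted;
	}

	@Override
	public String nextRecord(String reuse) {
		emitted = true;
		return "record-from-" + split.getPath().getName();
	}

	@Override
	public void close() {
		// Nothing to release, since open() never acquired a stream.
	}

	@Override
	public void configure(Configuration parameters) {
		// The file path is set programmatically, so no configuration is read here.
	}
}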
 
Example #11
Source File: ContinuousFileMonitoringFunction.java    From flink with Apache License 2.0
public ContinuousFileMonitoringFunction(
		FileInputFormat<OUT> format,
		FileProcessingMode watchType,
		int readerParallelism,
		long interval) {
	this(format, watchType, readerParallelism, interval, Long.MIN_VALUE);
}
 
Example #12
Source File: StreamExecutionEnvironment.java    From flink with Apache License 2.0
private <OUT> DataStreamSource<OUT> createFileInput(FileInputFormat<OUT> inputFormat,
													TypeInformation<OUT> typeInfo,
													String sourceName,
													FileProcessingMode monitoringMode,
													long interval) {

	Preconditions.checkNotNull(inputFormat, "Unspecified file input format.");
	Preconditions.checkNotNull(typeInfo, "Unspecified output type information.");
	Preconditions.checkNotNull(sourceName, "Unspecified name for the source.");
	Preconditions.checkNotNull(monitoringMode, "Unspecified monitoring mode.");

	Preconditions.checkArgument(monitoringMode.equals(FileProcessingMode.PROCESS_ONCE) ||
			interval >= ContinuousFileMonitoringFunction.MIN_MONITORING_INTERVAL,
		"The path monitoring interval cannot be less than " +
				ContinuousFileMonitoringFunction.MIN_MONITORING_INTERVAL + " ms.");

	ContinuousFileMonitoringFunction<OUT> monitoringFunction =
		new ContinuousFileMonitoringFunction<>(inputFormat, monitoringMode, getParallelism(), interval);

	ContinuousFileReaderOperatorFactory<OUT, TimestampedFileInputSplit> factory =
			new ContinuousFileReaderOperatorFactory<>(inputFormat);

	SingleOutputStreamOperator<OUT> source = addSource(monitoringFunction, sourceName)
			.transform("Split Reader: " + sourceName, typeInfo, factory);

	return new DataStreamSource<>(source);
}
 
Example #13
Source File: Utils.java    From flink with Apache License 2.0
public static <OUT> OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, OUT> createContinuousFileProcessingTestHarness(
	FileInputFormat<OUT> inputFormat,
	TypeInformation<OUT> outTypeInfo,
	ExecutionConfig executionConfig) throws Exception {

	OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, OUT> testHarness =
		new OneInputStreamOperatorTestHarness<>(new ContinuousFileReaderOperatorFactory<>(inputFormat));
	testHarness.getOperatorFactory().setOutputType(
		outTypeInfo,
		executionConfig == null ? testHarness.getExecutionConfig() : executionConfig);

	return testHarness;
}
 
Example #14
Source File: ContinuousFileProcessingTest.java    From flink with Apache License 2.0
/**
 * Create continuous monitoring function with 1 reader-parallelism and interval: {@link #INTERVAL}.
 */
private <OUT> ContinuousFileMonitoringFunction<OUT> createTestContinuousFileMonitoringFunction(FileInputFormat<OUT> format, FileProcessingMode fileProcessingMode) {
	ContinuousFileMonitoringFunction<OUT> monitoringFunction =
		new ContinuousFileMonitoringFunction<>(format, fileProcessingMode, 1, INTERVAL);
	monitoringFunction.setRuntimeContext(Mockito.mock(RuntimeContext.class));
	return monitoringFunction;
}
 
Example #15
Source File: StreamExecutionEnvironment.java    From flink with Apache License 2.0
private <OUT> DataStreamSource<OUT> createFileInput(FileInputFormat<OUT> inputFormat,
													TypeInformation<OUT> typeInfo,
													String sourceName,
													FileProcessingMode monitoringMode,
													long interval) {

	Preconditions.checkNotNull(inputFormat, "Unspecified file input format.");
	Preconditions.checkNotNull(typeInfo, "Unspecified output type information.");
	Preconditions.checkNotNull(sourceName, "Unspecified name for the source.");
	Preconditions.checkNotNull(monitoringMode, "Unspecified monitoring mode.");

	Preconditions.checkArgument(monitoringMode.equals(FileProcessingMode.PROCESS_ONCE) ||
			interval >= ContinuousFileMonitoringFunction.MIN_MONITORING_INTERVAL,
		"The path monitoring interval cannot be less than " +
				ContinuousFileMonitoringFunction.MIN_MONITORING_INTERVAL + " ms.");

	ContinuousFileMonitoringFunction<OUT> monitoringFunction =
		new ContinuousFileMonitoringFunction<>(inputFormat, monitoringMode, getParallelism(), interval);

	ContinuousFileReaderOperator<OUT> reader =
		new ContinuousFileReaderOperator<>(inputFormat);

	SingleOutputStreamOperator<OUT> source = addSource(monitoringFunction, sourceName)
			.transform("Split Reader: " + sourceName, typeInfo, reader);

	return new DataStreamSource<>(source);
}
 
Example #16
Source File: ContinuousFileProcessingTest.java    From flink with Apache License 2.0
/**
 * Create continuous monitoring function with 1 reader-parallelism and interval: {@link #INTERVAL}.
 */
private <OUT> ContinuousFileMonitoringFunction<OUT> createTestContinuousFileMonitoringFunction(FileInputFormat<OUT> format, FileProcessingMode fileProcessingMode) {
	ContinuousFileMonitoringFunction<OUT> monitoringFunction =
		new ContinuousFileMonitoringFunction<>(format, fileProcessingMode, 1, INTERVAL);
	monitoringFunction.setRuntimeContext(Mockito.mock(RuntimeContext.class));
	return monitoringFunction;
}
 
Example #17
Source File: StreamExecutionEnvironment.java    From Flink-CEPplus with Apache License 2.0
private <OUT> DataStreamSource<OUT> createFileInput(FileInputFormat<OUT> inputFormat,
													TypeInformation<OUT> typeInfo,
													String sourceName,
													FileProcessingMode monitoringMode,
													long interval) {

	Preconditions.checkNotNull(inputFormat, "Unspecified file input format.");
	Preconditions.checkNotNull(typeInfo, "Unspecified output type information.");
	Preconditions.checkNotNull(sourceName, "Unspecified name for the source.");
	Preconditions.checkNotNull(monitoringMode, "Unspecified monitoring mode.");

	Preconditions.checkArgument(monitoringMode.equals(FileProcessingMode.PROCESS_ONCE) ||
			interval >= ContinuousFileMonitoringFunction.MIN_MONITORING_INTERVAL,
		"The path monitoring interval cannot be less than " +
				ContinuousFileMonitoringFunction.MIN_MONITORING_INTERVAL + " ms.");

	ContinuousFileMonitoringFunction<OUT> monitoringFunction =
		new ContinuousFileMonitoringFunction<>(inputFormat, monitoringMode, getParallelism(), interval);

	ContinuousFileReaderOperator<OUT> reader =
		new ContinuousFileReaderOperator<>(inputFormat);

	SingleOutputStreamOperator<OUT> source = addSource(monitoringFunction, sourceName)
			.transform("Split Reader: " + sourceName, typeInfo, reader);

	return new DataStreamSource<>(source);
}
 
Example #18
Source File: ContinuousFileProcessingTest.java    From flink with Apache License 2.0
private <T> OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, T> createHarness(FileInputFormat<T> format) throws Exception {
	ExecutionConfig config = new ExecutionConfig();
	return new OneInputStreamOperatorTestHarness<>(
		new ContinuousFileReaderOperatorFactory<>(format, TypeExtractor.getInputFormatTypes(format), config),
		TypeExtractor.getForClass(TimestampedFileInputSplit.class).createSerializer(config));
}
 
Example #19
Source File: ContinuousFileReaderOperatorTest.java    From flink with Apache License 2.0
private <T> OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, T> createHarness(FileInputFormat<T> format) throws Exception {
	ExecutionConfig config = new ExecutionConfig();
	return new OneInputStreamOperatorTestHarness<>(
			new ContinuousFileReaderOperatorFactory<>(format, TypeExtractor.getInputFormatTypes(format), config),
			TypeExtractor.getForClass(TimestampedFileInputSplit.class).createSerializer(config));
}
 
Example #20
Source File: StreamExecutionEnvironment.java    From Flink-CEPplus with Apache License 2.0
/**
 * Reads the contents of the user-specified {@code filePath} based on the given {@link FileInputFormat},
 * depending on the provided {@link FileProcessingMode}.
 *
 * <p>See {@link #readFile(FileInputFormat, String, FileProcessingMode, long)}
 *
 * @param inputFormat
 * 		The input format used to create the data stream
 * @param filePath
 * 		The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path")
 * @param watchType
 * 		The mode in which the source should operate, i.e. monitor path and react to new data, or process once and exit
 * @param interval
 * 		In the case of periodic path monitoring, this specifies the interval (in millis) between consecutive path scans
 * @param filter
 * 		The files to be excluded from the processing
 * @param <OUT>
 * 		The type of the returned data stream
 * @return The data stream that represents the data read from the given file
 *
 * @deprecated Use {@link FileInputFormat#setFilesFilter(FilePathFilter)} to set a filter and
 * 		{@link StreamExecutionEnvironment#readFile(FileInputFormat, String, FileProcessingMode, long)}
 *
 */
@PublicEvolving
@Deprecated
public <OUT> DataStreamSource<OUT> readFile(FileInputFormat<OUT> inputFormat,
											String filePath,
											FileProcessingMode watchType,
											long interval,
											FilePathFilter filter) {
	inputFormat.setFilesFilter(filter);

	TypeInformation<OUT> typeInformation;
	try {
		typeInformation = TypeExtractor.getInputFormatTypes(inputFormat);
	} catch (Exception e) {
		throw new InvalidProgramException("The type returned by the input format could not be " +
				"automatically determined. Please specify the TypeInformation of the produced type " +
				"explicitly by using the 'createInput(InputFormat, TypeInformation)' method instead.");
	}
	return readFile(inputFormat, filePath, watchType, interval, typeInformation);
}
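
Since this overload is deprecated, a sketch of the replacement pattern its Javadoc points to — setting the filter on the format and calling the four-argument readFile — could look as follows. The class name, paths, and the 10-second interval are illustrative assumptions.

import org.apache.flink.api.common.io.FilePathFilter;
import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.FileProcessingMode;

public class ReadFileWithFilterSketch {
	public static void main(String[] args) throws Exception {
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

		TextInputFormat format = new TextInputFormat(new Path("file:///tmp/input-dir"));
		// The default filter skips hidden and in-progress files (e.g. names starting with '.' or '_').
		format.setFilesFilter(FilePathFilter.createDefaultFilter());

		DataStream<String> lines = env.readFile(
				format,
				"file:///tmp/input-dir",
				FileProcessingMode.PROCESS_CONTINUOUSLY,
				10_000L);  // re-scan the directory every 10 seconds

		lines.print();
		env.execute("readFile with FilePathFilter");
	}
}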
 
Example #21
Source File: StreamExecutionEnvironment.java    From Flink-CEPplus with Apache License 2.0
/**
 * Reads the contents of the user-specified {@code filePath} based on the given {@link FileInputFormat}. Depending
 * on the provided {@link FileProcessingMode}, the source may periodically monitor (every {@code interval} ms) the path
 * for new data ({@link FileProcessingMode#PROCESS_CONTINUOUSLY}), or process once the data currently in the path and
 * exit ({@link FileProcessingMode#PROCESS_ONCE}). In addition, if the path contains files not to be processed, the user
 * can specify a custom {@link FilePathFilter}. As a default implementation you can use
 * {@link FilePathFilter#createDefaultFilter()}.
 *
 * <p>Since all data streams need specific information about their types, this method needs to determine the
 * type of the data produced by the input format. It will attempt to determine the data type by reflection,
 * unless the input format implements the {@link org.apache.flink.api.java.typeutils.ResultTypeQueryable} interface.
 * In the latter case, this method will invoke the
 * {@link org.apache.flink.api.java.typeutils.ResultTypeQueryable#getProducedType()} method to determine the data
 * type produced by the input format.
 *
 * <p><b>NOTES ON CHECKPOINTING: </b> If the {@code watchType} is set to {@link FileProcessingMode#PROCESS_ONCE},
 * the source monitors the path <b>once</b>, creates the {@link org.apache.flink.core.fs.FileInputSplit FileInputSplits}
 * to be processed, forwards them to the downstream {@link ContinuousFileReaderOperator readers} to read the actual data,
 * and exits, without waiting for the readers to finish reading. This implies that no more checkpoint barriers
 * are going to be forwarded after the source exits, thus having no checkpoints after that point.
 *
 * @param inputFormat
 * 		The input format used to create the data stream
 * @param filePath
 * 		The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path")
 * @param watchType
 * 		The mode in which the source should operate, i.e. monitor path and react to new data, or process once and exit
 * @param interval
 * 		In the case of periodic path monitoring, this specifies the interval (in millis) between consecutive path scans
 * @param <OUT>
 * 		The type of the returned data stream
 * @return The data stream that represents the data read from the given file
 */
@PublicEvolving
public <OUT> DataStreamSource<OUT> readFile(FileInputFormat<OUT> inputFormat,
											String filePath,
											FileProcessingMode watchType,
											long interval) {

	TypeInformation<OUT> typeInformation;
	try {
		typeInformation = TypeExtractor.getInputFormatTypes(inputFormat);
	} catch (Exception e) {
		throw new InvalidProgramException("The type returned by the input format could not be " +
				"automatically determined. Please specify the TypeInformation of the produced type " +
				"explicitly by using the 'createInput(InputFormat, TypeInformation)' method instead.");
	}
	return readFile(inputFormat, filePath, watchType, interval, typeInformation);
}
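
A short usage sketch of this overload in PROCESS_ONCE mode, with a placeholder path and a wrapper class of ours; as the checkpointing note above explains, the monitoring source exits after forwarding the splits, so no checkpoints follow that point.

import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.FileProcessingMode;

public class ReadFileOnceSketch {
	public static void main(String[] args) throws Exception {
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

		TextInputFormat format = new TextInputFormat(new Path("file:///tmp/input-dir"));

		// PROCESS_ONCE: read whatever is currently in the path, then the monitoring source exits.
		// The interval is effectively ignored in this mode (the precondition short-circuits on PROCESS_ONCE).
		DataStream<String> lines = env.readFile(
				format, "file:///tmp/input-dir", FileProcessingMode.PROCESS_ONCE, -1L);

		lines.print();
		env.execute("readFile PROCESS_ONCE");
	}
}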
 
Example #22
Source File: StreamExecutionEnvironment.java    From flink with Apache License 2.0
/**
 * Generic method to create an input data stream with {@link org.apache.flink.api.common.io.InputFormat}.
 *
 * <p>The data stream is typed to the given TypeInformation. This method is intended for input formats
 * where the return type cannot be determined by reflection analysis, and that do not implement the
 * {@link org.apache.flink.api.java.typeutils.ResultTypeQueryable} interface.
 *
 * <p><b>NOTES ON CHECKPOINTING: </b> In the case of a {@link FileInputFormat}, the source
 * (which executes the {@link ContinuousFileMonitoringFunction}) monitors the path, creates the
 * {@link org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards
 * them to the downstream readers to read the actual data, and exits,
 * without waiting for the readers to finish reading. This implies that no more checkpoint
 * barriers are going to be forwarded after the source exits, thus having no checkpoints.
 *
 * @param inputFormat
 * 		The input format used to create the data stream
 * @param typeInfo
 * 		The information about the type of the output type
 * @param <OUT>
 * 		The type of the returned data stream
 * @return The data stream that represents the data created by the input format
 */
@PublicEvolving
public <OUT> DataStreamSource<OUT> createInput(InputFormat<OUT, ?> inputFormat, TypeInformation<OUT> typeInfo) {
	DataStreamSource<OUT> source;

	if (inputFormat instanceof FileInputFormat) {
		@SuppressWarnings("unchecked")
		FileInputFormat<OUT> format = (FileInputFormat<OUT>) inputFormat;

		source = createFileInput(format, typeInfo, "Custom File source",
				FileProcessingMode.PROCESS_ONCE, -1);
	} else {
		source = createInput(inputFormat, typeInfo, "Custom Source");
	}
	return source;
}
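
Below is a hedged usage sketch of createInput with an explicitly supplied TypeInformation, which the Javadoc recommends when the produced type cannot be inferred by reflection. The class name and path are placeholders; TextInputFormat stands in for any format here (its type could also be inferred, but the explicit form is shown).

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class CreateInputSketch {
	public static void main(String[] args) throws Exception {
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

		TextInputFormat format = new TextInputFormat(new Path("file:///tmp/input.txt"));

		// Because 'format' is a FileInputFormat, this goes through createFileInput(...)
		// with PROCESS_ONCE, as shown in the method body above.
		DataStream<String> lines = env.createInput(format, Types.STRING);

		lines.print();
		env.execute("createInput with explicit TypeInformation");
	}
}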
 
Example #23
Source File: StreamExecutionEnvironment.java    From flink with Apache License 2.0
/**
 * Reads the contents of the user-specified {@code filePath} based on the given {@link FileInputFormat}. Depending
 * on the provided {@link FileProcessingMode}, the source may periodically monitor (every {@code interval} ms) the path
 * for new data ({@link FileProcessingMode#PROCESS_CONTINUOUSLY}), or process once the data currently in the path and
 * exit ({@link FileProcessingMode#PROCESS_ONCE}). In addition, if the path contains files not to be processed, the user
 * can specify a custom {@link FilePathFilter}. As a default implementation you can use
 * {@link FilePathFilter#createDefaultFilter()}.
 *
 * <p>Since all data streams need specific information about their types, this method needs to determine the
 * type of the data produced by the input format. It will attempt to determine the data type by reflection,
 * unless the input format implements the {@link org.apache.flink.api.java.typeutils.ResultTypeQueryable} interface.
 * In the latter case, this method will invoke the
 * {@link org.apache.flink.api.java.typeutils.ResultTypeQueryable#getProducedType()} method to determine the data
 * type produced by the input format.
 *
 * <p><b>NOTES ON CHECKPOINTING: </b> If the {@code watchType} is set to {@link FileProcessingMode#PROCESS_ONCE},
 * the source monitors the path <b>once</b>, creates the {@link org.apache.flink.core.fs.FileInputSplit FileInputSplits}
 * to be processed, forwards them to the downstream readers to read the actual data,
 * and exits, without waiting for the readers to finish reading. This implies that no more checkpoint barriers
 * are going to be forwarded after the source exits, thus having no checkpoints after that point.
 *
 * @param inputFormat
 * 		The input format used to create the data stream
 * @param filePath
 * 		The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path")
 * @param watchType
 * 		The mode in which the source should operate, i.e. monitor path and react to new data, or process once and exit
 * @param interval
 * 		In the case of periodic path monitoring, this specifies the interval (in millis) between consecutive path scans
 * @param <OUT>
 * 		The type of the returned data stream
 * @return The data stream that represents the data read from the given file
 */
@PublicEvolving
public <OUT> DataStreamSource<OUT> readFile(FileInputFormat<OUT> inputFormat,
											String filePath,
											FileProcessingMode watchType,
											long interval) {

	TypeInformation<OUT> typeInformation;
	try {
		typeInformation = TypeExtractor.getInputFormatTypes(inputFormat);
	} catch (Exception e) {
		throw new InvalidProgramException("The type returned by the input format could not be " +
				"automatically determined. Please specify the TypeInformation of the produced type " +
				"explicitly by using the 'createInput(InputFormat, TypeInformation)' method instead.");
	}
	return readFile(inputFormat, filePath, watchType, interval, typeInformation);
}
 
Example #24
Source File: StreamExecutionEnvironment.java    From flink with Apache License 2.0
/**
 * Reads the contents of the user-specified {@code filePath} based on the given {@link FileInputFormat},
 * depending on the provided {@link FileProcessingMode}.
 *
 * <p>See {@link #readFile(FileInputFormat, String, FileProcessingMode, long)}
 *
 * @param inputFormat
 * 		The input format used to create the data stream
 * @param filePath
 * 		The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path")
 * @param watchType
 * 		The mode in which the source should operate, i.e. monitor path and react to new data, or process once and exit
 * @param interval
 * 		In the case of periodic path monitoring, this specifies the interval (in millis) between consecutive path scans
 * @param filter
 * 		The files to be excluded from the processing
 * @param <OUT>
 * 		The type of the returned data stream
 * @return The data stream that represents the data read from the given file
 *
 * @deprecated Use {@link FileInputFormat#setFilesFilter(FilePathFilter)} to set a filter and
 * 		{@link StreamExecutionEnvironment#readFile(FileInputFormat, String, FileProcessingMode, long)}
 *
 */
@PublicEvolving
@Deprecated
public <OUT> DataStreamSource<OUT> readFile(FileInputFormat<OUT> inputFormat,
											String filePath,
											FileProcessingMode watchType,
											long interval,
											FilePathFilter filter) {
	inputFormat.setFilesFilter(filter);

	TypeInformation<OUT> typeInformation;
	try {
		typeInformation = TypeExtractor.getInputFormatTypes(inputFormat);
	} catch (Exception e) {
		throw new InvalidProgramException("The type returned by the input format could not be " +
				"automatically determined. Please specify the TypeInformation of the produced type " +
				"explicitly by using the 'createInput(InputFormat, TypeInformation)' method instead.");
	}
	return readFile(inputFormat, filePath, watchType, interval, typeInformation);
}
 
Example #25
Source File: StreamExecutionEnvironment.java    From Flink-CEPplus with Apache License 2.0
/**
 * Generic method to create an input data stream with {@link org.apache.flink.api.common.io.InputFormat}.
 *
 * <p>The data stream is typed to the given TypeInformation. This method is intended for input formats
 * where the return type cannot be determined by reflection analysis, and that do not implement the
 * {@link org.apache.flink.api.java.typeutils.ResultTypeQueryable} interface.
 *
 * <p><b>NOTES ON CHECKPOINTING: </b> In the case of a {@link FileInputFormat}, the source
 * (which executes the {@link ContinuousFileMonitoringFunction}) monitors the path, creates the
 * {@link org.apache.flink.core.fs.FileInputSplit FileInputSplits} to be processed, forwards
 * them to the downstream {@link ContinuousFileReaderOperator} to read the actual data, and exits,
 * without waiting for the readers to finish reading. This implies that no more checkpoint
 * barriers are going to be forwarded after the source exits, thus having no checkpoints.
 *
 * @param inputFormat
 * 		The input format used to create the data stream
 * @param typeInfo
 * 		The information about the type of the output type
 * @param <OUT>
 * 		The type of the returned data stream
 * @return The data stream that represents the data created by the input format
 */
@PublicEvolving
public <OUT> DataStreamSource<OUT> createInput(InputFormat<OUT, ?> inputFormat, TypeInformation<OUT> typeInfo) {
	DataStreamSource<OUT> source;

	if (inputFormat instanceof FileInputFormat) {
		@SuppressWarnings("unchecked")
		FileInputFormat<OUT> format = (FileInputFormat<OUT>) inputFormat;

		source = createFileInput(format, typeInfo, "Custom File source",
				FileProcessingMode.PROCESS_ONCE, -1);
	} else {
		source = createInput(inputFormat, typeInfo, "Custom Source");
	}
	return source;
}
 
Example #26
Source File: Utils.java    From flink with Apache License 2.0
public static <OUT> OneInputStreamOperatorTestHarness<TimestampedFileInputSplit, OUT> createContinuousFileProcessingTestHarness(
	FileInputFormat<OUT> inputFormat) throws Exception {

	return createContinuousFileProcessingTestHarness(inputFormat, TypeExtractor.getInputFormatTypes(inputFormat), null);
}
 
Example #27
Source File: StreamExecutionEnvironment.java    From flink with Apache License 2.0
/**
 * Reads the contents of the user-specified {@code filePath} based on the given {@link FileInputFormat}. Depending
 * on the provided {@link FileProcessingMode}, the source may periodically monitor (every {@code interval} ms) the path
 * for new data ({@link FileProcessingMode#PROCESS_CONTINUOUSLY}), or process once the data currently in the path and
 * exit ({@link FileProcessingMode#PROCESS_ONCE}). In addition, if the path contains files not to be processed, the user
 * can specify a custom {@link FilePathFilter}. As a default implementation you can use
 * {@link FilePathFilter#createDefaultFilter()}.
 *
 * <p>Since all data streams need specific information about their types, this method needs to determine the
 * type of the data produced by the input format. It will attempt to determine the data type by reflection,
 * unless the input format implements the {@link org.apache.flink.api.java.typeutils.ResultTypeQueryable} interface.
 * In the latter case, this method will invoke the
 * {@link org.apache.flink.api.java.typeutils.ResultTypeQueryable#getProducedType()} method to determine the data
 * type produced by the input format.
 *
 * <p><b>NOTES ON CHECKPOINTING: </b> If the {@code watchType} is set to {@link FileProcessingMode#PROCESS_ONCE},
 * the source monitors the path <b>once</b>, creates the {@link org.apache.flink.core.fs.FileInputSplit FileInputSplits}
 * to be processed, forwards them to the downstream {@link ContinuousFileReaderOperator readers} to read the actual data,
 * and exits, without waiting for the readers to finish reading. This implies that no more checkpoint barriers
 * are going to be forwarded after the source exits, thus having no checkpoints after that point.
 *
 * @param inputFormat
 * 		The input format used to create the data stream
 * @param filePath
 * 		The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path")
 * @param watchType
 * 		The mode in which the source should operate, i.e. monitor path and react to new data, or process once and exit
 * @param interval
 * 		In the case of periodic path monitoring, this specifies the interval (in millis) between consecutive path scans
 * @param <OUT>
 * 		The type of the returned data stream
 * @return The data stream that represents the data read from the given file
 */
@PublicEvolving
public <OUT> DataStreamSource<OUT> readFile(FileInputFormat<OUT> inputFormat,
											String filePath,
											FileProcessingMode watchType,
											long interval) {

	TypeInformation<OUT> typeInformation;
	try {
		typeInformation = TypeExtractor.getInputFormatTypes(inputFormat);
	} catch (Exception e) {
		throw new InvalidProgramException("The type returned by the input format could not be " +
				"automatically determined. Please specify the TypeInformation of the produced type " +
				"explicitly by using the 'createInput(InputFormat, TypeInformation)' method instead.");
	}
	return readFile(inputFormat, filePath, watchType, interval, typeInformation);
}
 
Example #28
Source File: ContinuousFileReaderOperator.java    From Flink-CEPplus with Apache License 2.0
public ContinuousFileReaderOperator(FileInputFormat<OUT> format) {
	this.format = checkNotNull(format);
}
 
Example #29
Source File: FlinkSequenceInputFormat.java    From incubator-retired-mrql with Apache License 2.0
/** the Flink input format for this input */
public FileInputFormat<FData> inputFormat ( String path  ) {
    FDataInputFormat sf = new FDataInputFormat();
    sf.setFilePath(path.toString());
    return sf;
}
 
Example #30
Source File: FlinkMRQLFileInputFormat.java    From incubator-retired-mrql with Apache License 2.0
/** the Flink input format for this input */
abstract public FileInputFormat<FData> inputFormat ( String path );