Java Code Examples for org.apache.flink.api.java.DataSet#filter()

The following examples show how to use org.apache.flink.api.java.DataSet#filter(). Each example is drawn from an open-source project; the source file and project are noted above the code. Where an example relies on a small FilterFunction implementation that is defined elsewhere in its source file (Filter1, Filter2, and so on), a short sketch of a plausible implementation follows the example.
Example 1
Source File: FilterITCase.java    From flink with Apache License 2.0
@Test
public void testFilterOnCustomType() throws Exception {
	/*
	 * Test filter on custom type
	 */

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<CustomType> ds = CollectionDataSets.getCustomTypeDataSet(env);
	DataSet<CustomType> filterDs = ds.filter(new Filter6());
	List<CustomType> result = filterDs.collect();

	String expected = "3,3,Hello world, how are you?\n"
			+
			"3,4,I am fine.\n" +
			"3,5,Luke Skywalker\n";

	compareResultAsText(result, expected);
}
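Filter6 is defined elsewhere in FilterITCase and is not reproduced on this page. Judging from the expected output, it keeps only records whose integer field equals 3; the sketch below is a plausible reconstruction under that assumption (field names follow Flink's CollectionDataSets.CustomType):

// Hypothetical reconstruction of Filter6: keeps CustomType records whose
// myInt field equals 3, matching the "3,..." lines expected above.
private static class Filter6 implements FilterFunction<CustomType> {
	@Override
	public boolean filter(CustomType value) throws Exception {
		return value.myInt == 3;
	}
}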
 
Example 2
Source File: FilterITCase.java    From flink with Apache License 2.0
@Test
public void testAllRejectingFilter() throws Exception {
	/*
	 * Test all-rejecting filter.
	 */

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);
	DataSet<Tuple3<Integer, Long, String>> filterDs = ds.filter(new Filter1());

	List<Tuple3<Integer, Long, String>> result = filterDs.collect();

	String expected = "\n";

	compareResultAsTuples(result, expected);
}
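Filter1 rejects every element, which is why the expected result is just an empty line. A minimal sketch of such an all-rejecting filter, with the body assumed:

// Hypothetical Filter1: an all-rejecting filter, so collect() returns no tuples.
private static class Filter1 implements FilterFunction<Tuple3<Integer, Long, String>> {
	@Override
	public boolean filter(Tuple3<Integer, Long, String> value) throws Exception {
		return false;
	}
}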
 
Example 3
Source File: FilterITCase.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testFilterBasicType() throws Exception {
	/*
	 * Test filter on basic type
	 */

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<String> ds = CollectionDataSets.getStringDataSet(env);
	DataSet<String> filterDs = ds.filter(new Filter5());
	List<String> result = filterDs.collect();

	String expected = "Hi\n" +
			"Hello\n" +
			"Hello world\n" +
			"Hello world, how are you?\n";

	compareResultAsText(result, expected);
}
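Filter5's implementation is not shown here. The expected output is exactly the strings from getStringDataSet that begin with "H", so a plausible (assumed) implementation is:

// Hypothetical Filter5: keeps only strings starting with "H", which
// reproduces the four expected lines above.
private static class Filter5 implements FilterFunction<String> {
	@Override
	public boolean filter(String value) throws Exception {
		return value.startsWith("H");
	}
}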
 
Example 4
Source File: FilterITCase.java    From flink with Apache License 2.0
@Test
public void testFilterOnIntegerTupleField() throws Exception {
	/*
	 * Test filter on Integer tuple field.
	 */

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);
	DataSet<Tuple3<Integer, Long, String>> filterDs = ds.filter(new Filter4());
	List<Tuple3<Integer, Long, String>> result = filterDs.collect();

	String expected = "2,2,Hello\n" +
			"4,3,Hello world, how are you?\n" +
			"6,3,Luke Skywalker\n" +
			"8,4,Comment#2\n" +
			"10,4,Comment#4\n" +
			"12,5,Comment#6\n" +
			"14,5,Comment#8\n" +
			"16,6,Comment#10\n" +
			"18,6,Comment#12\n" +
			"20,6,Comment#14\n";

	compareResultAsTuples(result, expected);
}
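The expected tuples are exactly those whose Integer field f0 is even, so Filter4 plausibly keeps tuples with an even first field; the body below is an assumption based on that output:

// Hypothetical Filter4: keeps tuples whose Integer field f0 is even,
// yielding 2,2,Hello through 20,6,Comment#14 as in the expected string.
private static class Filter4 implements FilterFunction<Tuple3<Integer, Long, String>> {
	@Override
	public boolean filter(Tuple3<Integer, Long, String> value) throws Exception {
		return value.f0 % 2 == 0;
	}
}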
 
Example 5
Source File: FilterWithIndirection.java    From Flink-CEPplus with Apache License 2.0
public static void main(String[] args) throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<String> input = env.fromElements("Please filter", "the words", "but not this");

	DataSet<String> output = input.filter(UtilFunctionWrapper.UtilFunction.getWordFilter());
	output.print();

	env.execute();
}
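UtilFunctionWrapper is not shown on this page; the point of the example is that the FilterFunction is obtained through a level of indirection (a static factory on a nested class) rather than constructed inline. A hypothetical sketch, with both the class layout and the predicate assumed:

// Hypothetical sketch of the indirection: a nested utility class whose
// static factory hands out a FilterFunction. The predicate is an assumption.
public class UtilFunctionWrapper {
	public static class UtilFunction {
		public static FilterFunction<String> getWordFilter() {
			return value -> !value.contains("not");
		}
	}
}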
 
Example 6
Source File: FilterWithMethodReference.java    From flink with Apache License 2.0
public static void main(String[] args) throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<String> input = env.fromElements("Please filter", "the words", "but not this");

	FilterFunction<String> filter = WordFilter::filter;

	DataSet<String> output = input.filter(filter);
	output.print();

	env.execute();
}
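For the method reference to compile, WordFilter.filter must match FilterFunction<String>'s single abstract method: a static method that takes a String and returns boolean. Its body is not shown on this page, so the predicate below is an assumption:

// Hypothetical WordFilter: any static boolean filter(String) is compatible
// with FilterFunction<String> via the method reference WordFilter::filter.
public class WordFilter {
	public static boolean filter(String value) {
		return !value.contains("not");
	}
}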
 
Example 7
Source File: FilterITCase.java    From flink with Apache License 2.0
@Test
public void testAllPassingFilter() throws Exception {
	/*
	 * Test all-passing filter.
	 */

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);
	DataSet<Tuple3<Integer, Long, String>> filterDs = ds.filter(new Filter2());
	List<Tuple3<Integer, Long, String>> result = filterDs.collect();

	String expected = "1,1,Hi\n" +
			"2,2,Hello\n" +
			"3,2,Hello world\n" +
			"4,3,Hello world, how are you?\n" +
			"5,3,I am fine.\n" +
			"6,3,Luke Skywalker\n" +
			"7,4,Comment#1\n" +
			"8,4,Comment#2\n" +
			"9,4,Comment#3\n" +
			"10,4,Comment#4\n" +
			"11,5,Comment#5\n" +
			"12,5,Comment#6\n" +
			"13,5,Comment#7\n" +
			"14,5,Comment#8\n" +
			"15,5,Comment#9\n" +
			"16,6,Comment#10\n" +
			"17,6,Comment#11\n" +
			"18,6,Comment#12\n" +
			"19,6,Comment#13\n" +
			"20,6,Comment#14\n" +
			"21,6,Comment#15\n";

	compareResultAsTuples(result, expected);
}
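Filter2 is the mirror image of the all-rejecting Filter1: it accepts every element, so all 21 tuples of the test data set come back. A minimal sketch, with the body assumed:

// Hypothetical Filter2: an all-passing filter; every input tuple is kept.
private static class Filter2 implements FilterFunction<Tuple3<Integer, Long, String>> {
	@Override
	public boolean filter(Tuple3<Integer, Long, String> value) throws Exception {
		return true;
	}
}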
 
Example 8
Source File: EmptyFieldsCountAccumulator.java    From flink with Apache License 2.0
public static void main(final String[] args) throws Exception {

		final ParameterTool params = ParameterTool.fromArgs(args);

		final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		// make parameters available in the web interface
		env.getConfig().setGlobalJobParameters(params);

		// get the data set
		final DataSet<StringTriple> file = getDataSet(env, params);

		// filter lines with empty fields
		final DataSet<StringTriple> filteredLines = file.filter(new EmptyFieldFilter());

		// Here, we could do further processing with the filtered lines...
		JobExecutionResult result;
		// output the filtered lines
		if (params.has("output")) {
			filteredLines.writeAsCsv(params.get("output"));
			// execute program
			result = env.execute("Accumulator example");
		} else {
			System.out.println("Printing result to stdout. Use --output to specify output path.");
			filteredLines.print();
			result = env.getLastJobExecutionResult();
		}

		// get the accumulator result via its registration key
		final List<Integer> emptyFields = result.getAccumulatorResult(EMPTY_FIELD_ACCUMULATOR);
		System.out.format("Number of detected empty fields per column: %s\n", emptyFields);
	}
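EmptyFieldFilter does double duty: it drops records that contain empty fields and, as a RichFilterFunction, registers an accumulator under EMPTY_FIELD_ACCUMULATOR that main() reads back after the job finishes. The condensed sketch below follows that outline; VectorAccumulator is assumed to be a custom Accumulator<Integer, ArrayList<Integer>> defined in the same file:

// Sketch of a RichFilterFunction that counts empty fields per column
// position in an accumulator while filtering them out of the data set.
public static class EmptyFieldFilter extends RichFilterFunction<StringTriple> {
	private final VectorAccumulator emptyFieldCounter = new VectorAccumulator();

	@Override
	public void open(Configuration parameters) throws Exception {
		super.open(parameters);
		// register under the key that main() passes to getAccumulatorResult()
		getRuntimeContext().addAccumulator(EMPTY_FIELD_ACCUMULATOR, this.emptyFieldCounter);
	}

	@Override
	public boolean filter(StringTriple t) {
		boolean containsEmptyFields = false;
		for (int pos = 0; pos < t.getArity(); pos++) {
			String field = t.getField(pos);
			if (field == null || field.trim().isEmpty()) {
				containsEmptyFields = true;
				this.emptyFieldCounter.add(pos); // count per column position
			}
		}
		return !containsEmptyFields;
	}
}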
 
Example 9
Source File: FilterWithLambda.java    From flink with Apache License 2.0
@SuppressWarnings("Convert2MethodRef")
public static void main(String[] args) throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<String> input = env.fromElements("Please filter", "the words", "but not this");

	DataSet<String> output = input.filter((v) -> WordFilter.filter(v));
	output.print();

	env.execute();
}
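The @SuppressWarnings("Convert2MethodRef") annotation silences the IDE hint that (v) -> WordFilter.filter(v) could be shortened to the method reference WordFilter::filter, which is exactly the form used in Example 6; the lambda is presumably spelled out here so the two variants can be compared.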
 
Example 10
Source File: WebLogAnalysis.java    From Flink-CEPplus with Apache License 2.0
public static void main(String[] args) throws Exception {

		final ParameterTool params = ParameterTool.fromArgs(args);

		final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		env.getConfig().setGlobalJobParameters(params);

		// get input data
		DataSet<Tuple2<String, String>> documents = getDocumentsDataSet(env, params);
		DataSet<Tuple3<Integer, String, Integer>> ranks = getRanksDataSet(env, params);
		DataSet<Tuple2<String, String>> visits = getVisitsDataSet(env, params);

		// Retain documents with keywords
		DataSet<Tuple1<String>> filterDocs = documents
				.filter(new FilterDocByKeyWords())
				.project(0);

		// Filter ranks by minimum rank
		DataSet<Tuple3<Integer, String, Integer>> filterRanks = ranks
				.filter(new FilterByRank());

		// Filter visits by visit date
		DataSet<Tuple1<String>> filterVisits = visits
				.filter(new FilterVisitsByDate())
				.project(0);

		// Join the filtered documents and ranks, i.e., get all URLs with min rank and keywords
		DataSet<Tuple3<Integer, String, Integer>> joinDocsRanks =
				filterDocs.join(filterRanks)
							.where(0).equalTo(1)
							.projectSecond(0, 1, 2);

		// Anti-join urls with visits, i.e., retain all URLs which have NOT been visited in a certain time
		DataSet<Tuple3<Integer, String, Integer>> result =
				joinDocsRanks.coGroup(filterVisits)
								.where(1).equalTo(0)
								.with(new AntiJoinVisits());

		// emit result
		if (params.has("output")) {
			result.writeAsCsv(params.get("output"), "\n", "|");
			// execute program
			env.execute("WebLogAnalysis Example");
		} else {
			System.out.println("Printing result to stdout. Use --output to specify output path.");
			result.print();
		}
	}
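FilterDocByKeyWords, FilterByRank, and FilterVisitsByDate encode the analysis thresholds and are not reproduced on this page. As an illustration of their shape, here is a hypothetical sketch of the simplest one, FilterByRank; the field index follows the Tuple3<Integer, String, Integer> layout above, while the threshold value is an assumption:

// Hypothetical FilterByRank: keeps only pages whose rank (field f0 of the
// ranks records) exceeds a minimum value.
public static class FilterByRank implements FilterFunction<Tuple3<Integer, String, Integer>> {
	private static final int RANK_FILTER = 40; // assumed threshold

	@Override
	public boolean filter(Tuple3<Integer, String, Integer> value) throws Exception {
		return value.f0 > RANK_FILTER;
	}
}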
 
Example 11
Source File: TPCDSQuery55CSV.java    From parquet-flinktacular with Apache License 2.0
public static void main(String[] args) throws Exception {

		long startTime = System.currentTimeMillis();

		if (!parseParameters(args)) {
			return;
		}

		final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		// get input data
		DataSet<DateDim> dataDims = getDataDimDataSet(env);
		DataSet<Item> item = getItemDataSet(env);
		DataSet<StoreSales> storeSales = getStoreSalesDataSet(env);

		dataDims = dataDims.filter(
			new FilterFunction<DateDim>() {
				@Override
				public boolean filter(DateDim d) {
					return d.getD_moy() == 11L && d.getD_year() == 1999L;
				}
			});

		item = item.filter(
			new FilterFunction<Item>() {
				@Override
				public boolean filter(Item i) {
					return i.getI_manager_id() == 28L;
				}
			});

		dataDims.join(storeSales).where(0).equalTo(0).with(new DataDimAndStoreSales())
			.join(item).where(1).equalTo(0).with(new DataDimAndStoreSalesAndItems())
			.groupBy(1, 0).aggregate(Aggregations.SUM, 2)
			.print();

		// execute program
		env.execute("TPC-DS Query 55 Example with CSV input");


		System.out.println("Execution time: " + (System.currentTimeMillis() - startTime));
	}
 