Java Code Examples for org.apache.flink.api.java.DataSet#flatMap()

The following examples show how to use org.apache.flink.api.java.DataSet#flatMap(). Each snippet comes from an open-source project; the source file, originating project, and license are noted above each example.
Example 1
Source File: FlatMapITCase.java    From flink with Apache License 2.0
@Test
public void testDataDuplicatingFlatMap() throws Exception {
	/*
	 * Test data duplicating flatmap
	 */

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<String> ds = CollectionDataSets.getStringDataSet(env);
	DataSet<String> duplicatingFlatMapDs = ds.
			flatMap(new FlatMapper2());

	List<String> result = duplicatingFlatMapDs.collect();

	String expected = "Hi\n" + "HI\n" +
			"Hello\n" + "HELLO\n" +
			"Hello world\n" + "HELLO WORLD\n" +
			"Hello world, how are you?\n" + "HELLO WORLD, HOW ARE YOU?\n" +
			"I am fine.\n" + "I AM FINE.\n" +
			"Luke Skywalker\n" + "LUKE SKYWALKER\n" +
			"Random comment\n" + "RANDOM COMMENT\n" +
			"LOL\n" + "LOL\n";

	compareResultAsText(result, expected);
}
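
The FlatMapper2 UDF referenced above is not shown on this page. Judging from the expected output, it emits each input string unchanged followed by an upper-cased copy. A minimal sketch consistent with that behavior (the actual class in FlatMapITCase may differ in detail; it relies on org.apache.flink.api.common.functions.FlatMapFunction and org.apache.flink.util.Collector):

public static class FlatMapper2 implements FlatMapFunction<String, String> {
	@Override
	public void flatMap(String value, Collector<String> out) throws Exception {
		// Duplicate every record: first the original, then an upper-cased copy.
		out.collect(value);
		out.collect(value.toUpperCase());
	}
}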
 
Example 2
Source File: FlatMapITCase.java    From flink with Apache License 2.0
@Test
public void testNonPassingFlatMap() throws Exception {
	/*
	 * Test non-passing flatmap
	 */

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<String> ds = CollectionDataSets.getStringDataSet(env);
	DataSet<String> nonPassingFlatMapDs = ds.
			flatMap(new FlatMapper1());

	List<String> result = nonPassingFlatMapDs.collect();

	String expected = "\n";

	compareResultAsText(result, expected);
}
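
FlatMapper1 is likewise not shown. Since the expected result is empty, it must be a flatMap that never calls the collector, which is what makes this a "non-passing" flatMap. A hedged sketch of such a function (same FlatMapFunction/Collector imports as in the sketch under Example 1):

public static class FlatMapper1 implements FlatMapFunction<String, String> {
	@Override
	public void flatMap(String value, Collector<String> out) throws Exception {
		// Intentionally never calls out.collect(), so no records reach the output.
	}
}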
 
Example 3
Source File: FlatMapITCase.java    From flink with Apache License 2.0
@Test
public void testFlatMapperIfUDFReturnsInputObjectMultipleTimesWhileChangingIt() throws Exception {
	/*
	 * Test flatmapper if UDF returns input object
	 * multiple times and changes it in between
	 */

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);
	DataSet<Tuple3<Integer, Long, String>> inputObjFlatMapDs = ds.
			flatMap(new FlatMapper6());

	List<Tuple3<Integer, Long, String>> result = inputObjFlatMapDs.collect();

	String expected = "0,1,Hi\n" +
			"0,2,Hello\n" + "1,2,Hello\n" +
			"0,2,Hello world\n" + "1,2,Hello world\n" + "2,2,Hello world\n" +
			"0,3,I am fine.\n" +
			"0,3,Luke Skywalker\n" + "1,3,Luke Skywalker\n" +
			"0,4,Comment#1\n" + "1,4,Comment#1\n" + "2,4,Comment#1\n" +
			"0,4,Comment#3\n" +
			"0,4,Comment#4\n" + "1,4,Comment#4\n" +
			"0,5,Comment#5\n" + "1,5,Comment#5\n" + "2,5,Comment#5\n" +
			"0,5,Comment#7\n" +
			"0,5,Comment#8\n" + "1,5,Comment#8\n" +
			"0,5,Comment#9\n" + "1,5,Comment#9\n" + "2,5,Comment#9\n" +
			"0,6,Comment#11\n" +
			"0,6,Comment#12\n" + "1,6,Comment#12\n" +
			"0,6,Comment#13\n" + "1,6,Comment#13\n" + "2,6,Comment#13\n" +
			"0,6,Comment#15\n";

	compareResultAsTuples(result, expected);
}
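
FlatMapper6 is not shown either. The expected output is consistent with a function that re-emits the same tuple object (f0 % 4) times, overwriting its first field with the emission index between collect() calls, which is exactly the object-reuse scenario the test name describes. A sketch under that assumption (imports: FlatMapFunction, Collector, org.apache.flink.api.java.tuple.Tuple3):

public static class FlatMapper6 implements FlatMapFunction<Tuple3<Integer, Long, String>, Tuple3<Integer, Long, String>> {
	@Override
	public void flatMap(Tuple3<Integer, Long, String> value, Collector<Tuple3<Integer, Long, String>> out) throws Exception {
		// Re-emit the *same* tuple object several times, mutating it in between.
		int numCopies = value.f0 % 4;
		for (int i = 0; i < numCopies; i++) {
			value.f0 = i;
			out.collect(value);
		}
	}
}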
 
Example 4
Source File: HadoopMapFunctionITCase.java    From flink with Apache License 2.0
@Test
public void testNonPassingMapper() throws Exception{
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple2<IntWritable, Text>> ds = HadoopTestData.getKVPairDataSet(env);
	DataSet<Tuple2<IntWritable, Text>> nonPassingFlatMapDs = ds.
			flatMap(new HadoopMapFunction<IntWritable, Text, IntWritable, Text>(new NonPassingMapper()));

	String resultPath = tempFolder.newFile().toURI().toString();

	nonPassingFlatMapDs.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);
	env.execute();

	compareResultsByLinesInMemory("\n", resultPath);
}
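
HadoopMapFunction wraps a classic org.apache.hadoop.mapred.Mapper so it can be used as a Flink flatMap. NonPassingMapper itself is not shown; since the expected output is empty, it is a mapper that never emits a record. A hedged sketch (imports from org.apache.hadoop.io and org.apache.hadoop.mapred; Example 6 below uses the same mapper):

public static class NonPassingMapper implements Mapper<IntWritable, Text, IntWritable, Text> {
	@Override
	public void map(IntWritable key, Text value, OutputCollector<IntWritable, Text> out, Reporter reporter) throws IOException {
		// Never calls out.collect(key, value), so the wrapped flatMap emits nothing.
	}

	@Override
	public void configure(JobConf conf) { }

	@Override
	public void close() throws IOException { }
}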
 
Example 5
Source File: FlatMapITCase.java    From flink with Apache License 2.0
@Test
public void testFlatMapWithVaryingNumberOfEmittedTuples() throws Exception {
	/*
	 * Test flatmap with varying number of emitted tuples
	 */

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);
	DataSet<Tuple3<Integer, Long, String>> varyingTuplesMapDs = ds.
			flatMap(new FlatMapper3());

	List<Tuple3<Integer, Long, String>> result = varyingTuplesMapDs.collect();

	String expected = "1,1,Hi\n" +
			"2,2,Hello\n" + "2,2,Hello\n" +
			"4,3,Hello world, how are you?\n" +
			"5,3,I am fine.\n" + "5,3,I am fine.\n" +
			"7,4,Comment#1\n" +
			"8,4,Comment#2\n" + "8,4,Comment#2\n" +
			"10,4,Comment#4\n" +
			"11,5,Comment#5\n" + "11,5,Comment#5\n" +
			"13,5,Comment#7\n" +
			"14,5,Comment#8\n" + "14,5,Comment#8\n" +
			"16,6,Comment#10\n" +
			"17,6,Comment#11\n" + "17,6,Comment#11\n" +
			"19,6,Comment#13\n" +
			"20,6,Comment#14\n" + "20,6,Comment#14\n";

	compareResultAsTuples(result, expected);
}
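
FlatMapper3 is not shown on this page. The expected output matches a function that emits each tuple (f0 % 3) times, so some inputs produce no output, some one copy, and some two. A sketch consistent with that behavior (Example 8 below uses the same mapper):

public static class FlatMapper3 implements FlatMapFunction<Tuple3<Integer, Long, String>, Tuple3<Integer, Long, String>> {
	@Override
	public void flatMap(Tuple3<Integer, Long, String> value, Collector<Tuple3<Integer, Long, String>> out) throws Exception {
		// Emit zero, one, or two copies of the tuple depending on its first field.
		int numCopies = value.f0 % 3;
		for (int i = 0; i < numCopies; i++) {
			out.collect(value);
		}
	}
}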
 
Example 6
Source File: HadoopMapFunctionITCase.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testNonPassingMapper() throws Exception{
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple2<IntWritable, Text>> ds = HadoopTestData.getKVPairDataSet(env);
	DataSet<Tuple2<IntWritable, Text>> nonPassingFlatMapDs = ds.
			flatMap(new HadoopMapFunction<IntWritable, Text, IntWritable, Text>(new NonPassingMapper()));

	String resultPath = tempFolder.newFile().toURI().toString();

	nonPassingFlatMapDs.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);
	env.execute();

	compareResultsByLinesInMemory("\n", resultPath);
}
 
Example 7
Source File: FlatMapITCase.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testTypeConversionFlatMapperCustomToTuple() throws Exception {
	/*
	 * Test type conversion flatmapper (Custom -> Tuple)
	 */

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<CustomType> ds = CollectionDataSets.getCustomTypeDataSet(env);
	DataSet<Tuple3<Integer, Long, String>> typeConversionFlatMapDs = ds.
			flatMap(new FlatMapper4());

	List<Tuple3<Integer, Long, String>> result = typeConversionFlatMapDs.collect();

	String expected = "1,0,Hi\n" +
			"2,1,Hello\n" +
			"2,2,Hello world\n" +
			"3,3,Hello world, how are you?\n" +
			"3,4,I am fine.\n" +
			"3,5,Luke Skywalker\n" +
			"4,6,Comment#1\n" +
			"4,7,Comment#2\n" +
			"4,8,Comment#3\n" +
			"4,9,Comment#4\n" +
			"5,10,Comment#5\n" +
			"5,11,Comment#6\n" +
			"5,12,Comment#7\n" +
			"5,13,Comment#8\n" +
			"5,14,Comment#9\n" +
			"6,15,Comment#10\n" +
			"6,16,Comment#11\n" +
			"6,17,Comment#12\n" +
			"6,18,Comment#13\n" +
			"6,19,Comment#14\n" +
			"6,20,Comment#15\n";

	compareResultAsTuples(result, expected);
}
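
FlatMapper4 performs the Custom -> Tuple conversion. Assuming CustomType exposes the myInt, myLong, and myString fields used by Flink's CollectionDataSets, the conversion is a plain field-by-field copy; a sketch:

public static class FlatMapper4 implements FlatMapFunction<CustomType, Tuple3<Integer, Long, String>> {
	@Override
	public void flatMap(CustomType value, Collector<Tuple3<Integer, Long, String>> out) throws Exception {
		// Convert the POJO into a Tuple3 by copying its fields one-to-one.
		out.collect(new Tuple3<>(value.myInt, value.myLong, value.myString));
	}
}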
 
Example 8
Source File: FlatMapITCase.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testFlatMapWithVaryingNumberOfEmittedTuples() throws Exception {
	/*
	 * Test flatmap with varying number of emitted tuples
	 */

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);
	DataSet<Tuple3<Integer, Long, String>> varyingTuplesMapDs = ds.
			flatMap(new FlatMapper3());

	List<Tuple3<Integer, Long, String>> result = varyingTuplesMapDs.collect();

	String expected = "1,1,Hi\n" +
			"2,2,Hello\n" + "2,2,Hello\n" +
			"4,3,Hello world, how are you?\n" +
			"5,3,I am fine.\n" + "5,3,I am fine.\n" +
			"7,4,Comment#1\n" +
			"8,4,Comment#2\n" + "8,4,Comment#2\n" +
			"10,4,Comment#4\n" +
			"11,5,Comment#5\n" + "11,5,Comment#5\n" +
			"13,5,Comment#7\n" +
			"14,5,Comment#8\n" + "14,5,Comment#8\n" +
			"16,6,Comment#10\n" +
			"17,6,Comment#11\n" + "17,6,Comment#11\n" +
			"19,6,Comment#13\n" +
			"20,6,Comment#14\n" + "20,6,Comment#14\n";

	compareResultAsTuples(result, expected);
}
 
Example 9
Source File: WordCountWithCollectionITCase.java    From flink with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<String> text = env.fromElements(WordCountData.TEXT);
	DataSet<Tuple2<String, Integer>> words = text.flatMap(new WordCount.Tokenizer());
	DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

	result.output(new LocalCollectionOutputFormat<Tuple2<String, Integer>>(resultsCollected));
	env.execute("Word Count Collection");
}
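
WordCount.Tokenizer is the word-count flatMap from Flink's examples: it lower-cases each line, splits it on non-word characters, and emits a (word, 1) pair per token. A sketch of that tokenizer (Example 10 below runs the same program; imports: FlatMapFunction, Collector, org.apache.flink.api.java.tuple.Tuple2):

public static final class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {
	@Override
	public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
		// Normalize, split into words, and emit one (word, 1) pair per token.
		String[] tokens = value.toLowerCase().split("\\W+");
		for (String token : tokens) {
			if (token.length() > 0) {
				out.collect(new Tuple2<>(token, 1));
			}
		}
	}
}

The subsequent groupBy(0).aggregate(Aggregations.SUM, 1) then sums the second field per word to produce the final counts.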
 
Example 10
Source File: WordCountWithCollectionITCase.java    From Flink-CEPplus with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<String> text = env.fromElements(WordCountData.TEXT);
	DataSet<Tuple2<String, Integer>> words = text.flatMap(new WordCount.Tokenizer());
	DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

	result.output(new LocalCollectionOutputFormat<Tuple2<String, Integer>>(resultsCollected));
	env.execute("Word Count Collection");
}
 
Example 11
Source File: HadoopMapFunctionITCase.java    From flink with Apache License 2.0
@Test
public void testDataDuplicatingMapper() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple2<IntWritable, Text>> ds = HadoopTestData.getKVPairDataSet(env);
	DataSet<Tuple2<IntWritable, Text>> duplicatingFlatMapDs = ds.
			flatMap(new HadoopMapFunction<IntWritable, Text, IntWritable, Text>(new DuplicatingMapper()));

	String resultPath = tempFolder.newFile().toURI().toString();

	duplicatingFlatMapDs.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);
	env.execute();

	String expected = "(1,Hi)\n" + "(1,HI)\n" +
			"(2,Hello)\n" + "(2,HELLO)\n" +
			"(3,Hello world)\n" + "(3,HELLO WORLD)\n" +
			"(4,Hello world, how are you?)\n" + "(4,HELLO WORLD, HOW ARE YOU?)\n" +
			"(5,I am fine.)\n" + "(5,I AM FINE.)\n" +
			"(6,Luke Skywalker)\n" + "(6,LUKE SKYWALKER)\n" +
			"(7,Comment#1)\n" + "(7,COMMENT#1)\n" +
			"(8,Comment#2)\n" + "(8,COMMENT#2)\n" +
			"(9,Comment#3)\n" + "(9,COMMENT#3)\n" +
			"(10,Comment#4)\n" + "(10,COMMENT#4)\n" +
			"(11,Comment#5)\n" + "(11,COMMENT#5)\n" +
			"(12,Comment#6)\n" + "(12,COMMENT#6)\n" +
			"(13,Comment#7)\n" + "(13,COMMENT#7)\n" +
			"(14,Comment#8)\n" + "(14,COMMENT#8)\n" +
			"(15,Comment#9)\n" + "(15,COMMENT#9)\n" +
			"(16,Comment#10)\n" + "(16,COMMENT#10)\n" +
			"(17,Comment#11)\n" + "(17,COMMENT#11)\n" +
			"(18,Comment#12)\n" + "(18,COMMENT#12)\n" +
			"(19,Comment#13)\n" + "(19,COMMENT#13)\n" +
			"(20,Comment#14)\n" + "(20,COMMENT#14)\n" +
			"(21,Comment#15)\n" + "(21,COMMENT#15)\n";

	compareResultsByLinesInMemory(expected, resultPath);
}
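
DuplicatingMapper is not shown; the expected output tells us it emits each key/value pair as-is, followed by a second pair with the text upper-cased. A hedged sketch of such a Hadoop mapper (Example 12 below uses the same class):

public static class DuplicatingMapper implements Mapper<IntWritable, Text, IntWritable, Text> {
	@Override
	public void map(IntWritable key, Text value, OutputCollector<IntWritable, Text> out, Reporter reporter) throws IOException {
		// Emit the record unchanged, then a copy with the text upper-cased.
		out.collect(key, value);
		out.collect(key, new Text(value.toString().toUpperCase()));
	}

	@Override
	public void configure(JobConf conf) { }

	@Override
	public void close() throws IOException { }
}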
 
Example 12
Source File: HadoopMapFunctionITCase.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testDataDuplicatingMapper() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple2<IntWritable, Text>> ds = HadoopTestData.getKVPairDataSet(env);
	DataSet<Tuple2<IntWritable, Text>> duplicatingFlatMapDs = ds.
			flatMap(new HadoopMapFunction<IntWritable, Text, IntWritable, Text>(new DuplicatingMapper()));

	String resultPath = tempFolder.newFile().toURI().toString();

	duplicatingFlatMapDs.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);
	env.execute();

	String expected = "(1,Hi)\n" + "(1,HI)\n" +
			"(2,Hello)\n" + "(2,HELLO)\n" +
			"(3,Hello world)\n" + "(3,HELLO WORLD)\n" +
			"(4,Hello world, how are you?)\n" + "(4,HELLO WORLD, HOW ARE YOU?)\n" +
			"(5,I am fine.)\n" + "(5,I AM FINE.)\n" +
			"(6,Luke Skywalker)\n" + "(6,LUKE SKYWALKER)\n" +
			"(7,Comment#1)\n" + "(7,COMMENT#1)\n" +
			"(8,Comment#2)\n" + "(8,COMMENT#2)\n" +
			"(9,Comment#3)\n" + "(9,COMMENT#3)\n" +
			"(10,Comment#4)\n" + "(10,COMMENT#4)\n" +
			"(11,Comment#5)\n" + "(11,COMMENT#5)\n" +
			"(12,Comment#6)\n" + "(12,COMMENT#6)\n" +
			"(13,Comment#7)\n" + "(13,COMMENT#7)\n" +
			"(14,Comment#8)\n" + "(14,COMMENT#8)\n" +
			"(15,Comment#9)\n" + "(15,COMMENT#9)\n" +
			"(16,Comment#10)\n" + "(16,COMMENT#10)\n" +
			"(17,Comment#11)\n" + "(17,COMMENT#11)\n" +
			"(18,Comment#12)\n" + "(18,COMMENT#12)\n" +
			"(19,Comment#13)\n" + "(19,COMMENT#13)\n" +
			"(20,Comment#14)\n" + "(20,COMMENT#14)\n" +
			"(21,Comment#15)\n" + "(21,COMMENT#15)\n";

	compareResultsByLinesInMemory(expected, resultPath);
}
 
Example 13
Source File: GeoTempConverter.java    From OSTMap with Apache License 2.0
/**
 * run conversion process
 * @param configPath path to config file
 * @throws Exception
 */
public void run(String configPath) throws Exception {

    FlinkEnvManager fem = new FlinkEnvManager(configPath, "GeoTimeConvJob",
            TableIdentifier.RAW_TWITTER_DATA.get(),
            TableIdentifier.GEO_TEMPORAL_INDEX.get());

    DataSet<Tuple2<Key,Value>> rawTwitterDataRows = fem.getDataFromAccumulo();

    DataSet<Tuple2<Text, Mutation>> geoTempMutations = rawTwitterDataRows
            .flatMap(new GeoTempFlatMap(TableIdentifier.GEO_TEMPORAL_INDEX.get()));

    geoTempMutations.output(fem.getHadoopOF());

    fem.getExecutionEnvironment().execute("GeoTimeConvProcess");

}
 
Example 14
Source File: CollectionExecutionIterationTest.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testDeltaIteration() {
	try {
		ExecutionEnvironment env = ExecutionEnvironment.createCollectionsEnvironment();

		@SuppressWarnings("unchecked")
		DataSet<Tuple2<Integer, Integer>> solInput = env.fromElements(
				new Tuple2<Integer, Integer>(1, 0),
				new Tuple2<Integer, Integer>(2, 0),
				new Tuple2<Integer, Integer>(3, 0),
				new Tuple2<Integer, Integer>(4, 0));

		@SuppressWarnings("unchecked")
		DataSet<Tuple1<Integer>> workInput = env.fromElements(
				new Tuple1<Integer>(1),
				new Tuple1<Integer>(2),
				new Tuple1<Integer>(3),
				new Tuple1<Integer>(4));

		// Perform a delta iteration where we add those values to the workset where
		// the second tuple field is smaller than the first tuple field.
		// At the end both tuple fields must be the same.

		DeltaIteration<Tuple2<Integer, Integer>, Tuple1<Integer>> iteration =
			solInput.iterateDelta(workInput, 10, 0);

		DataSet<Tuple2<Integer, Integer>> solDelta = iteration.getSolutionSet().join(
				iteration.getWorkset()).where(0).equalTo(0).with(
				new JoinFunction<Tuple2<Integer, Integer>, Tuple1<Integer>, Tuple2<Integer, Integer>>() {

			@Override
			public Tuple2<Integer, Integer> join(Tuple2<Integer, Integer> first,
					Tuple1<Integer> second) throws Exception {
				return new Tuple2<Integer, Integer>(first.f0, first.f1 + 1);
			}
		});

		DataSet<Tuple1<Integer>> nextWorkset = solDelta.flatMap(
				new FlatMapFunction<Tuple2<Integer, Integer>, Tuple1<Integer>>() {
			@Override
			public void flatMap(Tuple2<Integer, Integer> in, Collector<Tuple1<Integer>>
					out) throws Exception {
				if (in.f1 < in.f0) {
					out.collect(new Tuple1<Integer>(in.f0));
				}
			}
		});

		List<Tuple2<Integer, Integer>> collected = new ArrayList<Tuple2<Integer, Integer>>();

		iteration.closeWith(solDelta, nextWorkset)
				.output(new LocalCollectionOutputFormat<Tuple2<Integer, Integer>>(collected));

		env.execute();

		// verify that both tuple fields are now the same
		for (Tuple2<Integer, Integer> t: collected) {
			assertEquals(t.f0, t.f1);
		}
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}
 
Example 15
Source File: ConnectedComponentsCoGroupTest.java    From flink with Apache License 2.0
public static Plan connectedComponentsWithCoGroup(String[] args) throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(Integer.parseInt(args[0]));

	DataSet<Tuple1<Long>> initialVertices = env.readCsvFile(args[1]).types(Long.class).name(VERTEX_SOURCE);

	DataSet<Tuple2<Long, Long>> edges = env.readCsvFile(args[2]).types(Long.class, Long.class).name(EDGES_SOURCE);

	DataSet<Tuple2<Long, Long>> verticesWithId = initialVertices.flatMap(new DummyMapFunction());

	DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> iteration =
			verticesWithId.iterateDelta(verticesWithId, Integer.parseInt(args[4]), 0).name(ITERATION_NAME);

	DataSet<Tuple2<Long, Long>> joinWithNeighbors = iteration.getWorkset().join(edges)
			.where(0).equalTo(0)
			.with(new DummyJoinFunction()).name(JOIN_NEIGHBORS_MATCH);

	DataSet<Tuple2<Long, Long>> minAndUpdate = joinWithNeighbors.coGroup(iteration.getSolutionSet())
			.where(0).equalTo(0)
			.with(new DummyCoGroupFunction()).name(MIN_ID_AND_UPDATE);

	iteration.closeWith(minAndUpdate, minAndUpdate).writeAsCsv(args[3]).name(SINK);

	return env.createProgramPlan();
}
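
DummyMapFunction turns the Tuple1<Long> vertex ids into the Tuple2<Long, Long> solution-set records. For connected components the natural initialization is to pair every vertex id with itself as its initial component id; its code is not shown here, so the following is only a plausible sketch:

public static class DummyMapFunction implements FlatMapFunction<Tuple1<Long>, Tuple2<Long, Long>> {
	@Override
	public void flatMap(Tuple1<Long> value, Collector<Tuple2<Long, Long>> out) throws Exception {
		// Initialize every vertex with its own id as the component id.
		out.collect(new Tuple2<>(value.f0, value.f0));
	}
}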
 
Example 16
Source File: CsvSourceBatchOp.java    From Alink with Apache License 2.0
@Override
public Table initializeDataSource() {
    final String filePath = getFilePath();
    final String schemaStr = getSchemaStr();
    final String fieldDelim = getFieldDelimiter();
    final String rowDelim = getRowDelimiter();
    final Character quoteChar = getQuoteChar();
    final boolean skipBlankLine = getSkipBlankLine();

    final String[] colNames = CsvUtil.getColNames(schemaStr);
    final TypeInformation[] colTypes = CsvUtil.getColTypes(schemaStr);

    boolean ignoreFirstLine = getIgnoreFirstLine();
    String protocol = "";

    try {
        URL url = new URL(filePath);
        protocol = url.getProtocol();
    } catch (MalformedURLException ignored) {
    }

    DataSet<Row> rows;
    ExecutionEnvironment execEnv = MLEnvironmentFactory.get(getMLEnvironmentId()).getExecutionEnvironment();
    TableSchema dummySchema = new TableSchema(new String[]{"f1"}, new TypeInformation[]{Types.STRING});

    if (protocol.equalsIgnoreCase("http") || protocol.equalsIgnoreCase("https")) {
        HttpFileSplitReader reader = new HttpFileSplitReader(filePath);
        rows = execEnv
            .createInput(new GenericCsvInputFormat(reader, dummySchema.getFieldTypes(), rowDelim, rowDelim, ignoreFirstLine),
                new RowTypeInfo(dummySchema.getFieldTypes(), dummySchema.getFieldNames()))
            .name("http_csv_source");
    } else {
        RowCsvInputFormat inputFormat = new RowCsvInputFormat(
            new Path(filePath), dummySchema.getFieldTypes(), rowDelim, rowDelim, new int[]{0}, true);
        inputFormat.setSkipFirstLineAsHeader(ignoreFirstLine);
        rows = execEnv.createInput(inputFormat).name("csv_source");
    }

    rows = rows.flatMap(new CsvUtil.ParseCsvFunc(colTypes, fieldDelim, quoteChar, skipBlankLine));

    return DataSetConversionUtil.toTable(getMLEnvironmentId(), rows, colNames, colTypes);
}
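
CsvUtil.ParseCsvFunc is Alink's own FlatMapFunction<Row, Row>: it parses the single raw-text column of each input row into the typed output columns and, when skipBlankLine is set, drops blank lines instead of emitting a record. The class is not reproduced here; the following is a much-simplified, hypothetical sketch of that pattern only (not Alink's actual implementation; it ignores quoting and handles just string and double columns; imports: org.apache.flink.types.Row, org.apache.flink.api.common.typeinfo.TypeInformation, org.apache.flink.api.common.typeinfo.Types, java.util.regex.Pattern):

public static class SimpleParseCsvFunc implements FlatMapFunction<Row, Row> {
    private final TypeInformation[] colTypes;  // assumed: Types.STRING or Types.DOUBLE only
    private final String fieldDelim;

    public SimpleParseCsvFunc(TypeInformation[] colTypes, String fieldDelim) {
        this.colTypes = colTypes;
        this.fieldDelim = fieldDelim;
    }

    @Override
    public void flatMap(Row value, Collector<Row> out) throws Exception {
        String line = (String) value.getField(0);
        if (line == null || line.trim().isEmpty()) {
            return; // skipBlankLine behaviour: emit nothing for blank input
        }
        String[] fields = line.split(Pattern.quote(fieldDelim), -1);
        Row parsed = new Row(colTypes.length);
        for (int i = 0; i < colTypes.length && i < fields.length; i++) {
            Object field;
            if (colTypes[i].equals(Types.STRING)) {
                field = fields[i];
            } else {
                field = Double.valueOf(fields[i]);
            }
            parsed.setField(i, field);
        }
        out.collect(parsed);
    }
}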
 
Example 17
Source File: CollectionExecutionIterationTest.java    From flink with Apache License 2.0
@Test
public void testDeltaIteration() {
	try {
		ExecutionEnvironment env = ExecutionEnvironment.createCollectionsEnvironment();

		@SuppressWarnings("unchecked")
		DataSet<Tuple2<Integer, Integer>> solInput = env.fromElements(
				new Tuple2<Integer, Integer>(1, 0),
				new Tuple2<Integer, Integer>(2, 0),
				new Tuple2<Integer, Integer>(3, 0),
				new Tuple2<Integer, Integer>(4, 0));

		@SuppressWarnings("unchecked")
		DataSet<Tuple1<Integer>> workInput = env.fromElements(
				new Tuple1<Integer>(1),
				new Tuple1<Integer>(2),
				new Tuple1<Integer>(3),
				new Tuple1<Integer>(4));

		// Perform a delta iteration where we add those values to the workset where
		// the second tuple field is smaller than the first tuple field.
		// At the end both tuple fields must be the same.

		DeltaIteration<Tuple2<Integer, Integer>, Tuple1<Integer>> iteration =
			solInput.iterateDelta(workInput, 10, 0);

		DataSet<Tuple2<Integer, Integer>> solDelta = iteration.getSolutionSet().join(
				iteration.getWorkset()).where(0).equalTo(0).with(
				new JoinFunction<Tuple2<Integer, Integer>, Tuple1<Integer>, Tuple2<Integer, Integer>>() {

			@Override
			public Tuple2<Integer, Integer> join(Tuple2<Integer, Integer> first,
					Tuple1<Integer> second) throws Exception {
				return new Tuple2<Integer, Integer>(first.f0, first.f1 + 1);
			}
		});

		DataSet<Tuple1<Integer>> nextWorkset = solDelta.flatMap(
				new FlatMapFunction<Tuple2<Integer, Integer>, Tuple1<Integer>>() {
			@Override
			public void flatMap(Tuple2<Integer, Integer> in, Collector<Tuple1<Integer>>
					out) throws Exception {
				if (in.f1 < in.f0) {
					out.collect(new Tuple1<Integer>(in.f0));
				}
			}
		});

		List<Tuple2<Integer, Integer>> collected = new ArrayList<Tuple2<Integer, Integer>>();

		iteration.closeWith(solDelta, nextWorkset)
				.output(new LocalCollectionOutputFormat<Tuple2<Integer, Integer>>(collected));

		env.execute();

		// verify that both tuple fields are now the same
		for (Tuple2<Integer, Integer> t: collected) {
			assertEquals(t.f0, t.f1);
		}
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}
 
Example 18
Source File: CorrelationBatchOp.java    From Alink with Apache License 2.0
@Override
public CorrelationBatchOp linkFrom(BatchOperator<?>... inputs) {
    BatchOperator<?> in = checkAndGetFirst(inputs);

    String[] selectedColNames = this.getParams().get(SELECTED_COLS);

    if (selectedColNames == null) {
        selectedColNames = in.getColNames();
    }

    //check col types must be double or bigint
    TableUtil.assertNumericalCols(in.getSchema(), selectedColNames);

    Method corrType = getMethod();

    if (Method.PEARSON == corrType) {

        DataSet<Tuple2<TableSummary, CorrelationResult>> srt = StatisticsHelper.pearsonCorrelation(in, selectedColNames);

        DataSet<Row> result = srt.
            flatMap(new FlatMapFunction<Tuple2<TableSummary, CorrelationResult>, Row>() {
                @Override
                public void flatMap(Tuple2<TableSummary, CorrelationResult> summary, Collector<Row> collector) {
                    new CorrelationDataConverter().save(summary.f1, collector);
                }
            });


        this.setOutput(result, new CorrelationDataConverter().getModelSchema());
    } else {

        DataSet<Row> data = inputs[0].select(selectedColNames).getDataSet();
        DataSet<Row> rank = SpearmanCorrelation.calcRank(data, false);

        TypeInformation[] colTypes = new TypeInformation[selectedColNames.length];
        for (int i = 0; i < colTypes.length; i++) {
            colTypes[i] = Types.DOUBLE;
        }

        BatchOperator rankOp = new TableSourceBatchOp(DataSetConversionUtil.toTable(getMLEnvironmentId(), rank, selectedColNames, colTypes))
            .setMLEnvironmentId(getMLEnvironmentId());

        CorrelationBatchOp corrBatchOp = new CorrelationBatchOp()
            .setMLEnvironmentId(getMLEnvironmentId())
            .setSelectedCols(selectedColNames);

        rankOp.link(corrBatchOp);

        this.setOutput(corrBatchOp.getDataSet(), corrBatchOp.getSchema());

    }

    return this;
}
 
Example 19
Source File: Calculator.java    From OSTMap with Apache License 2.0
/**
 * run area calculation process
 * @param path path to config file
 * @throws Exception
 */
public void run(String path) throws Exception {

    readConfig(path);

    FlinkEnvManager fem = new FlinkEnvManager(path, "areaJob",
            TableIdentifier.RAW_TWITTER_DATA.get(),
            "HighScore");

    DataSet<Tuple2<Key,Value>> rawTwitterDataRows = fem.getDataFromAccumulo();

    DataSet<Tuple2<String,String>> geoList = rawTwitterDataRows.flatMap(new GeoExtrationFlatMap());

    DataSet<Tuple2<String,String>> reducedGroup = geoList
                                                    .groupBy(0)
                                                    .reduceGroup(new CoordGroupReduce());

    DataSet<Tuple3<String,Double,Integer>> userRanking = reducedGroup.flatMap(new GeoCalcFlatMap())
            .sortPartition(1, Order.DESCENDING).setParallelism(1);

    DataSet<Tuple2<Text,Mutation>> topTen = userRanking
            .groupBy(2)
            .reduceGroup(new TopTenGroupReduce("ac"));

    topTen.output(fem.getHadoopOF());

    fem.getExecutionEnvironment().execute("AreaProcess");

    // Note: this TextOutputFormat is created and configured but never attached to a
    // data set; the actual write below goes through writeAsText(...) directly.
    TextOutputFormat<String> tof = new TextOutputFormat<>(new Path("file:///tmp/areauserranking"));
    tof.setWriteMode(FileSystem.WriteMode.OVERWRITE);

    userRanking.writeAsText("file:///tmp/areauserranking", FileSystem.WriteMode.OVERWRITE).setParallelism(1);



    fem.getExecutionEnvironment().execute("AreaCalculationProcess");

}
 
Example 20
Source File: PathCalculator.java    From OSTMap with Apache License 2.0
/**
 * run path calculation process
 * @param path path to config file
 * @throws Exception
 */
public void run(String path) throws Exception {

    readConfig(path);

    FlinkEnvManager fem = new FlinkEnvManager(path, "pathJob",
            TableIdentifier.RAW_TWITTER_DATA.get(),
            "HighScore");


    DataSet<Tuple2<Key,Value>> rawTwitterDataRows = fem.getDataFromAccumulo();

    DataSet<Tuple2<String,String>> geoList = rawTwitterDataRows.flatMap(new PathGeoExtrationFlatMap());

    DataSet<Tuple2<String,String>> reducedGroup = geoList
                                                    .groupBy(0)
                                                    .reduceGroup(new PathCoordGroupReduce());

    DataSet<Tuple3<String,Double,Integer>> userRanking = reducedGroup.flatMap(new PathGeoCalcFlatMap())
            .sortPartition(1, Order.DESCENDING).setParallelism(1);

    DataSet<Tuple2<Text,Mutation>> topTen = userRanking
                                                    .groupBy(2)
                                                    .reduceGroup(new TopTenGroupReduce("td"));

    topTen.output(fem.getHadoopOF());

    fem.getExecutionEnvironment().execute("PathProcess");

    TextOutputFormat<String> tof = new TextOutputFormat<>(new Path("file:///tmp/pathuserranking"));
    tof.setWriteMode(FileSystem.WriteMode.OVERWRITE);

    userRanking.writeAsText("file:///tmp/pathuserranking", FileSystem.WriteMode.OVERWRITE).setParallelism(1);



    fem.getExecutionEnvironment().execute("PathCalculationProcess");

}