Java Code Examples for org.apache.flink.api.java.DataSet#print()

The following examples show how to use org.apache.flink.api.java.DataSet#print(). They are drawn from open source projects; the source file and originating project are noted above each example.
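Before diving into the project examples, here is a minimal, self-contained sketch of what DataSet#print() does. In current versions of the DataSet API, print() is an eager operation: it triggers execution of the program and writes the elements to the standard output of the client, so a separate env.execute() call is only needed when the program defines additional sinks (several of the older examples below still call both). The class name PrintExample and the sample data are made up for illustration.

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;

public class PrintExample {

	public static void main(String[] args) throws Exception {
		final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		// build a small DataSet from in-memory elements
		DataSet<Tuple2<String, Integer>> pairs = env.fromElements(
				Tuple2.of("flink", 1),
				Tuple2.of("dataset", 2),
				Tuple2.of("print", 3));

		// print() runs the program and writes each element to the client's stdout
		pairs.print();

		// no env.execute() here: print() already triggered execution and
		// no further sinks have been defined
	}
}

For assertions in tests, collect() is the companion eager operation: it returns the elements as a List instead of printing them, which is why several of the test examples below call print() followed by collect().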
Example 1
Source File: WordCountWithInnerClass.java    From flink with Apache License 2.0
public static void main(String[] args) throws Exception {
	// set up the execution environment
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	// get input data
	DataSet<String> text = StaticData.getDefaultTextLineDataSet(env);

	DataSet<Tuple2<String, Integer>> counts =
		// split up the lines in pairs (2-tuples) containing: (word,1)
		text.flatMap(new Tokenizer())
			// group by the tuple field "0" and sum up tuple field "1"
			.groupBy(0)
			.sum(1);

	// emit result
	counts.print();

	// execute program
	env.execute("WordCount Example");
}
 
Example 2
Source File: DataFlinkLoaderTest.java    From toolbox with Apache License 2.0
public static void test1() throws Exception {
    //Set-up Flink session.
    Configuration conf = new Configuration();
    conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
    env.getConfig().disableSysoutLogging();
    env.setParallelism(Main.PARALLELISM);

    DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFile(env,
            "../datasets/simulated/test_not_modify/SmallDataSet.arff", false);
    DataSet<DataInstance> data = dataFlink.getDataSet();

    data.print();

    List<DataInstance> instanceList = data.collect();

    assertEquals(16, instanceList.size());
    List<String> names = Arrays.asList("A", "B", "C", "D", "E", "G");
    List<Integer> states = Arrays.asList(2, 3, 2, 2, 2, -1);

    List<Attribute> atts = dataFlink.getAttributes().getListOfNonSpecialAttributes();
    for (int i = 0; i < names.size(); i++) {
        if (Main.VERBOSE) System.out.println(names.get(i));
        assertEquals(atts.get(i).getName(), names.get(i));
        assertEquals(atts.get(i).getNumberOfStates(), states.get(i).intValue());
    }
}
 
Example 3
Source File: EmptyFieldsCountAccumulator.java    From flink with Apache License 2.0
public static void main(final String[] args) throws Exception {

		final ParameterTool params = ParameterTool.fromArgs(args);

		final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		// make parameters available in the web interface
		env.getConfig().setGlobalJobParameters(params);

		// get the data set
		final DataSet<StringTriple> file = getDataSet(env, params);

		// filter lines with empty fields
		final DataSet<StringTriple> filteredLines = file.filter(new EmptyFieldFilter());

		// Here, we could do further processing with the filtered lines...
		JobExecutionResult result;
		// output the filtered lines
		if (params.has("output")) {
			filteredLines.writeAsCsv(params.get("output"));
			// execute program
			result = env.execute("Accumulator example");
		} else {
			System.out.println("Printing result to stdout. Use --output to specify output path.");
			filteredLines.print();
			result = env.getLastJobExecutionResult();
		}

		// get the accumulator result via its registration key
		final List<Integer> emptyFields = result.getAccumulatorResult(EMPTY_FIELD_ACCUMULATOR);
		System.out.format("Number of detected empty fields per column: %s\n", emptyFields);
	}
 
Example 4
Source File: FilterWithIndirection.java    From Flink-CEPplus with Apache License 2.0
public static void main(String[] args) throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSet<String> input = env.fromElements("Please filter", "the words", "but not this");

	DataSet<String> output = input.filter(UtilFunctionWrapper.UtilFunction.getWordFilter());
	output.print();

	env.execute();
}
 
Example 5
Source File: TestParserMapFunctionAvroInline.java    From logparser with Apache License 2.0
@Test
public void testInlineDefinitionAvro() throws Exception {
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<String> input = env.fromElements(TestCase.getInputLine());

    DataSet<Click> filledTestRecords = input
        .map(new RichMapFunction<String, Click>() {
            private Parser<ClickSetter> parser;

            @Override
            public void open(org.apache.flink.configuration.Configuration parameters) {
                parser = new HttpdLoglineParser<>(ClickSetter.class, TestCase.getLogFormat())
                    .addDissector(new ScreenResolutionDissector())
                    .addTypeRemapping("request.firstline.uri.query.g", "HTTP.URI")
                    .addTypeRemapping("request.firstline.uri.query.r", "HTTP.URI")
                    .addTypeRemapping("request.firstline.uri.query.s", "SCREENRESOLUTION")
                    .addDissector(new GeoIPISPDissector(ISP_TEST_MMDB))
                    .addDissector(new GeoIPCityDissector(CITY_TEST_MMDB));
            }

            @Override
            public Click map(String line) throws Exception {
                return parser.parse(line).build();
            }
        }).name("Extract Elements from logline");

    filledTestRecords.print();

    List<Click> result = filledTestRecords.collect();

    assertEquals(1, result.size());
    assertEquals(ExpectedClick.create(), result.get(0));
}
 
Example 6
Source File: TestParserMapFunctionInline.java    From logparser with Apache License 2.0
@Test
public void testInlineDefinition() throws Exception {
    // set up the execution environment
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<String> input = env.fromElements(TestCase.getInputLine());

    DataSet<TestRecord> filledTestRecords = input
        .map(new RichMapFunction<String, TestRecord>() {
            private Parser<TestRecord> parser;

            @Override
            public void open(org.apache.flink.configuration.Configuration parameters) throws Exception {
                parser = TestCase.createTestParser();
            }

            @Override
            public TestRecord map(String line) throws Exception {
                return parser.parse(line);
            }
        }).name("Extract Elements from logline");

    filledTestRecords.print();

    List<TestRecord> result = filledTestRecords.collect();

    assertEquals(1, result.size());
    assertEquals(new TestRecord().setFullValid(), result.get(0));
}
 
Example 7
Source File: GSASingleSourceShortestPaths.java    From flink with Apache License 2.0
public static void main(String[] args) throws Exception {

		if (!parseParameters(args)) {
			return;
		}

		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		DataSet<Edge<Long, Double>> edges = getEdgeDataSet(env);

		Graph<Long, Double, Double> graph = Graph.fromDataSet(edges, new InitVertices(srcVertexId), env);

		// Execute the GSA iteration
		Graph<Long, Double, Double> result = graph.runGatherSumApplyIteration(
				new CalculateDistances(), new ChooseMinDistance(), new UpdateDistance(), maxIterations);

		// Extract the vertices as the result
		DataSet<Vertex<Long, Double>> singleSourceShortestPaths = result.getVertices();

		// emit result
		if (fileOutput) {
			singleSourceShortestPaths.writeAsCsv(outputPath, "\n", ",");

			// since file sinks are lazy, we trigger the execution explicitly
			env.execute("GSA Single Source Shortest Paths");
		} else {
			singleSourceShortestPaths.print();
		}

	}
 
Example 8
Source File: ParquetProtobufExample.java    From parquet-flinktacular with Apache License 2.0
public static void main(String[] args) throws Exception {

		//output
		final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
		DataSet<Tuple2<Void, Person>> data = generateDataSet(env);
		writeProtobuf(data, "newpath");
		data.print();
		env.execute("Parquet output");

		//input
		final ExecutionEnvironment env2 = ExecutionEnvironment.getExecutionEnvironment();
		DataSet<Tuple2<Void, Person.Builder>> input = readProtobuf(env2, "newpath");
		input.map(new TupleToProto()).print();
		env2.execute("Parquet input");
	}
 
Example 9
Source File: EuclideanGraphWeighing.java    From Flink-CEPplus with Apache License 2.0
public static void main(String[] args) throws Exception {

		if (!parseParameters(args)) {
			return;
		}

		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		DataSet<Vertex<Long, Point>> vertices = getVerticesDataSet(env);

		DataSet<Edge<Long, Double>> edges = getEdgesDataSet(env);

		Graph<Long, Point, Double> graph = Graph.fromDataSet(vertices, edges, env);

		// the edge value will be the Euclidean distance between its src and trg vertex
		DataSet<Tuple3<Long, Long, Double>> edgesWithEuclideanWeight = graph.getTriplets()
				.map(new MapFunction<Triplet<Long, Point, Double>, Tuple3<Long, Long, Double>>() {

					@Override
					public Tuple3<Long, Long, Double> map(Triplet<Long, Point, Double> triplet)
							throws Exception {

						Vertex<Long, Point> srcVertex = triplet.getSrcVertex();
						Vertex<Long, Point> trgVertex = triplet.getTrgVertex();

						return new Tuple3<>(srcVertex.getId(), trgVertex.getId(),
							srcVertex.getValue().euclideanDistance(trgVertex.getValue()));
					}
				});

		Graph<Long, Point, Double> resultedGraph = graph.joinWithEdges(edgesWithEuclideanWeight,
				new EdgeJoinFunction<Double, Double>() {

					public Double edgeJoin(Double edgeValue, Double inputValue) {
						return inputValue;
					}
				});

		// retrieve the edges from the final result
		DataSet<Edge<Long, Double>> result = resultedGraph.getEdges();

		// emit result
		if (fileOutput) {
			result.writeAsCsv(outputPath, "\n", ",");

			// since file sinks are lazy, we trigger the execution explicitly
			env.execute("Euclidean Graph Weighing Example");
		} else {
			result.print();
		}

	}
 
Example 10
Source File: DataSetConversionUtilTest.java    From flink with Apache License 2.0
@Test
public void testE2E() throws Exception {
	ExecutionEnvironment env = MLEnvironmentFactory.getDefault().getExecutionEnvironment();

	DataSet<Row> input = env.fromElements(Row.of("a"));

	Table table1 = DataSetConversionUtil.toTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, input, new String[]{"word"});
	Assert.assertEquals(
		new TableSchema(new String[]{"word"}, new TypeInformation[]{TypeInformation.of(String.class)}),
		table1.getSchema()
	);

	DataSet<Row> genericInput1 = input.map(new GenericTypeMap());

	// Force type should go through with explicit type info.
	Table table2 = DataSetConversionUtil.toTable(
		MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID,
		genericInput1,
		new String[]{"word"},
		new TypeInformation[]{TypeInformation.of(Integer.class)}
	);

	Assert.assertEquals(
		new TableSchema(new String[]{"word"}, new TypeInformation[]{TypeInformation.of(Integer.class)}),
		table2.getSchema()
	);

	DataSet<Row> genericInput2 = input.map(new GenericTypeMap());

	// Force type should go through with table schema.
	Table table3 = DataSetConversionUtil.toTable(
		MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID,
		genericInput2,
		new TableSchema(
			new String[]{"word"},
			new TypeInformation[]{TypeInformation.of(Integer.class)}
		)
	);

	Assert.assertEquals(
		new TableSchema(new String[]{"word"}, new TypeInformation[]{TypeInformation.of(Integer.class)}),
		table3.getSchema()
	);

	// applying toTable again on the same input should fail
	thrown.expect(IllegalStateException.class);
	DataSetConversionUtil.toTable(
		MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID,
		genericInput2,
		new TableSchema(
			new String[]{"word"},
			new TypeInformation[]{TypeInformation.of(Integer.class)}
		)
	);

	// Validation should fail without correct type inference.
	DataSet<Row> genericInput3 = input.map(new GenericTypeMap());
	thrown.expect(ValidationException.class);
	DataSetConversionUtil.toTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, genericInput3, new String[]{"word"});

	// Output should go through when using correct type to output.
	DataSet<Row> output = DataSetConversionUtil.fromTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, table1);
	output.print();

	// Output should NOT go through when using incorrect type forcing.
	thrown.expect(ExecutionException.class);
	DataSetConversionUtil.fromTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, table2).print();
}
 
Example 11
Source File: IncrementalSSSP.java    From flink with Apache License 2.0
public static void main(String [] args) throws Exception {

		if (!parseParameters(args)) {
			return;
		}

		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		Edge<Long, Double> edgeToBeRemoved = getEdgeToBeRemoved();

		Graph<Long, Double, Double> graph = IncrementalSSSP.getGraph(env);

		// Assumption: all minimum weight paths are kept
		Graph<Long, Double, Double> ssspGraph = IncrementalSSSP.getSSSPGraph(env);

		// remove the edge
		graph.removeEdge(edgeToBeRemoved);

		// configure the iteration
		ScatterGatherConfiguration parameters = new ScatterGatherConfiguration();

		if (isInSSSP(edgeToBeRemoved, ssspGraph.getEdges())) {

			parameters.setDirection(EdgeDirection.IN);
			parameters.setOptDegrees(true);

			// run the scatter-gather iteration to propagate info
			Graph<Long, Double, Double> result = ssspGraph.runScatterGatherIteration(new InvalidateMessenger(edgeToBeRemoved),
					new VertexDistanceUpdater(), maxIterations, parameters);

			DataSet<Vertex<Long, Double>> resultedVertices = result.getVertices();

			// Emit results
			if (fileOutput) {
				resultedVertices.writeAsCsv(outputPath, "\n", ",");
				env.execute("Incremental SSSP Example");
			} else {
				resultedVertices.print();
			}
		} else {
			// print the vertices
			if (fileOutput) {
				graph.getVertices().writeAsCsv(outputPath, "\n", ",");
				env.execute("Incremental SSSP Example");
			} else {
				graph.getVertices().print();
			}
		}
	}
 
Example 12
Source File: ParallelVBTest.java    From toolbox with Apache License 2.0
public void testingMLParallelPosteriors() throws Exception {

        //Set-up Flink session.
        Configuration conf = new Configuration();
        conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
        final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
        env.getConfig().disableSysoutLogging();
        env.setParallelism(Main.PARALLELISM);

        DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFolder(env,
                "../datasets/simulated/test_not_modify/MONTH1.arff", true);
        //DataFlink<DataInstance> dataStream = DataFlinkLoader.loadDataFromFile(env,
        //        "./datasets/dataStream/test_not_modify/SmallDataSet.arff", false);

        //Structure learning is excluded from the test, i.e., we use directly the initial Asia network structure
        // and just learn then test the parameter learning

        //Parameter Learning
        ParallelVB parallelVB = new ParallelVB();
        parallelVB.setOutput(true);
        parallelVB.setSeed(5);
        parallelVB.setBatchSize(100);
        parallelVB.setLocalThreshold(0.001);
        parallelVB.setGlobalThreshold(0.05);
        parallelVB.setMaximumLocalIterations(100);
        parallelVB.setMaximumGlobalIterations(100);

        DAG dag = DAGGenerator.getHiddenNaiveBayesStructure(dataFlink.getAttributes(), "GlobalHidden", 2);
        if (Main.VERBOSE) System.out.println(dag.toString());
        parallelVB.setDAG(dag);
        parallelVB.initLearning();
        parallelVB.updateModel(dataFlink);
        BayesianNetwork bnet = parallelVB.getLearntBayesianNetwork();

        if (Main.VERBOSE) System.out.println(bnet.toString());

        DataSet<DataPosterior> dataPosteriorDataSet = parallelVB.computePosterior(dataFlink,Arrays.asList(dag.getVariables().getVariableByName("GlobalHidden")));

        dataPosteriorDataSet.print();

        //DataSetSerializer.serializeDataSet(dataPosteriorDataSet, "./datasets/tmp.ser");
        //dataPosteriorDataSet = DataSetSerializer.deserializeDataSet("./datasets/tmp.ser");

        dataPosteriorDataSet.print();
    }
 
Example 13
Source File: PageRank.java    From Flink-CEPplus with Apache License 2.0
public static void main(String[] args) throws Exception {

		ParameterTool params = ParameterTool.fromArgs(args);

		final int numPages = params.getInt("numPages", PageRankData.getNumberOfPages());
		final int maxIterations = params.getInt("iterations", 10);

		// set up execution environment
		final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		// make the parameters available to the web ui
		env.getConfig().setGlobalJobParameters(params);

		// get input data
		DataSet<Long> pagesInput = getPagesDataSet(env, params);
		DataSet<Tuple2<Long, Long>> linksInput = getLinksDataSet(env, params);

		// assign initial rank to pages
		DataSet<Tuple2<Long, Double>> pagesWithRanks = pagesInput.
				map(new RankAssigner((1.0d / numPages)));

		// build adjacency list from link input
		DataSet<Tuple2<Long, Long[]>> adjacencyListInput =
				linksInput.groupBy(0).reduceGroup(new BuildOutgoingEdgeList());

		// set iterative data set
		IterativeDataSet<Tuple2<Long, Double>> iteration = pagesWithRanks.iterate(maxIterations);

		DataSet<Tuple2<Long, Double>> newRanks = iteration
				// join pages with outgoing edges and distribute rank
				.join(adjacencyListInput).where(0).equalTo(0).flatMap(new JoinVertexWithEdgesMatch())
				// collect and sum ranks
				.groupBy(0).aggregate(SUM, 1)
				// apply dampening factor
				.map(new Dampener(DAMPENING_FACTOR, numPages));

		DataSet<Tuple2<Long, Double>> finalPageRanks = iteration.closeWith(
				newRanks,
				newRanks.join(iteration).where(0).equalTo(0)
				// termination condition
				.filter(new EpsilonFilter()));

		// emit result
		if (params.has("output")) {
			finalPageRanks.writeAsCsv(params.get("output"), "\n", " ");
			// execute program
			env.execute("Basic Page Rank Example");
		} else {
			System.out.println("Printing result to stdout. Use --output to specify output path.");
			finalPageRanks.print();
		}
	}
 
Example 14
Source File: EnumTriangles.java    From flink with Apache License 2.0
public static void main(String[] args) throws Exception {

		// Checking input parameters
		final ParameterTool params = ParameterTool.fromArgs(args);

		// set up execution environment
		final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		// make parameters available in the web interface
		env.getConfig().setGlobalJobParameters(params);

		// read input data
		DataSet<Edge> edges;
		if (params.has("edges")) {
			edges = env.readCsvFile(params.get("edges"))
					.fieldDelimiter(" ")
					.includeFields(true, true)
					.types(Integer.class, Integer.class)
					.map(new TupleEdgeConverter());
		} else {
			System.out.println("Executing EnumTriangles example with default edges data set.");
			System.out.println("Use --edges to specify file input.");
			edges = EnumTrianglesData.getDefaultEdgeDataSet(env);
		}

		// project edges by vertex id
		DataSet<Edge> edgesById = edges
				.map(new EdgeByIdProjector());

		DataSet<Triad> triangles = edgesById
				// build triads
				.groupBy(Edge.V1).sortGroup(Edge.V2, Order.ASCENDING).reduceGroup(new TriadBuilder())
				// filter triads
				.join(edgesById).where(Triad.V2, Triad.V3).equalTo(Edge.V1, Edge.V2).with(new TriadFilter());

		// emit result
		if (params.has("output")) {
			triangles.writeAsCsv(params.get("output"), "\n", ",");
			// execute program
			env.execute("Basic Triangle Enumeration Example");
		} else {
			System.out.println("Printing result to stdout. Use --output to specify output path.");
			triangles.print();
		}
	}
 
Example 15
Source File: dVMPv1Test.java    From toolbox with Apache License 2.0
public void testingMLParallelPosteriorsAssignment() throws Exception {

        //Set-up Flink session.
        Configuration conf = new Configuration();
        conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
        final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
        env.getConfig().disableSysoutLogging();
        env.setParallelism(Main.PARALLELISM);

        DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFolder(env,
                "../datasets/simulated/test_not_modify/MONTH1.arff", true);

        //DataFlink<DataInstance> dataStream = DataFlinkLoader.loadDataFromFile(env,
        //        "./datasets/dataStream/test_not_modify/SmallDataSet.arff", false);


        //Structure learning is excluded from the test, i.e., we use directly the initial Asia network structure
        // and just learn then test the parameter learning

        //Parameter Learning
        dVMPv1 parallelVB = new dVMPv1();
        parallelVB.setOutput(true);
        parallelVB.setSeed(5);
        parallelVB.setBatchSize(100);
        parallelVB.setLocalThreshold(0.001);
        parallelVB.setGlobalThreshold(0.05);
        parallelVB.setMaximumLocalIterations(100);
        parallelVB.setMaximumGlobalIterations(100);

        DAG dag = DAGGenerator.getHiddenNaiveBayesStructure(dataFlink.getAttributes(), "GlobalHidden", 2);
        if (Main.VERBOSE) System.out.println(dag.toString());
        parallelVB.setDAG(dag);
        parallelVB.initLearning();
        parallelVB.updateModel(dataFlink);
        BayesianNetwork bnet = parallelVB.getLearntBayesianNetwork();

        if (Main.VERBOSE) System.out.println(bnet.toString());
        List<Variable> list = new ArrayList<>();
        list.add(dag.getVariables().getVariableByName("GlobalHidden"));
        list.add(dag.getVariables().getVariableById(0));

        DataSet<DataPosteriorAssignment> dataPosteriorDataSet = parallelVB.computePosteriorAssignment(dataFlink,list);

        dataPosteriorDataSet.print();


        dataPosteriorDataSet.print();
    }
 
Example 16
Source File: GeoTempFlatMapTest.java    From OSTMap with Apache License 2.0
@Test
public void testFlatMap() throws Exception {

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    ByteBuffer key1 = ByteBuffer.allocate(12);
    key1.putLong(1459350458).putInt(123);
    DataSet<Tuple2<Key,Value>> input = env.fromElements(
            new Tuple2<>(new Key(new Text(key1.array()),new Text("t"),new Text("")),
                    new Value(("{\n" +
                            "  \"created_at\": \"Wed Mar 30 15:07:38 +0000 2016\",\n" +
                            "  \"id\": 715193777833582592,\n" +
                            "  \"user\": {\n" +
                            "    \"id\": 2243967693,\n" +
                            "    \"id_str\": \"2243967693\",\n" +
                            "  },\n" +
                            "  \"geo\": {\n" +
                            "    \"type\": \"Point\",\n" +
                            "    \"coordinates\": [\n" +
                            "      41.00870620,\n" +
                            "      29.21240342\n" +
                            "    ]\n" +
                            "  },\n" +
                            "  \"coordinates\": {\n" +
                            "    \"type\": \"Point\",\n" +
                            "    \"coordinates\": [\n" +
                            "      29.21240342,\n" +
                            "      41.00870620\n" +
                            "    ]\n" +
                            "  },\n" +
                            "  \"place\": {\n" +
                            "    \"id\": \"5e02a0f0d91c76d2\",\n" +
                            "    \"url\": \"https:\\/\\/api.twitter.com\\/1.1\\/geo\\/id\\/5e02a0f0d91c76d2.json\",\n" +
                            "    \"place_type\": \"city\",\n" +
                            "    \"name\": \"\\u0130stanbul\",\n" +
                            "    \"full_name\": \"\\u0130stanbul, T\\u00fcrkiye\",\n" +
                            "    \"country_code\": \"TR\",\n" +
                            "    \"country\": \"T\\u00fcrkiye\",\n" +
                            "    \"bounding_box\": {\n" +
                            "      \"type\": \"Polygon\",\n" +
                            "      \"coordinates\": [\n" +
                            "        [\n" +
                            "          [\n" +
                            "            28.632104,\n" +
                            "            40.802734\n" +
                            "          ],\n" +
                            "          [\n" +
                            "            28.632104,\n" +
                            "            41.239907\n" +
                            "          ],\n" +
                            "          [\n" +
                            "            29.378341,\n" +
                            "            41.239907\n" +
                            "          ],\n" +
                            "          [\n" +
                            "            29.378341,\n" +
                            "            40.802734\n" +
                            "          ]\n" +
                            "        ]\n" +
                            "      ]\n" +
                            "    },\n" +
                            "    \"attributes\": {}\n" +
                            "  },\n" +
                            "  \"timestamp_ms\": \"1459350458950\"\n" +
                            "}\n").getBytes())));

    DataSet<Tuple2<Text,Mutation>> output = input.flatMap(
            new GeoTempFlatMap("table"));

    output.print();
    assertEquals(output.count(), 1);

}
 
Example 17
Source File: WebLogAnalysis.java    From Flink-CEPplus with Apache License 2.0
public static void main(String[] args) throws Exception {

		final ParameterTool params = ParameterTool.fromArgs(args);

		final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		env.getConfig().setGlobalJobParameters(params);

		// get input data
		DataSet<Tuple2<String, String>> documents = getDocumentsDataSet(env, params);
		DataSet<Tuple3<Integer, String, Integer>> ranks = getRanksDataSet(env, params);
		DataSet<Tuple2<String, String>> visits = getVisitsDataSet(env, params);

		// Retain documents with keywords
		DataSet<Tuple1<String>> filterDocs = documents
				.filter(new FilterDocByKeyWords())
				.project(0);

		// Filter ranks by minimum rank
		DataSet<Tuple3<Integer, String, Integer>> filterRanks = ranks
				.filter(new FilterByRank());

		// Filter visits by visit date
		DataSet<Tuple1<String>> filterVisits = visits
				.filter(new FilterVisitsByDate())
				.project(0);

		// Join the filtered documents and ranks, i.e., get all URLs with min rank and keywords
		DataSet<Tuple3<Integer, String, Integer>> joinDocsRanks =
				filterDocs.join(filterRanks)
							.where(0).equalTo(1)
							.projectSecond(0, 1, 2);

		// Anti-join urls with visits, i.e., retain all URLs which have NOT been visited in a certain time
		DataSet<Tuple3<Integer, String, Integer>> result =
				joinDocsRanks.coGroup(filterVisits)
								.where(1).equalTo(0)
								.with(new AntiJoinVisits());

		// emit result
		if (params.has("output")) {
			result.writeAsCsv(params.get("output"), "\n", "|");
			// execute program
			env.execute("WebLogAnalysis Example");
		} else {
			System.out.println("Printing result to stdout. Use --output to specify output path.");
			result.print();
		}
	}
 
Example 18
Source File: dVMPTest.java    From toolbox with Apache License 2.0
public void testingMLParallelPosteriorsAssignment() throws Exception {

        //Set-up Flink session.
        Configuration conf = new Configuration();
        conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
        final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
        env.getConfig().disableSysoutLogging();
        env.setParallelism(Main.PARALLELISM);

        DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFolder(env,
                "../datasets/simulated/test_not_modify/MONTH1.arff", true);

        //DataFlink<DataInstance> dataStream = DataFlinkLoader.loadDataFromFile(env,
        //        "./datasets/dataStream/test_not_modify/SmallDataSet.arff", false);


        //Structure learning is excluded from the test, i.e., we use directly the initial Asia network structure
        // and just learn then test the parameter learning

        //Parameter Learning
        dVMP parallelVB = new dVMP();
        parallelVB.setOutput(true);
        parallelVB.setSeed(5);
        parallelVB.setBatchSize(100);
        parallelVB.setLocalThreshold(0.001);
        parallelVB.setGlobalThreshold(0.05);
        parallelVB.setMaximumLocalIterations(100);
        parallelVB.setMaximumGlobalIterations(100);

        DAG dag = DAGGenerator.getHiddenNaiveBayesStructure(dataFlink.getAttributes(), "GlobalHidden", 2);
        if (Main.VERBOSE) System.out.println(dag.toString());
        parallelVB.setDAG(dag);
        parallelVB.initLearning();
        parallelVB.updateModel(dataFlink);
        BayesianNetwork bnet = parallelVB.getLearntBayesianNetwork();

        if (Main.VERBOSE) System.out.println(bnet.toString());
        List<Variable> list = new ArrayList<>();
        list.add(dag.getVariables().getVariableByName("GlobalHidden"));
        list.add(dag.getVariables().getVariableById(0));

        DataSet<DataPosteriorAssignment> dataPosteriorDataSet = parallelVB.computePosteriorAssignment(dataFlink,list);

        dataPosteriorDataSet.print();


        dataPosteriorDataSet.print();
    }
 
Example 19
Source File: PageRank.java    From flink with Apache License 2.0
public static void main(String[] args) throws Exception {

		ParameterTool params = ParameterTool.fromArgs(args);

		final int numPages = params.getInt("numPages", PageRankData.getNumberOfPages());
		final int maxIterations = params.getInt("iterations", 10);

		// set up execution environment
		final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		// make the parameters available to the web ui
		env.getConfig().setGlobalJobParameters(params);

		// get input data
		DataSet<Long> pagesInput = getPagesDataSet(env, params);
		DataSet<Tuple2<Long, Long>> linksInput = getLinksDataSet(env, params);

		// assign initial rank to pages
		DataSet<Tuple2<Long, Double>> pagesWithRanks = pagesInput.
				map(new RankAssigner((1.0d / numPages)));

		// build adjacency list from link input
		DataSet<Tuple2<Long, Long[]>> adjacencyListInput =
				linksInput.groupBy(0).reduceGroup(new BuildOutgoingEdgeList());

		// set iterative data set
		IterativeDataSet<Tuple2<Long, Double>> iteration = pagesWithRanks.iterate(maxIterations);

		DataSet<Tuple2<Long, Double>> newRanks = iteration
				// join pages with outgoing edges and distribute rank
				.join(adjacencyListInput).where(0).equalTo(0).flatMap(new JoinVertexWithEdgesMatch())
				// collect and sum ranks
				.groupBy(0).aggregate(SUM, 1)
				// apply dampening factor
				.map(new Dampener(DAMPENING_FACTOR, numPages));

		DataSet<Tuple2<Long, Double>> finalPageRanks = iteration.closeWith(
				newRanks,
				newRanks.join(iteration).where(0).equalTo(0)
				// termination condition
				.filter(new EpsilonFilter()));

		// emit result
		if (params.has("output")) {
			finalPageRanks.writeAsCsv(params.get("output"), "\n", " ");
			// execute program
			env.execute("Basic Page Rank Example");
		} else {
			System.out.println("Printing result to stdout. Use --output to specify output path.");
			finalPageRanks.print();
		}
	}