Java Code Examples for org.apache.flink.api.java.ExecutionEnvironment#createLocalEnvironment()

The following examples show how to use org.apache.flink.api.java.ExecutionEnvironment#createLocalEnvironment(). They are taken from open-source projects; the source file, project, and license are listed above each example.
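Before the project examples, here is a minimal, self-contained sketch of the three createLocalEnvironment() overloads they exercise: no arguments, an explicit parallelism, and a custom Configuration. The class name and the sample values (parallelism 4, 12000 network buffers) are illustrative, not taken from any of the projects below.

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.configuration.Configuration;

public class LocalEnvironmentSketch {

    public static void main(String[] args) throws Exception {
        // No arguments: local environment whose default parallelism
        // is the number of available CPU cores.
        ExecutionEnvironment defaultEnv = ExecutionEnvironment.createLocalEnvironment();

        // Explicit parallelism for all operators that set none themselves.
        ExecutionEnvironment fixedEnv = ExecutionEnvironment.createLocalEnvironment(4);

        // Custom Configuration, as in the toolbox tests below, which raise
        // the number of network buffers for shuffle-heavy local jobs.
        Configuration conf = new Configuration();
        conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
        ExecutionEnvironment configuredEnv = ExecutionEnvironment.createLocalEnvironment(conf);
        configuredEnv.setParallelism(4);

        // Any of the three environments runs DataSet programs in-process.
        DataSet<String> words = configuredEnv.fromElements("flink", "local", "environment");
        words.print(); // print() triggers execution of the local plan
    }
}

The Configuration overload is the one the toolbox tests rely on, presumably to avoid exhausting network buffers when running highly parallel jobs in a single local process.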
Example 1
Source File: StochasticVITest.java    From toolbox with Apache License 2.0
public static void testGaussian1() throws IOException, ClassNotFoundException {


        //for (int i = 2; i <3; i++) {
        BayesianNetwork bn = BayesianNetworkLoader.loadFromFile("../networks/simulated/Normal_MultinomialParents.bn");
        //bn.randomInitialization(new Random(0));

        BayesianNetworkSampler sampler = new BayesianNetworkSampler(bn);
        sampler.setSeed(2);
        DataStream<DataInstance> data = sampler.sampleToDataStream(10000);

        //Set-up Flink session.
        Configuration conf = new Configuration();
        conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
        final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
        env.getConfig().disableSysoutLogging();
        env.setParallelism(Main.PARALLELISM);

        baseTest(env, data, bn, 10000, 1000, 0.2);

        //}
    }
 
Example 2
Source File: dVMPTest.java    From toolbox with Apache License 2.0
public static void testGaussian1() throws IOException, ClassNotFoundException {


        //for (int i = 2; i <3; i++) {
            BayesianNetwork bn = BayesianNetworkLoader.loadFromFile("../networks/simulated/Normal_MultinomialParents.bn");
            //bn.randomInitialization(new Random(0));

            BayesianNetworkSampler sampler = new BayesianNetworkSampler(bn);
            sampler.setSeed(2);
            DataStream<DataInstance> data = sampler.sampleToDataStream(10000);

            //Set-up Flink session.
            Configuration conf = new Configuration();
            conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
            final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
            env.getConfig().disableSysoutLogging();
            env.setParallelism(Main.PARALLELISM);

            baseTest(env, data, bn, 1000, 0.2);

        //}
    }
 
Example 3
Source File: PropertyDataSourceTest.java    From Flink-CEPplus with Apache License 2.0
@Test
public void checkSinglePartitionedOrderedSource2() {

	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	DataSource<Tuple2<Long, String>> data =
			env.readCsvFile("/some/path").types(Long.class, String.class);

	data.getSplitDataProperties()
			.splitsPartitionedBy(1)
			.splitsOrderedBy(new int[]{1, 0}, new Order[]{Order.ASCENDING, Order.DESCENDING});

	data.output(new DiscardingOutputFormat<Tuple2<Long, String>>());

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);

	// check the optimized Plan
	SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
	SourcePlanNode sourceNode = (SourcePlanNode) sinkNode.getPredecessor();

	GlobalProperties gprops = sourceNode.getGlobalProperties();
	LocalProperties lprops = sourceNode.getLocalProperties();

	Assert.assertTrue((new FieldSet(gprops.getPartitioningFields().toArray())).equals(new FieldSet(1)));
	Assert.assertTrue(gprops.getPartitioning() == PartitioningProperty.ANY_PARTITIONING);
	Assert.assertTrue((new FieldSet(lprops.getGroupedFields().toArray())).equals(new FieldSet(1, 0)));
	Assert.assertTrue(lprops.getOrdering() == null);

}
 
Example 4
Source File: ReplicatingDataSourceTest.java    From Flink-CEPplus with Apache License 2.0
/**
 * Tests join program with replicated data source behind multiple map ops.
 */
@Test
public void checkJoinWithReplicatedSourceInputBehindMultiMaps() {

	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
	ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
			new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

	DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
	DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

	DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
			.filter(new NoFilter())
			.mapPartition(new IdPMap())
			.flatMap(new IdFlatMap())
			.map(new IdMap())
			.join(source2).where("*").equalTo("*")
			.writeAsText("/some/newpath");

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);

	// check the optimized Plan
	// when join should have forward strategy on both sides
	SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
	DualInputPlanNode joinNode = (DualInputPlanNode) sinkNode.getPredecessor();

	ShipStrategyType joinIn1 = joinNode.getInput1().getShipStrategy();
	ShipStrategyType joinIn2 = joinNode.getInput2().getShipStrategy();

	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn1);
	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn2);
}
 
Example 5
Source File: ReplicatingDataSourceTest.java    From flink with Apache License 2.0
/**
 * Tests join program with replicated data source behind map.
 */
@Test
public void checkJoinWithReplicatedSourceInputBehindMap() {

	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
	ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
			new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

	DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
	DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

	DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
			.map(new IdMap())
			.join(source2).where("*").equalTo("*")
			.writeAsText("/some/newpath");

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);

	// check the optimized Plan
	// when join should have forward strategy on both sides
	SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
	DualInputPlanNode joinNode = (DualInputPlanNode) sinkNode.getPredecessor();

	ShipStrategyType joinIn1 = joinNode.getInput1().getShipStrategy();
	ShipStrategyType joinIn2 = joinNode.getInput2().getShipStrategy();

	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn1);
	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn2);
}
 
Example 6
Source File: ReplicatingDataSourceTest.java    From flink with Apache License 2.0
/**
 * Tests join program with replicated data source behind flatMap.
 */
@Test
public void checkJoinWithReplicatedSourceInputBehindFlatMap() {

	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
	ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
			new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

	DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
	DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

	DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
			.flatMap(new IdFlatMap())
			.join(source2).where("*").equalTo("*")
			.writeAsText("/some/newpath");

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);

	// check the optimized Plan
	// when join should have forward strategy on both sides
	SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
	DualInputPlanNode joinNode = (DualInputPlanNode) sinkNode.getPredecessor();

	ShipStrategyType joinIn1 = joinNode.getInput1().getShipStrategy();
	ShipStrategyType joinIn2 = joinNode.getInput2().getShipStrategy();

	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn1);
	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn2);
}
 
Example 7
Source File: PropertyDataSourceTest.java    From flink with Apache License 2.0
@Test
public void checkSinglePartitionedGroupedSource5() {

	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	DataSource<Tuple3<Long, SomePojo, String>> data = env.fromCollection(tuple3PojoData, tuple3PojoType);

	data.getSplitDataProperties()
			.splitsPartitionedBy("f2")
			.splitsGroupedBy("f2");

	data.output(new DiscardingOutputFormat<Tuple3<Long, SomePojo, String>>());

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);

	// check the optimized Plan
	SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
	SourcePlanNode sourceNode = (SourcePlanNode) sinkNode.getPredecessor();

	GlobalProperties gprops = sourceNode.getGlobalProperties();
	LocalProperties lprops = sourceNode.getLocalProperties();

	Assert.assertTrue((new FieldSet(gprops.getPartitioningFields().toArray())).equals(new FieldSet(4)));
	Assert.assertTrue(gprops.getPartitioning() == PartitioningProperty.ANY_PARTITIONING);
	Assert.assertTrue(new FieldSet(lprops.getGroupedFields().toArray()).equals(new FieldSet(4)));
	Assert.assertTrue(lprops.getOrdering() == null);

}
 
Example 8
Source File: dVMPv1Test.java    From toolbox with Apache License 2.0
public static void testMultinomials2() throws IOException, ClassNotFoundException {
    Variables variables = new Variables();
    Variable varA = variables.newMultinomialVariable("A", 2);
    Variable varB = variables.newMultinomialVariable("B", 2);

    DAG dag = new DAG(variables);

    dag.getParentSet(varB).addParent(varA);

    BayesianNetwork bn = new BayesianNetwork(dag);

    Multinomial distA = bn.getConditionalDistribution(varA);
    Multinomial_MultinomialParents distB = bn.getConditionalDistribution(varB);

    distA.setProbabilities(new double[]{0.6, 0.4});
    distB.getMultinomial(0).setProbabilities(new double[]{0.75, 0.25});
    distB.getMultinomial(1).setProbabilities(new double[]{0.25, 0.75});

    BayesianNetworkSampler sampler = new BayesianNetworkSampler(bn);
    sampler.setSeed(2);
    DataStream<DataInstance> data = sampler.sampleToDataStream(1000);

    //Set-up Flink session.
    Configuration conf = new Configuration();
    conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
    env.getConfig().disableSysoutLogging();
    env.setParallelism(Main.PARALLELISM);

    baseTest(env, data, bn, 100, 0.05);

}
 
Example 9
Source File: PropertyDataSourceTest.java    From flink with Apache License 2.0
@Test
public void checkSinglePartitionedGroupedSource2() {

	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	DataSource<Tuple2<Long, String>> data =
			env.readCsvFile("/some/path").types(Long.class, String.class);

	data.getSplitDataProperties()
			.splitsPartitionedBy(0)
			.splitsGroupedBy(1, 0);

	data.output(new DiscardingOutputFormat<Tuple2<Long, String>>());

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);

	// check the optimized Plan
	SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
	SourcePlanNode sourceNode = (SourcePlanNode) sinkNode.getPredecessor();

	GlobalProperties gprops = sourceNode.getGlobalProperties();
	LocalProperties lprops = sourceNode.getLocalProperties();

	Assert.assertTrue((new FieldSet(gprops.getPartitioningFields().toArray())).equals(new FieldSet(0)));
	Assert.assertTrue(gprops.getPartitioning() == PartitioningProperty.ANY_PARTITIONING);
	Assert.assertTrue(new FieldSet(lprops.getGroupedFields().toArray()).equals(new FieldSet(0, 1)));
	Assert.assertTrue(lprops.getOrdering() == null);

}
 
Example 10
Source File: ReplicatingDataSourceTest.java    From flink with Apache License 2.0
/**
 * Tests join program with replicated data source behind map partition.
 */
@Test
public void checkJoinWithReplicatedSourceInputBehindMapPartition() {

	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	TupleTypeInfo<Tuple1<String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class);
	ReplicatingInputFormat<Tuple1<String>, FileInputSplit> rif =
			new ReplicatingInputFormat<Tuple1<String>, FileInputSplit>(new TupleCsvInputFormat<Tuple1<String>>(new Path("/some/path"), typeInfo));

	DataSet<Tuple1<String>> source1 = env.createInput(rif, new TupleTypeInfo<Tuple1<String>>(BasicTypeInfo.STRING_TYPE_INFO));
	DataSet<Tuple1<String>> source2 = env.readCsvFile("/some/otherpath").types(String.class);

	DataSink<Tuple2<Tuple1<String>, Tuple1<String>>> out = source1
			.mapPartition(new IdPMap())
			.join(source2).where("*").equalTo("*")
			.writeAsText("/some/newpath");

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);

	// check the optimized Plan
	// when join should have forward strategy on both sides
	SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
	DualInputPlanNode joinNode = (DualInputPlanNode) sinkNode.getPredecessor();

	ShipStrategyType joinIn1 = joinNode.getInput1().getShipStrategy();
	ShipStrategyType joinIn2 = joinNode.getInput2().getShipStrategy();

	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn1);
	Assert.assertEquals("Invalid ship strategy for an operator.", ShipStrategyType.FORWARD, joinIn2);
}
 
Example 11
Source File: DistinctTranslationTest.java    From flink with Apache License 2.0
@Test
public void translateDistinctExpressionKey() {
	try {
		final int parallelism = 8;
		ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(parallelism);

		DataSet<CustomType> initialData = getSourcePojoDataSet(env);

		initialData.distinct("myInt").output(new DiscardingOutputFormat<CustomType>());

		Plan p = env.createProgramPlan();

		GenericDataSinkBase<?> sink = p.getDataSinks().iterator().next();

		// currently distinct is translated to a Reduce
		ReduceOperatorBase<?, ?> reducer = (ReduceOperatorBase<?, ?>) sink.getInput();

		// check types
		assertEquals(initialData.getType(), reducer.getOperatorInfo().getInputType());
		assertEquals(initialData.getType(), reducer.getOperatorInfo().getOutputType());

		// check keys
		assertArrayEquals(new int[] {0}, reducer.getKeyColumns(0));

		// parallelism was not configured on the operator
		assertTrue(reducer.getParallelism() == 1 || reducer.getParallelism() == -1);

		assertTrue(reducer.getInput() instanceof GenericDataSourceBase<?, ?>);
	}
	catch (Exception e) {
		System.err.println(e.getMessage());
		e.printStackTrace();
		fail("Test caused an error: " + e.getMessage());
	}
}
 
Example 12
Source File: PropertyDataSourceTest.java    From flink with Apache License 2.0
@Test
public void checkSinglePartitionedOrderedSource6() {

	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	DataSource<Tuple3<Long, SomePojo, String>> data = env.fromCollection(tuple3PojoData, tuple3PojoType);

	data.getSplitDataProperties()
			.splitsPartitionedBy("f1.intField")
			.splitsOrderedBy("f1", new Order[]{Order.DESCENDING});

	data.output(new DiscardingOutputFormat<Tuple3<Long, SomePojo, String>>());

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);

	// check the optimized Plan
	SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
	SourcePlanNode sourceNode = (SourcePlanNode) sinkNode.getPredecessor();

	GlobalProperties gprops = sourceNode.getGlobalProperties();
	LocalProperties lprops = sourceNode.getLocalProperties();

	Assert.assertTrue((new FieldSet(gprops.getPartitioningFields().toArray())).equals(new FieldSet(2)));
	Assert.assertTrue(gprops.getPartitioning() == PartitioningProperty.ANY_PARTITIONING);
	Assert.assertTrue(new FieldSet(lprops.getGroupedFields().toArray()).equals(new FieldSet(1,2,3)));
	Assert.assertTrue(lprops.getOrdering() == null);

}
 
Example 13
Source File: PropertyDataSourceTest.java    From flink with Apache License 2.0
@Test
public void checkSinglePartitionedSource2() {

	ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	DataSource<Tuple2<Long, String>> data =
			env.readCsvFile("/some/path").types(Long.class, String.class);

	data.getSplitDataProperties()
			.splitsPartitionedBy(1, 0);

	data.output(new DiscardingOutputFormat<Tuple2<Long,String>>());

	Plan plan = env.createProgramPlan();

	// submit the plan to the compiler
	OptimizedPlan oPlan = compileNoStats(plan);

	// check the optimized Plan
	SinkPlanNode sinkNode = oPlan.getDataSinks().iterator().next();
	SourcePlanNode sourceNode = (SourcePlanNode) sinkNode.getPredecessor();

	GlobalProperties gprops = sourceNode.getGlobalProperties();
	LocalProperties lprops = sourceNode.getLocalProperties();

	Assert.assertTrue((new FieldSet(gprops.getPartitioningFields().toArray())).equals(new FieldSet(0, 1)));
	Assert.assertTrue(gprops.getPartitioning() == PartitioningProperty.ANY_PARTITIONING);
	Assert.assertTrue(lprops.getGroupedFields() == null);
	Assert.assertTrue(lprops.getOrdering() == null);

}
 
Example 14
Source File: dVMPTest.java    From toolbox with Apache License 2.0
public void testingMLParallelAsiaWithUpdate() throws IOException, ClassNotFoundException {

        //Set-up Flink session.
        Configuration conf = new Configuration();
        conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
        final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
        env.getConfig().disableSysoutLogging();
        env.setParallelism(Main.PARALLELISM);

        // load the true Asia Bayesian network
        BayesianNetwork asianet = BayesianNetworkLoader.loadFromFile("../networks/dataWeka/asia.bn");
        asianet.randomInitialization(new Random(0));
        if (Main.VERBOSE) System.out.println("\nAsia network \n ");
        //if (Main.VERBOSE) System.out.println(asianet.getDAG().outputString());
        if (Main.VERBOSE) System.out.println(asianet.toString());

        //Sampling from Asia BN
        BayesianNetworkSampler sampler = new BayesianNetworkSampler(asianet);
        sampler.setSeed(0);
        //Load the sampled data
        DataStream<DataInstance> data = sampler.sampleToDataStream(10000);

        DataStreamWriter.writeDataToFile(data, "../datasets/simulated/tmp.arff");

        DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFile(env, "../datasets/simulated/tmp.arff", false);

        //Structure learning is excluded from the test, i.e., we directly use the initial Asia network structure
        //and only learn and test the parameters

        //Parameter Learning
        dVMP parallelVB = new dVMP();
        parallelVB.setOutput(true);
        parallelVB.setSeed(5);
        parallelVB.setBatchSize(1000);
        parallelVB.setLocalThreshold(0.001);
        parallelVB.setGlobalThreshold(0.05);
        parallelVB.setMaximumLocalIterations(100);
        parallelVB.setMaximumGlobalIterations(100);


        parallelVB.setDAG(asianet.getDAG());
        parallelVB.initLearning();
        parallelVB.updateModel(dataFlink);
        BayesianNetwork bnet = parallelVB.getLearntBayesianNetwork();

        //Check the probability distribution of each node against the true network
        for (Variable var : asianet.getVariables()) {
            if (Main.VERBOSE) System.out.println("\n------ Variable " + var.getName() + " ------");
            if (Main.VERBOSE) System.out.println("\nTrue distribution:\n" + asianet.getConditionalDistribution(var));
            if (Main.VERBOSE) System.out.println("\nLearned distribution:\n" + bnet.getConditionalDistribution(var));
            Assert.assertTrue(bnet.getConditionalDistribution(var).equalDist(asianet.getConditionalDistribution(var), 0.05));
        }

        //Or check directly if the true and learned networks are equals
        Assert.assertTrue(bnet.equalBNs(asianet, 0.05));
    }
 
Example 15
Source File: UnionTranslationTest.java    From flink with Apache License 2.0
@Test
public void translateUnion3SortedGroup() {
	try {
		final int parallelism = 4;
		ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(parallelism);

		DataSet<Tuple3<Double, StringValue, LongValue>> dataset1 = getSourceDataSet(env, 2);

		DataSet<Tuple3<Double, StringValue, LongValue>> dataset2 = getSourceDataSet(env, 3);

		DataSet<Tuple3<Double, StringValue, LongValue>> dataset3 = getSourceDataSet(env, -1);

		dataset1.union(dataset2).union(dataset3)
				.groupBy((KeySelector<Tuple3<Double, StringValue, LongValue>, String>) value -> "")
				.sortGroup((KeySelector<Tuple3<Double, StringValue, LongValue>, String>) value -> "", Order.ASCENDING)
				.reduceGroup((GroupReduceFunction<Tuple3<Double, StringValue, LongValue>, String>) (values, out) -> {})
				.returns(String.class)
				.output(new DiscardingOutputFormat<>());

		Plan p = env.createProgramPlan();

		// The plan should look like the following one.
		//
		// DataSet1(2) - MapOperator(2)-+
		//	                            |- Union(-1) -+
		// DataSet2(3) - MapOperator(3)-+             |- Union(-1) - SingleInputOperator - Sink
		//                                            |
		//             DataSet3(-1) - MapOperator(-1)-+

		GenericDataSinkBase<?> sink = p.getDataSinks().iterator().next();
		Union secondUnionOperator = (Union) ((SingleInputOperator) sink.getInput()).getInput();

		// The first input of the second union should be the first union.
		Union firstUnionOperator = (Union) secondUnionOperator.getFirstInput();

		// The key mapper should be added to the second input stream of the second union.
		assertTrue(secondUnionOperator.getSecondInput() instanceof MapOperatorBase<?, ?, ?>);

		// The key mappers should be added to both of the two input streams for the first union.
		assertTrue(firstUnionOperator.getFirstInput() instanceof MapOperatorBase<?, ?, ?>);
		assertTrue(firstUnionOperator.getSecondInput() instanceof MapOperatorBase<?, ?, ?>);

		// The parallelisms of the key mappers should be equal to those of their inputs.
		assertEquals(firstUnionOperator.getFirstInput().getParallelism(), 2);
		assertEquals(firstUnionOperator.getSecondInput().getParallelism(), 3);
		assertEquals(secondUnionOperator.getSecondInput().getParallelism(), -1);

		// The union should always have the default parallelism.
		assertEquals(secondUnionOperator.getParallelism(), ExecutionConfig.PARALLELISM_DEFAULT);
		assertEquals(firstUnionOperator.getParallelism(), ExecutionConfig.PARALLELISM_DEFAULT);
	}
	catch (Exception e) {
		System.err.println(e.getMessage());
		e.printStackTrace();
		fail("Test caused an error: " + e.getMessage());
	}
}
 
Example 16
Source File: dVMPTest.java    From toolbox with Apache License 2.0
public void testingMLParallelWasteHidden() throws IOException, ClassNotFoundException {
    //Set-up Flink session.
    Configuration conf = new Configuration();
    conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
    env.getConfig().disableSysoutLogging();
    env.setParallelism(Main.PARALLELISM);


    // load the true WasteIncinerator Bayesian network
    BayesianNetwork wasteIncinerator = BayesianNetworkLoader.loadFromFile("../networks/simulated/WasteIncinerator.bn");
    wasteIncinerator.randomInitialization(new Random(0));
    if (Main.VERBOSE) System.out.println("\nAsia network \n ");
    //if (Main.VERBOSE) System.out.println(asianet.getDAG().outputString());
    if (Main.VERBOSE) System.out.println(wasteIncinerator.toString());

    //Sampling from WasteIncinerator BN
    BayesianNetworkSampler sampler = new BayesianNetworkSampler(wasteIncinerator);
    sampler.setSeed(0);
    //Load the sampled data
    DataStream<DataInstance> data = sampler.sampleToDataStream(1000);
    sampler.setHiddenVar(wasteIncinerator.getVariables().getVariableById(6));
    DataStreamWriter.writeDataToFile(data, "../datasets/simulated/tmp.arff");

    //We load the data
    DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFile(env, "../datasets/simulated/tmp.arff", false);


    //ParallelVB is defined
    dVMP parallelVB = new dVMP();
    parallelVB.setOutput(true);
    parallelVB.setSeed(5);
    parallelVB.setBatchSize(100);
    parallelVB.setLocalThreshold(0.001);
    parallelVB.setGlobalThreshold(0.001);
    parallelVB.setMaximumLocalIterations(100);
    parallelVB.setMaximumGlobalIterations(100);

    //Setting DAG
    parallelVB.setDAG(wasteIncinerator.getDAG());


    //Setting the distributed data source
    parallelVB.initLearning();
    parallelVB.updateModel(dataFlink);

    BayesianNetwork bnet = parallelVB.getLearntBayesianNetwork();

    if (Main.VERBOSE) System.out.println(bnet.toString());
}
 
Example 17
Source File: dVMPv1Test.java    From toolbox with Apache License 2.0
public void testingMLParallelAsiaHidden() throws IOException, ClassNotFoundException {

        //Set-up Flink session.
        Configuration conf = new Configuration();
        conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
        final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
        env.getConfig().disableSysoutLogging();
        env.setParallelism(Main.PARALLELISM);

        // load the true Asia Bayesian network
        BayesianNetwork asianet = BayesianNetworkLoader.loadFromFile("../networks/dataWeka/asia.bn");
        asianet.randomInitialization(new Random(0));
        if (Main.VERBOSE) System.out.println("\nAsia network \n ");
        //if (Main.VERBOSE) System.out.println(asianet.getDAG().outputString());
        if (Main.VERBOSE) System.out.println(asianet.toString());

        //Sampling from Asia BN
        BayesianNetworkSampler sampler = new BayesianNetworkSampler(asianet);
        sampler.setSeed(0);
        //Load the sampled data
        DataStream<DataInstance> data = sampler.sampleToDataStream(10000);
        sampler.setHiddenVar(asianet.getVariables().getVariableById(7));
        DataStreamWriter.writeDataToFile(data, "../datasets/simulated/tmp.arff");

        DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFile(env, "../datasets/simulated/tmp.arff", false);

        //Structure learning is excluded from the test, i.e., we directly use the initial Asia network structure
        //and only learn and test the parameters

        long start = System.nanoTime();

        //Parameter Learning
        dVMPv1 parallelVB = new dVMPv1();
        parallelVB.setOutput(true);
        parallelVB.setSeed(5);
        parallelVB.setBatchSize(100);
        parallelVB.setLocalThreshold(0.001);
        parallelVB.setGlobalThreshold(0.05);
        parallelVB.setMaximumLocalIterations(100);
        parallelVB.setMaximumGlobalIterations(100);


        parallelVB.setDAG(asianet.getDAG());
        parallelVB.initLearning();
        parallelVB.updateModel(dataFlink);
        BayesianNetwork bnet = parallelVB.getLearntBayesianNetwork();

        if (Main.VERBOSE) System.out.println(bnet.toString());

        long duration = System.nanoTime() - start;
        double seconds = duration / 1000000000.0;
        if (Main.VERBOSE) System.out.println("Running time: \n" + seconds + " secs");

    }
 
Example 18
Source File: ParallelVBTest.java    From toolbox with Apache License 2.0
public void testingMLParallelAsiaHidden() throws IOException, ClassNotFoundException {

        //Set-up Flink session.
        Configuration conf = new Configuration();
        conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
        final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
        env.getConfig().disableSysoutLogging();
        env.setParallelism(Main.PARALLELISM);

        // load the true Asia Bayesian network
        BayesianNetwork asianet = BayesianNetworkLoader.loadFromFile("../networks/dataWeka/asia.bn");
        asianet.randomInitialization(new Random(0));
        if (Main.VERBOSE) System.out.println("\nAsia network \n ");
        //if (Main.VERBOSE) System.out.println(asianet.getDAG().outputString());
        if (Main.VERBOSE) System.out.println(asianet.toString());

        //Sampling from Asia BN
        BayesianNetworkSampler sampler = new BayesianNetworkSampler(asianet);
        sampler.setSeed(0);
        //Load the sampled data
        DataStream<DataInstance> data = sampler.sampleToDataStream(10000);
        sampler.setHiddenVar(asianet.getVariables().getVariableById(7));
        DataStreamWriter.writeDataToFile(data, "../datasets/simulated/tmp.arff");

        DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFile(env, "../datasets/simulated/tmp.arff", false);

        //Structure learning is excluded from the test, i.e., we directly use the initial Asia network structure
        //and only learn and test the parameters

        long start = System.nanoTime();

        //Parameter Learning
        ParallelVB parallelVB = new ParallelVB();
        parallelVB.setOutput(true);
        parallelVB.setSeed(5);
        parallelVB.setBatchSize(100);
        parallelVB.setLocalThreshold(0.001);
        parallelVB.setGlobalThreshold(0.05);
        parallelVB.setMaximumLocalIterations(100);
        parallelVB.setMaximumGlobalIterations(100);


        parallelVB.setDAG(asianet.getDAG());
        parallelVB.initLearning();
        parallelVB.updateModel(dataFlink);
        BayesianNetwork bnet = parallelVB.getLearntBayesianNetwork();

        if (Main.VERBOSE) System.out.println(bnet.toString());

        long duration = System.nanoTime() - start;
        double seconds = duration / 1000000000.0;
        if (Main.VERBOSE) System.out.println("Running time: \n" + seconds + " secs");

    }
 
Example 19
Source File: dVMPv1Test.java    From toolbox with Apache License 2.0
public void testParallelVBExtended() throws Exception {

        int nCVars = 50;//Integer.parseInt(args[0]);
        int nMVars = 50;//Integer.parseInt(args[1]);
        int nSamples = 1000;//Integer.parseInt(args[2]);
        int windowSize = 100;//Integer.parseInt(args[3]);
        int globalIter = 10;//Integer.parseInt(args[4]);
        int localIter = 100;//Integer.parseInt(args[5]);
        int seed = 0;//Integer.parseInt(args[6]);

        //Set-up Flink session.
        Configuration conf = new Configuration();
        conf.setInteger("taskmanager.network.numberOfBuffers", 12000);
        final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
        env.getConfig().disableSysoutLogging();
        env.setParallelism(Main.PARALLELISM);

        /*
         * Logging
         */
        //BasicConfigurator.configure();
        //PropertyConfigurator.configure(args[7]);

        //String fileName = "hdfs:///tmp"+nCVars+"_"+nMVars+"_"+nSamples+"_"+windowSize+"_"+globalIter+"_"+localIter+".arff";
        String fileName = "../datasets/tmp"+nCVars+"_"+nMVars+"_"+nSamples+"_"+windowSize+"_"+globalIter+"_"+localIter+".arff";

        // Randomly generate the data stream using {@link BayesianNetworkGenerator} and {@link BayesianNetworkSampler}.
        BayesianNetworkGenerator.setSeed(seed);
        BayesianNetworkGenerator.setNumberOfGaussianVars(nCVars);
        BayesianNetworkGenerator.setNumberOfMultinomialVars(nMVars, 2);
        BayesianNetwork originalBnet  = BayesianNetworkGenerator.generateBayesianNetwork();

        //Sampling from Asia BN
        eu.amidst.flinklink.core.utils.BayesianNetworkSampler sampler = new eu.amidst.flinklink.core.utils.BayesianNetworkSampler(originalBnet);
        sampler.setSeed(seed);

        //Load the sampled data
        DataFlink<DataInstance> data = sampler.sampleToDataFlink(env,nSamples);

        DataFlinkWriter.writeDataToARFFFolder(data,fileName);

        DataFlink<DataInstance> dataFlink = DataFlinkLoader.loadDataFromFolder(env,fileName, false);

        DAG hiddenNB = getHiddenNaiveBayesStructure(dataFlink.getAttributes());


        //Structure learning is excluded from the test, i.e., we directly use the hidden naive Bayes structure
        //and only learn and test the parameters

        long start = System.nanoTime();

        //Parameter Learning
        dVMPv1 parallelVB = new dVMPv1();
        parallelVB.setGlobalThreshold(0.1);
        parallelVB.setMaximumGlobalIterations(globalIter);
        parallelVB.setLocalThreshold(0.1);
        parallelVB.setMaximumLocalIterations(localIter);
        parallelVB.setSeed(5);

        //Set the window size
        parallelVB.setBatchSize(windowSize);


        parallelVB.setDAG(hiddenNB);
        parallelVB.initLearning();
        parallelVB.updateModel(dataFlink);
        BayesianNetwork LearnedBnet = parallelVB.getLearntBayesianNetwork();
        if (Main.VERBOSE) System.out.println(LearnedBnet.toString());

        long duration = System.nanoTime() - start;
        double seconds = duration / 1000000000.0;
        //logger.info("Global ELBO: {}", parallelVB.getLogMarginalProbability());

    }
 
Example 20
Source File: DistinctTranslationTest.java    From flink with Apache License 2.0 4 votes vote down vote up
@Test
public void translateDistinctKeySelector() {
	try {
		final int parallelism = 8;
		ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(parallelism);

		DataSet<Tuple3<Double, StringValue, LongValue>> initialData = getSourceDataSet(env);

		initialData.distinct(new KeySelector<Tuple3<Double, StringValue, LongValue>, StringValue>() {
			public StringValue getKey(Tuple3<Double, StringValue, LongValue> value) {
				return value.f1;
			}
		}).setParallelism(4).output(new DiscardingOutputFormat<Tuple3<Double, StringValue, LongValue>>());

		Plan p = env.createProgramPlan();

		GenericDataSinkBase<?> sink = p.getDataSinks().iterator().next();

		MapOperatorBase<?, ?, ?> keyRemover = (MapOperatorBase<?, ?, ?>) sink.getInput();
		PlanUnwrappingReduceOperator<?, ?> reducer = (PlanUnwrappingReduceOperator<?, ?>) keyRemover.getInput();
		MapOperatorBase<?, ?, ?> keyExtractor = (MapOperatorBase<?, ?, ?>) reducer.getInput();

		// check the parallelisms
		assertEquals(1, keyExtractor.getParallelism());
		assertEquals(4, reducer.getParallelism());

		// check types
		TypeInformation<?> keyValueInfo = new TupleTypeInfo<Tuple2<StringValue, Tuple3<Double, StringValue, LongValue>>>(
				new ValueTypeInfo<StringValue>(StringValue.class),
				initialData.getType());

		assertEquals(initialData.getType(), keyExtractor.getOperatorInfo().getInputType());
		assertEquals(keyValueInfo, keyExtractor.getOperatorInfo().getOutputType());

		assertEquals(keyValueInfo, reducer.getOperatorInfo().getInputType());
		assertEquals(keyValueInfo, reducer.getOperatorInfo().getOutputType());

		assertEquals(keyValueInfo, keyRemover.getOperatorInfo().getInputType());
		assertEquals(initialData.getType(), keyRemover.getOperatorInfo().getOutputType());

		// check keys
		assertEquals(KeyExtractingMapper.class, keyExtractor.getUserCodeWrapper().getUserCodeClass());

		assertTrue(keyExtractor.getInput() instanceof GenericDataSourceBase<?, ?>);
	}
	catch (Exception e) {
		System.err.println(e.getMessage());
		e.printStackTrace();
		fail("Test caused an error: " + e.getMessage());
	}
}