Java Code Examples for org.apache.flink.api.java.DataSet#output()

The following examples show how to use org.apache.flink.api.java.DataSet#output(). They are taken from open source projects; the originating source file and its license are noted above each example.
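
All of the examples share the same pattern: output(OutputFormat) only registers a data sink on a DataSet, and nothing runs until ExecutionEnvironment#execute() is called. As a quick orientation, here is a minimal, self-contained sketch of that pattern (the class name OutputSketch is a placeholder; this snippet is not taken from any of the projects below):

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.DiscardingOutputFormat;

public class OutputSketch {

	public static void main(String[] args) throws Exception {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		DataSet<Long> data = env.generateSequence(0, 100);

		// output() adds a sink to the program plan; the job is only run by env.execute()
		data.output(new DiscardingOutputFormat<Long>());

		env.execute("DataSet#output() sketch");
	}
}
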
Example 1
Source File: ReduceOnNeighborsWithExceptionITCase.java    From flink with Apache License 2.0
/**
 * Test reduceOnNeighbors() with an edge having a srcId that does not exist
 * in the vertex DataSet.
 */
@Test
public void testGroupReduceOnNeighborsInvalidEdgeSrcId() throws Exception {

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(PARALLELISM);
	env.getConfig().disableSysoutLogging();

	Graph<Long, Long, Long> graph = Graph.fromDataSet(TestGraphUtils.getLongLongVertexData(env),
			TestGraphUtils.getLongLongEdgeInvalidSrcData(env), env);

	try {
		DataSet<Tuple2<Long, Long>> verticesWithSumOfAllNeighborValues =
				graph.reduceOnNeighbors(new SumNeighbors(), EdgeDirection.ALL);

		verticesWithSumOfAllNeighborValues.output(new DiscardingOutputFormat<>());
		env.execute();

		fail("Expected an exception.");
	} catch (Exception e) {
		// We expect the job to fail with an exception
	}
}
 
Example 2
Source File: ReduceOnEdgesWithExceptionITCase.java    From Flink-CEPplus with Apache License 2.0
/**
 * Test groupReduceOnEdges() with an edge having a srcId that does not exist in the vertex DataSet.
 */
@Test
public void testGroupReduceOnEdgesInvalidEdgeSrcId() throws Exception {

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(PARALLELISM);
	env.getConfig().disableSysoutLogging();

	Graph<Long, Long, Long> graph = Graph.fromDataSet(TestGraphUtils.getLongLongVertexData(env),
			TestGraphUtils.getLongLongEdgeInvalidSrcData(env), env);

	try {
		DataSet<Tuple2<Long, Long>> verticesWithAllNeighbors =
				graph.groupReduceOnEdges(new SelectNeighborsValueGreaterThanFour(), EdgeDirection.ALL);

		verticesWithAllNeighbors.output(new DiscardingOutputFormat<>());
		env.execute();

		fail("Expected an exception.");
	} catch (Exception e) {
		// We expect the job to fail with an exception
	}
}
 
Example 3
Source File: ReduceOnEdgesWithExceptionITCase.java    From flink with Apache License 2.0
/**
 * Test groupReduceOnEdges() with an edge having a trgId that does not exist in the vertex DataSet.
 */
@Test
public void testGroupReduceOnEdgesInvalidEdgeTrgId() throws Exception {

	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(PARALLELISM);
	env.getConfig().disableSysoutLogging();

	Graph<Long, Long, Long> graph = Graph.fromDataSet(TestGraphUtils.getLongLongVertexData(env),
			TestGraphUtils.getLongLongEdgeInvalidTrgData(env), env);

	try {
		DataSet<Tuple2<Long, Long>> verticesWithAllNeighbors =
				graph.groupReduceOnEdges(new SelectNeighborsValueGreaterThanFour(), EdgeDirection.ALL);

		verticesWithAllNeighbors.output(new DiscardingOutputFormat<>());
		env.execute();

		fail("Expected an exception.");
	} catch (Exception e) {
		// We expect the job to fail with an exception
	}
}
 
Example 4
Source File: BranchingPlansCompilerTest.java    From flink with Apache License 2.0
@Test
public void testBranchAfterIteration() {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);
	DataSet<Long> sourceA = env.generateSequence(0,1);

	IterativeDataSet<Long> loopHead = sourceA.iterate(10);
	DataSet<Long> loopTail = loopHead.map(new IdentityMapper<Long>()).name("Mapper");
	DataSet<Long> loopRes = loopHead.closeWith(loopTail);

	loopRes.output(new DiscardingOutputFormat<Long>());
	loopRes.map(new IdentityMapper<Long>())
			.output(new DiscardingOutputFormat<Long>());

	Plan plan = env.createProgramPlan();

	try {
		compileNoStats(plan);
	}
	catch (Exception e) {
		e.printStackTrace();
		Assert.fail(e.getMessage());
	}
}
 
Example 5
Source File: MultipleJoinsWithSolutionSetCompilerTest.java    From flink with Apache License 2.0
@Test
public void testMultiSolutionSetJoinPlan() {
	try {

		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		@SuppressWarnings("unchecked")
		DataSet<Tuple2<Long, Double>> inputData = env.fromElements(new Tuple2<Long, Double>(1L, 1.0));
		DataSet<Tuple2<Long, Double>> result = constructPlan(inputData, 10);

		// add two sinks, to test the case of branching after an iteration
		result.output(new DiscardingOutputFormat<Tuple2<Long, Double>>());
		result.output(new DiscardingOutputFormat<Tuple2<Long, Double>>());

		Plan p = env.createProgramPlan();

		OptimizedPlan optPlan = compileNoStats(p);

		OptimizerPlanNodeResolver or = getOptimizerPlanNodeResolver(optPlan);

		DualInputPlanNode join1 = or.getNode(JOIN_1);
		DualInputPlanNode join2 = or.getNode(JOIN_2);

		assertEquals(DriverStrategy.HYBRIDHASH_BUILD_FIRST, join1.getDriverStrategy());
		assertEquals(DriverStrategy.HYBRIDHASH_BUILD_SECOND, join2.getDriverStrategy());

		assertEquals(ShipStrategyType.PARTITION_HASH, join1.getInput2().getShipStrategy());
		assertEquals(ShipStrategyType.PARTITION_HASH, join2.getInput1().getShipStrategy());

		assertEquals(SolutionSetPlanNode.class, join1.getInput1().getSource().getClass());
		assertEquals(SolutionSetPlanNode.class, join2.getInput2().getSource().getClass());

		new JobGraphGenerator().compileJobGraph(optPlan);
	}
	catch (Exception e) {
		System.err.println(e.getMessage());
		e.printStackTrace();
		fail("Test erroneous: " + e.getMessage());
	}
}
 
Example 6
Source File: BranchingPlansCompilerTest.java    From flink with Apache License 2.0
/**
 * <pre>
 *       (SRC A)         (SRC B)          (SRC C)
 *      /       \       /                /       \
 *  (SINK 1) (DELTA ITERATION)          |     (SINK 2)
 *             /    |   \               /
 *         (SINK 3) |   (CROSS => NEXT WORKSET)
 *                  |             |
 *                (JOIN => SOLUTION SET DELTA)
 * </pre>
 */
@Test
public void testClosureDeltaIteration() {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);
	DataSet<Tuple2<Long, Long>> sourceA = env.generateSequence(0,1).map(new Duplicator<Long>());
	DataSet<Tuple2<Long, Long>> sourceB = env.generateSequence(0,1).map(new Duplicator<Long>());
	DataSet<Tuple2<Long, Long>> sourceC = env.generateSequence(0,1).map(new Duplicator<Long>());

	sourceA.output(new DiscardingOutputFormat<Tuple2<Long,Long>>());
	sourceC.output(new DiscardingOutputFormat<Tuple2<Long,Long>>());

	DeltaIteration<Tuple2<Long, Long>, Tuple2<Long, Long>> loop = sourceA.iterateDelta(sourceB, 10, 0);

	DataSet<Tuple2<Long, Long>> workset = loop.getWorkset().cross(sourceB).with(new IdentityCrosser<Tuple2<Long, Long>>()).name("Next work set");
	DataSet<Tuple2<Long, Long>> delta = workset.join(loop.getSolutionSet()).where(0).equalTo(0).with(new IdentityJoiner<Tuple2<Long, Long>>()).name("Solution set delta");

	DataSet<Tuple2<Long, Long>> result = loop.closeWith(delta, workset);
	result.output(new DiscardingOutputFormat<Tuple2<Long,Long>>());

	Plan plan = env.createProgramPlan();

	try{
		compileNoStats(plan);
	}catch(Exception e){
		e.printStackTrace();
		Assert.fail(e.getMessage());
	}
}
 
Example 7
Source File: SuccessAfterNetworkBuffersFailureITCase.java    From flink with Apache License 2.0
private static void runKMeans(ExecutionEnvironment env) throws Exception {

		env.setParallelism(PARALLELISM);
		env.getConfig().disableSysoutLogging();

		// get input data
		DataSet<KMeans.Point> points =  KMeansData.getDefaultPointDataSet(env).rebalance();
		DataSet<KMeans.Centroid> centroids =  KMeansData.getDefaultCentroidDataSet(env).rebalance();

		// set number of bulk iterations for KMeans algorithm
		IterativeDataSet<KMeans.Centroid> loop = centroids.iterate(20);

		// add some re-partitions to increase network buffer use
		DataSet<KMeans.Centroid> newCentroids = points
				// compute closest centroid for each point
				.map(new KMeans.SelectNearestCenter()).withBroadcastSet(loop, "centroids")
				.rebalance()
				// count and sum point coordinates for each centroid
				.map(new KMeans.CountAppender())
				.groupBy(0).reduce(new KMeans.CentroidAccumulator())
				// compute new centroids from point counts and coordinate sums
				.rebalance()
				.map(new KMeans.CentroidAverager());

		// feed new centroids back into next iteration
		DataSet<KMeans.Centroid> finalCentroids = loop.closeWith(newCentroids);

		DataSet<Tuple2<Integer, KMeans.Point>> clusteredPoints = points
				// assign points to final clusters
				.map(new KMeans.SelectNearestCenter()).withBroadcastSet(finalCentroids, "centroids");

		clusteredPoints.output(new DiscardingOutputFormat<Tuple2<Integer, KMeans.Point>>());

		env.execute("KMeans Example");
	}
 
Example 8
Source File: LargePlanTest.java    From flink with Apache License 2.0
private static void runProgram(ExecutionEnvironment env, int depth, int width) throws Exception {
	DataSet<String> input = env.fromElements("a", "b", "c");
	DataSet<String> stats = null;

	for (int i = 0; i < depth; i++) {
		stats = analyze(input, stats, width / (i + 1) + 1);
	}

	stats.output(new DiscardingOutputFormat<>());
	env.execute("depth " + depth + " width " + width);
}
 
Example 9
Source File: IterationIncompleteDynamicPathConsumptionITCase.java    From Flink-CEPplus with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	// the test data is constructed such that the merge join zig zag
	// has an early out, leaving elements on the dynamic path input unconsumed

	DataSet<Path> edges = env.fromElements(
			new Path(1, 2),
			new Path(1, 4),
			new Path(3, 6),
			new Path(3, 8),
			new Path(1, 10),
			new Path(1, 12),
			new Path(3, 14),
			new Path(3, 16),
			new Path(1, 18),
			new Path(1, 20));

	IterativeDataSet<Path> currentPaths = edges.iterate(10);

	DataSet<Path> newPaths = currentPaths
			.join(edges, JoinHint.REPARTITION_SORT_MERGE).where("to").equalTo("from")
				.with(new PathConnector())
			.union(currentPaths).distinct("from", "to");

	DataSet<Path> result = currentPaths.closeWith(newPaths);

	result.output(new DiscardingOutputFormat<Path>());

	env.execute();
}
 
Example 10
Source File: IterationIncompleteStaticPathConsumptionITCase.java    From flink with Apache License 2.0
@Override
protected void testProgram() throws Exception {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	// the test data is constructed such that the merge join zig zag
	// has an early out, leaving elements on the static path input unconsumed

	DataSet<Path> edges = env.fromElements(
			new Path(2, 1),
			new Path(4, 1),
			new Path(6, 3),
			new Path(8, 3),
			new Path(10, 1),
			new Path(12, 1),
			new Path(14, 3),
			new Path(16, 3),
			new Path(18, 1),
			new Path(20, 1));

	IterativeDataSet<Path> currentPaths = edges.iterate(10);

	DataSet<Path> newPaths = currentPaths
			.join(edges, JoinHint.REPARTITION_SORT_MERGE).where("to").equalTo("from")
				.with(new PathConnector())
			.union(currentPaths).distinct("from", "to");

	DataSet<Path> result = currentPaths.closeWith(newPaths);

	result.output(new DiscardingOutputFormat<Path>());

	env.execute();
}
 
Example 11
Source File: PregelCompilerTest.java    From flink with Apache License 2.0
@SuppressWarnings("serial")
@Test
public void testPregelCompiler() {
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);
	// compose test program
	{

		DataSet<Vertex<Long, Long>> initialVertices = env.fromElements(
			new Tuple2<>(1L, 1L), new Tuple2<>(2L, 2L))
			.map(new Tuple2ToVertexMap<>());

		DataSet<Edge<Long, NullValue>> edges = env.fromElements(new Tuple2<>(1L, 2L))
			.map(new MapFunction<Tuple2<Long, Long>, Edge<Long, NullValue>>() {

				public Edge<Long, NullValue> map(Tuple2<Long, Long> edge) {
					return new Edge<>(edge.f0, edge.f1, NullValue.getInstance());
				}
			});

		Graph<Long, Long, NullValue> graph = Graph.fromDataSet(initialVertices, edges, env);

		DataSet<Vertex<Long, Long>> result = graph.runVertexCentricIteration(
			new CCCompute(), null, 100).getVertices();

		result.output(new DiscardingOutputFormat<>());
	}

	Plan p = env.createProgramPlan("Pregel Connected Components");
	OptimizedPlan op = compileNoStats(p);

	// check the sink
	SinkPlanNode sink = op.getDataSinks().iterator().next();
	assertEquals(ShipStrategyType.FORWARD, sink.getInput().getShipStrategy());
	assertEquals(DEFAULT_PARALLELISM, sink.getParallelism());

	// check the iteration
	WorksetIterationPlanNode iteration = (WorksetIterationPlanNode) sink.getInput().getSource();
	assertEquals(DEFAULT_PARALLELISM, iteration.getParallelism());

	// check the solution set delta
	PlanNode ssDelta = iteration.getSolutionSetDeltaPlanNode();
	assertTrue(ssDelta instanceof SingleInputPlanNode);

	SingleInputPlanNode ssFlatMap = (SingleInputPlanNode) ((SingleInputPlanNode) (ssDelta)).getInput().getSource();
	assertEquals(DEFAULT_PARALLELISM, ssFlatMap.getParallelism());
	assertEquals(ShipStrategyType.FORWARD, ssFlatMap.getInput().getShipStrategy());

	// check the computation coGroup
	DualInputPlanNode computationCoGroup = (DualInputPlanNode) (ssFlatMap.getInput().getSource());
	assertEquals(DEFAULT_PARALLELISM, computationCoGroup.getParallelism());
	assertEquals(ShipStrategyType.FORWARD, computationCoGroup.getInput1().getShipStrategy());
	assertEquals(ShipStrategyType.PARTITION_HASH, computationCoGroup.getInput2().getShipStrategy());
	assertTrue(computationCoGroup.getInput2().getTempMode().isCached());

	assertEquals(new FieldList(0), computationCoGroup.getInput2().getShipStrategyKeys());

	// check that the initial partitioning is pushed out of the loop
	assertEquals(ShipStrategyType.PARTITION_HASH, iteration.getInput1().getShipStrategy());
	assertEquals(new FieldList(0), iteration.getInput1().getShipStrategyKeys());
}
 
Example 12
Source File: CachedMatchStrategyCompilerTest.java    From Flink-CEPplus with Apache License 2.0
private Plan getTestPlanLeftStatic(String strategy) {
	
	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);
	
	@SuppressWarnings("unchecked")
	DataSet<Tuple3<Long, Long, Long>> bigInput = env.fromElements(new Tuple3<Long, Long, Long>(1L, 2L, 3L),
			new Tuple3<Long, Long, Long>(1L, 2L, 3L),new Tuple3<Long, Long, Long>(1L, 2L, 3L)).name("Big");
	
	@SuppressWarnings("unchecked")
	DataSet<Tuple3<Long, Long, Long>> smallInput = env.fromElements(new Tuple3<Long, Long, Long>(1L, 2L, 3L)).name("Small");
	
	IterativeDataSet<Tuple3<Long, Long, Long>> iteration = bigInput.iterate(10);
	
	Configuration joinStrategy = new Configuration();
	joinStrategy.setString(Optimizer.HINT_LOCAL_STRATEGY, strategy);
	
	DataSet<Tuple3<Long, Long, Long>> inner = smallInput.join(iteration).where(0).equalTo(0).with(new DummyJoiner()).name("DummyJoiner").withParameters(joinStrategy);

	DataSet<Tuple3<Long, Long, Long>> output = iteration.closeWith(inner);
	
	output.output(new DiscardingOutputFormat<Tuple3<Long,Long,Long>>());
	
	return env.createProgramPlan();
	
}
 
Example 13
Source File: SpargelCompilerTest.java    From flink with Apache License 2.0
@SuppressWarnings("serial")
@Test
public void testSpargelCompilerWithBroadcastVariable() {
	final String broadcastVariableName = "broadcast variable";

	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	// compose test program

	DataSet<Long> bcVar = env.fromElements(1L);

	DataSet<Vertex<Long, Long>> initialVertices = env.fromElements(
		new Tuple2<>(1L, 1L), new Tuple2<>(2L, 2L))
		.map(new Tuple2ToVertexMap<>());

	DataSet<Edge<Long, NullValue>> edges = env.fromElements(new Tuple2<>(1L, 2L))
		.map(new MapFunction<Tuple2<Long, Long>, Edge<Long, NullValue>>() {

			public Edge<Long, NullValue> map(Tuple2<Long, Long> edge) {
				return new Edge<>(edge.f0, edge.f1, NullValue.getInstance());
			}
		});

	Graph<Long, Long, NullValue> graph = Graph.fromDataSet(initialVertices, edges, env);

	ScatterGatherConfiguration parameters = new ScatterGatherConfiguration();
	parameters.addBroadcastSetForScatterFunction(broadcastVariableName, bcVar);
	parameters.addBroadcastSetForGatherFunction(broadcastVariableName, bcVar);

	DataSet<Vertex<Long, Long>> result = graph.runScatterGatherIteration(
		new ConnectedComponents.CCMessenger<>(BasicTypeInfo.LONG_TYPE_INFO),
		new ConnectedComponents.CCUpdater<>(), 100)
		.getVertices();

	result.output(new DiscardingOutputFormat<>());

	Plan p = env.createProgramPlan("Spargel Connected Components");
	OptimizedPlan op = compileNoStats(p);

	// check the sink
	SinkPlanNode sink = op.getDataSinks().iterator().next();
	assertEquals(ShipStrategyType.FORWARD, sink.getInput().getShipStrategy());
	assertEquals(DEFAULT_PARALLELISM, sink.getParallelism());

	// check the iteration
	WorksetIterationPlanNode iteration = (WorksetIterationPlanNode) sink.getInput().getSource();
	assertEquals(DEFAULT_PARALLELISM, iteration.getParallelism());

	// check the solution set join and the delta
	PlanNode ssDelta = iteration.getSolutionSetDeltaPlanNode();
	assertTrue(ssDelta instanceof DualInputPlanNode); // this is only true if the update functions preserves the partitioning

	DualInputPlanNode ssJoin = (DualInputPlanNode) ssDelta;
	assertEquals(DEFAULT_PARALLELISM, ssJoin.getParallelism());
	assertEquals(ShipStrategyType.PARTITION_HASH, ssJoin.getInput1().getShipStrategy());
	assertEquals(new FieldList(0), ssJoin.getInput1().getShipStrategyKeys());

	// check the workset set join
	DualInputPlanNode edgeJoin = (DualInputPlanNode) ssJoin.getInput1().getSource();
	assertEquals(DEFAULT_PARALLELISM, edgeJoin.getParallelism());
	assertEquals(ShipStrategyType.PARTITION_HASH, edgeJoin.getInput1().getShipStrategy());
	assertEquals(ShipStrategyType.FORWARD, edgeJoin.getInput2().getShipStrategy());
	assertTrue(edgeJoin.getInput1().getTempMode().isCached());

	assertEquals(new FieldList(0), edgeJoin.getInput1().getShipStrategyKeys());

	// check that the initial partitioning is pushed out of the loop
	assertEquals(ShipStrategyType.PARTITION_HASH, iteration.getInput1().getShipStrategy());
	assertEquals(ShipStrategyType.PARTITION_HASH, iteration.getInput2().getShipStrategy());
	assertEquals(new FieldList(0), iteration.getInput1().getShipStrategyKeys());
	assertEquals(new FieldList(0), iteration.getInput2().getShipStrategyKeys());
}
 
Example 14
Source File: FlinkFactDistinctColumns.java    From kylin with Apache License 2.0
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);
    int samplingPercent = Integer.parseInt(optionsHelper.getOptionValue(OPTION_STATS_SAMPLING_PERCENT));
    String enableObjectReuseOptValue = optionsHelper.getOptionValue(OPTION_ENABLE_OBJECT_REUSE);

    Job job = Job.getInstance();
    FileSystem fs = HadoopUtil.getWorkingFileSystem(job.getConfiguration());
    HadoopUtil.deletePath(job.getConfiguration(), new Path(outputPath));

    final SerializableConfiguration sConf = new SerializableConfiguration(job.getConfiguration());
    KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

    final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);

    final FactDistinctColumnsReducerMapping reducerMapping = new FactDistinctColumnsReducerMapping(cubeInstance);
    final int totalReducer = reducerMapping.getTotalReducerNum();

    logger.info("getTotalReducerNum: {}", totalReducer);
    logger.info("getCuboidRowCounterReducerNum: {}", reducerMapping.getCuboidRowCounterReducerNum());
    logger.info("counter path {}", counterPath);

    boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE.equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

    // calculate source record bytes size
    final String bytesWrittenName = "byte-writer-counter";
    final String recordCounterName = "record-counter";

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    if (!StringUtil.isEmpty(enableObjectReuseOptValue) &&
            enableObjectReuseOptValue.equalsIgnoreCase("true")) {
        env.getConfig().enableObjectReuse();
    }

    DataSet<String[]> recordDataSet = FlinkUtil.readHiveRecords(isSequenceFile, env, inputPath, hiveTable, job);

    // read record from flat table
    // output:
    //   1, statistic
    //   2, field value of dict col
    //   3, min/max field value of not dict col
    DataSet<Tuple2<SelfDefineSortableKey, Text>> flatOutputDataSet = recordDataSet.mapPartition(
            new FlatOutputMapPartitionFunction(sConf, cubeName, segmentId, metaUrl, samplingPercent,
                    bytesWrittenName, recordCounterName));

    // repartition data, make each reducer handle only one col data or the statistic data
    DataSet<Tuple2<SelfDefineSortableKey, Text>> partitionDataSet = flatOutputDataSet
            .partitionCustom(new FactDistinctColumnPartitioner(cubeName, metaUrl, sConf), 0)
            .setParallelism(totalReducer);

    // multiple output result
    // 1, CFG_OUTPUT_COLUMN: field values of dict col, which will not be built in reducer, like globalDictCol
    // 2, CFG_OUTPUT_DICT: dictionary object built in reducer
    // 3, CFG_OUTPUT_STATISTICS: cube statistic: hll of cuboids ...
    // 4, CFG_OUTPUT_PARTITION: dimension value range(min,max)
    DataSet<Tuple2<String, Tuple3<Writable, Writable, String>>> outputDataSet = partitionDataSet
            .mapPartition(new MultiOutputMapPartitionFunction(sConf, cubeName, segmentId, metaUrl, samplingPercent))
            .setParallelism(totalReducer);

    // make each reducer output to respective dir
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_COLUMN, SequenceFileOutputFormat.class,
            NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
            NullWritable.class, ArrayPrimitiveWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_STATISTICS, SequenceFileOutputFormat.class,
            LongWritable.class, BytesWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_PARTITION, TextOutputFormat.class,
            NullWritable.class, LongWritable.class);

    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    FileOutputFormat.setCompressOutput(job, false);

    // prevent to create zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    outputDataSet.output(new HadoopMultipleOutputFormat(new LazyOutputFormat(), job));

    JobExecutionResult jobExecutionResult =
            env.execute("Fact distinct columns for:" + cubeName + " segment " + segmentId);
    Map<String, Object> accumulatorResults = jobExecutionResult.getAllAccumulatorResults();
    Long recordCount = (Long) accumulatorResults.get(recordCounterName);
    Long bytesWritten = (Long) accumulatorResults.get(bytesWrittenName);
    logger.info("Map input records={}", recordCount);
    logger.info("HDFS Read: {} HDFS Write", bytesWritten);
    logger.info("HDFS: Number of bytes written=" + FlinkBatchCubingJobBuilder2.getFileSize(outputPath, fs));

    Map<String, String> counterMap = Maps.newHashMap();
    counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(recordCount));
    counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten));

    // save counter to hdfs
    HadoopUtil.writeToSequenceFile(job.getConfiguration(), counterPath, counterMap);
}
 
Example 15
Source File: IterationsCompilerTest.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testTwoIterationsWithMapperInbetween() throws Exception {
	try {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(8);
		
		DataSet<Tuple2<Long, Long>> verticesWithInitialId = env.fromElements(new Tuple2<Long, Long>(1L, 2L));
		
		DataSet<Tuple2<Long, Long>> edges = env.fromElements(new Tuple2<Long, Long>(1L, 2L));
		
		DataSet<Tuple2<Long, Long>> bulkResult = doBulkIteration(verticesWithInitialId, edges);
		
		DataSet<Tuple2<Long, Long>> mappedBulk = bulkResult.map(new DummyMap());
		
		DataSet<Tuple2<Long, Long>> depResult = doDeltaIteration(mappedBulk, edges);
		
		depResult.output(new DiscardingOutputFormat<Tuple2<Long,Long>>());
		
		Plan p = env.createProgramPlan();
		OptimizedPlan op = compileNoStats(p);
		
		assertEquals(1, op.getDataSinks().size());
		assertTrue(op.getDataSinks().iterator().next().getInput().getSource() instanceof WorksetIterationPlanNode);
		
		WorksetIterationPlanNode wipn = (WorksetIterationPlanNode) op.getDataSinks().iterator().next().getInput().getSource();
		
		assertEquals(ShipStrategyType.PARTITION_HASH, wipn.getInput1().getShipStrategy());
		
		assertEquals(TempMode.NONE, wipn.getInput1().getTempMode());
		assertEquals(TempMode.NONE, wipn.getInput2().getTempMode());

		assertEquals(DataExchangeMode.BATCH, wipn.getInput1().getDataExchangeMode());
		assertEquals(DataExchangeMode.BATCH, wipn.getInput2().getDataExchangeMode());
		
		new JobGraphGenerator().compileJobGraph(op);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}
 
Example 16
Source File: JDBCAppendTableSink.java    From flink with Apache License 2.0
@Override
public void emitDataSet(DataSet<Row> dataSet) {
	dataSet.output(outputFormat);
}
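
For context, emitDataSet() above is normally invoked by the Table API planner rather than by user code, and the sink's JDBC output format is configured through the sink's builder. The following is a hedged sketch of that wiring, assuming the legacy flink-jdbc JDBCAppendTableSink builder API; the driver, URL, table, sample row, and sketch class name are illustrative placeholders, not part of the Flink source above.

import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.jdbc.JDBCAppendTableSink;
import org.apache.flink.types.Row;

public class JdbcSinkSketch {

	public static void main(String[] args) throws Exception {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

		// a tiny DataSet<Row> whose fields match the INSERT statement below (placeholder data)
		DataSet<Row> rows = env.fromElements(Row.of(1, "some title"));

		// assumed legacy flink-jdbc builder API; driver, URL and query are placeholders
		JDBCAppendTableSink sink = JDBCAppendTableSink.builder()
			.setDrivername("org.apache.derby.jdbc.EmbeddedDriver")
			.setDBUrl("jdbc:derby:memory:example;create=true")
			.setQuery("INSERT INTO books (id, title) VALUES (?, ?)")
			.setParameterTypes(Types.INT, Types.STRING)
			.build();

		// emitDataSet(...) boils down to rows.output(outputFormat), i.e. DataSet#output()
		sink.emitDataSet(rows);

		env.execute("JDBCAppendTableSink sketch");
	}
}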
 
Example 17
Source File: IterationsCompilerTest.java    From flink with Apache License 2.0
@Test
public void testTwoIterationsDirectlyChained() throws Exception {
	try {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(8);
		
		DataSet<Tuple2<Long, Long>> verticesWithInitialId = env.fromElements(new Tuple2<Long, Long>(1L, 2L));
		
		DataSet<Tuple2<Long, Long>> edges = env.fromElements(new Tuple2<Long, Long>(1L, 2L));
		
		DataSet<Tuple2<Long, Long>> bulkResult = doBulkIteration(verticesWithInitialId, edges);
		
		DataSet<Tuple2<Long, Long>> depResult = doDeltaIteration(bulkResult, edges);
		
		depResult.output(new DiscardingOutputFormat<Tuple2<Long, Long>>());
		
		Plan p = env.createProgramPlan();
		OptimizedPlan op = compileNoStats(p);
		
		assertEquals(1, op.getDataSinks().size());
		assertTrue(op.getDataSinks().iterator().next().getInput().getSource() instanceof WorksetIterationPlanNode);
		
		WorksetIterationPlanNode wipn = (WorksetIterationPlanNode) op.getDataSinks().iterator().next().getInput().getSource();
		BulkIterationPlanNode bipn = (BulkIterationPlanNode)wipn.getInput1().getSource();

		// the hash partitioning has been pushed out of the delta iteration into the bulk iteration
		assertEquals(ShipStrategyType.FORWARD, wipn.getInput1().getShipStrategy());

		// the input of the root step function is the last operator of the step function
		// since the work has been pushed out of the bulk iteration, it has to guarantee the hash partitioning
		for (Channel c : bipn.getRootOfStepFunction().getInputs()) {
			assertEquals(ShipStrategyType.PARTITION_HASH, c.getShipStrategy());
		}

		assertEquals(DataExchangeMode.BATCH, wipn.getInput1().getDataExchangeMode());
		assertEquals(DataExchangeMode.BATCH, wipn.getInput2().getDataExchangeMode());
		
		assertEquals(TempMode.NONE, wipn.getInput1().getTempMode());
		assertEquals(TempMode.NONE, wipn.getInput2().getTempMode());
		
		new JobGraphGenerator().compileJobGraph(op);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}
 
Example 18
Source File: JdbcFullTest.java    From flink with Apache License 2.0
private void runTest(boolean exploitParallelism) throws Exception {
	ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
	JdbcInputFormat.JdbcInputFormatBuilder inputBuilder = JdbcInputFormat.buildJdbcInputFormat()
			.setDrivername(getDbMetadata().getDriverClass())
			.setDBUrl(getDbMetadata().getUrl())
			.setQuery(SELECT_ALL_BOOKS)
			.setRowTypeInfo(ROW_TYPE_INFO);

	if (exploitParallelism) {
		final int fetchSize = 1;
		final long min = TEST_DATA[0].id;
		final long max = TEST_DATA[TEST_DATA.length - fetchSize].id;
		//use a "splittable" query to exploit parallelism
		inputBuilder = inputBuilder
				.setQuery(SELECT_ALL_BOOKS_SPLIT_BY_ID)
				.setParametersProvider(new JdbcNumericBetweenParametersProvider(min, max).ofBatchSize(fetchSize));
	}
	DataSet<Row> source = environment.createInput(inputBuilder.finish());

	//NOTE: in this case (with Derby driver) setSqlTypes could be skipped, but
	//some databases don't handle null values correctly when no column type was specified
	//in PreparedStatement.setObject (see its javadoc for more details)
	JdbcConnectionOptions connectionOptions = new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
		.withUrl(getDbMetadata().getUrl())
		.withDriverName(getDbMetadata().getDriverClass())
		.build();

	JdbcBatchingOutputFormat jdbcOutputFormat = new JdbcBatchingOutputFormat<>(
		new SimpleJdbcConnectionProvider(connectionOptions),
		JdbcExecutionOptions.defaults(),
		ctx -> createSimpleRowExecutor(
			String.format(INSERT_TEMPLATE, OUTPUT_TABLE),
			new int[]{Types.INTEGER, Types.VARCHAR, Types.VARCHAR, Types.DOUBLE, Types.INTEGER},
			ctx.getExecutionConfig().isObjectReuseEnabled()),
		JdbcBatchingOutputFormat.RecordExtractor.identity()
	);

	source.output(jdbcOutputFormat);
	environment.execute();

	try (
		Connection dbConn = DriverManager.getConnection(getDbMetadata().getUrl());
		PreparedStatement statement = dbConn.prepareStatement(SELECT_ALL_NEWBOOKS);
		ResultSet resultSet = statement.executeQuery()
	) {
		int count = 0;
		while (resultSet.next()) {
			count++;
		}
		Assert.assertEquals(TEST_DATA.length, count);
	}
}