Java Code Examples for org.apache.flink.api.java.DataSet#union()

The following examples show how to use org.apache.flink.api.java.DataSet#union(). You can vote up the examples you like or vote down the ones you don't, and you can go to the original project or source file by following the links above each example. Related API usage is listed in the sidebar.

Example 1

Source File: UnionITCase.java From Flink-CEPplus with Apache License 2.0

6 votes

/**
 * Unions the 3-tuple data set with a second instance of itself and compares the collected
 * result against {@code FULL_TUPLE_3_STRING} repeated twice.
 */
@Test
public void testUnion2IdenticalDataSets() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<Tuple3<Integer, Long, String>> first = CollectionDataSets.get3TupleDataSet(env);
	DataSet<Tuple3<Integer, Long, String>> second = CollectionDataSets.get3TupleDataSet(env);

	List<Tuple3<Integer, Long, String>> result = first.union(second).collect();

	compareResultAsTuples(result, FULL_TUPLE_3_STRING + FULL_TUPLE_3_STRING);
}

Example 2

Source File: ReduceITCase.java From flink with Apache License 2.0

6 votes

/**
 * Test support for Date and enum serialization: a mapped sequence is unioned with the
 * POJO-with-date-and-enum collection, grouped by the "group" field and reduced per group.
 */
@Test
public void testSupportForDataAndEnumSerialization() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<PojoWithDateAndEnum> generated = env.generateSequence(0, 2).map(new Mapper1());
	DataSet<PojoWithDateAndEnum> combined = generated.union(CollectionDataSets.getPojoWithDateAndEnum(env));

	DataSet<String> reduced = combined.groupBy("group").reduceGroup(new GroupReducer1());

	compareResultAsText(reduced.collect(), "ok\nok");
}

Example 3

Source File: UnionReplacementTest.java From flink with Apache License 2.0

6 votes

/**
 * Verifies that a program in which a single union feeds two sinks can be compiled into an
 * optimized plan and then translated into a job graph without throwing.
 */
@Test
public void testUnionReplacement() {
	try {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
		DataSet<String> input1 = env.fromElements("test1");
		DataSet<String> input2 = env.fromElements("test2");

		DataSet<String> union = input1.union(input2);

		// the same union is consumed by two independent sinks
		union.output(new DiscardingOutputFormat<String>());
		union.output(new DiscardingOutputFormat<String>());

		Plan plan = env.createProgramPlan();
		OptimizedPlan oPlan = compileNoStats(plan);
		JobGraphGenerator jobGen = new JobGraphGenerator();
		jobGen.compileJobGraph(oPlan);
	}
	catch (Exception e) {
		// Attach the original exception as the cause so the test report keeps its type and stack
		// trace; fail(e.getMessage()) kept only the (possibly null) message.
		throw new AssertionError("Compiling the union replacement program failed: " + e, e);
	}
}

Example 4

Source File: UnionReplacementTest.java From flink with Apache License 2.0

6 votes

/**
 * Compiles a program whose union result is written to two sinks, first into an optimized plan
 * and then into a job graph; any exception during these steps fails the test.
 */
@Test
public void testUnionReplacement() {
	try {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
		DataSet<String> input1 = env.fromElements("test1");
		DataSet<String> input2 = env.fromElements("test2");

		DataSet<String> union = input1.union(input2);

		// two sinks reading from one union
		union.output(new DiscardingOutputFormat<String>());
		union.output(new DiscardingOutputFormat<String>());

		Plan plan = env.createProgramPlan();
		OptimizedPlan oPlan = compileNoStats(plan);
		JobGraphGenerator jobGen = new JobGraphGenerator();
		jobGen.compileJobGraph(oPlan);
	}
	catch (Exception e) {
		// Propagate the failure with its cause instead of printing the stack trace and failing
		// with only e.getMessage(), which may be null and hides where the error originated.
		throw new AssertionError("Union replacement test failed during plan compilation: " + e, e);
	}
}

Example 5

Source File: UnionITCase.java From flink with Apache License 2.0

6 votes

/**
 * Union of 2 same data sets: the collected union is compared against
 * {@code FULL_TUPLE_3_STRING} concatenated with itself.
 */
@Test
public void testUnion2IdenticalDataSets() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	List<Tuple3<Integer, Long, String>> result = CollectionDataSets.get3TupleDataSet(env)
		.union(CollectionDataSets.get3TupleDataSet(env))
		.collect();

	String expected = FULL_TUPLE_3_STRING + FULL_TUPLE_3_STRING;
	compareResultAsTuples(result, expected);
}

Example 6

Source File: UnionITCase.java From flink with Apache License 2.0

6 votes

/**
 * Checks the union of the 3-tuple data set with a second, identically created data set.
 * The expected text is {@code FULL_TUPLE_3_STRING} appended to itself.
 */
@Test
public void testUnion2IdenticalDataSets() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	final String expected = FULL_TUPLE_3_STRING + FULL_TUPLE_3_STRING;

	DataSet<Tuple3<Integer, Long, String>> original = CollectionDataSets.get3TupleDataSet(env);
	DataSet<Tuple3<Integer, Long, String>> doubled =
		original.union(CollectionDataSets.get3TupleDataSet(env));

	List<Tuple3<Integer, Long, String>> collected = doubled.collect();
	compareResultAsTuples(collected, expected);
}

Example 7

Source File: ReduceITCase.java From flink with Apache License 2.0

6 votes

/**
 * Test support for Date and enum serialization. The sequence 0..2 is mapped to POJOs, unioned
 * with the collection-based POJO data set, grouped by "group" and group-reduced; the collected
 * text must equal two "ok" lines.
 */
@Test
public void testSupportForDataAndEnumSerialization() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	DataSet<PojoWithDateAndEnum> pojos = env.generateSequence(0, 2).map(new Mapper1());
	pojos = pojos.union(CollectionDataSets.getPojoWithDateAndEnum(env));

	List<String> result = pojos
		.groupBy("group")
		.reduceGroup(new GroupReducer1())
		.collect();

	final String expected = "ok\nok";
	compareResultAsText(result, expected);
}

Example 8

Source File: UnionReplacementTest.java From Flink-CEPplus with Apache License 2.0

6 votes

/**
 * Builds a program in which one union is consumed by two discarding sinks and checks that it
 * can be optimized and turned into a job graph without an exception.
 */
@Test
public void testUnionReplacement() {
	try {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
		DataSet<String> input1 = env.fromElements("test1");
		DataSet<String> input2 = env.fromElements("test2");

		DataSet<String> union = input1.union(input2);

		// attach two sinks to the very same union
		union.output(new DiscardingOutputFormat<String>());
		union.output(new DiscardingOutputFormat<String>());

		Plan plan = env.createProgramPlan();
		OptimizedPlan oPlan = compileNoStats(plan);
		JobGraphGenerator jobGen = new JobGraphGenerator();
		jobGen.compileJobGraph(oPlan);
	}
	catch (Exception e) {
		// Keep the root cause on the assertion failure. The previous printStackTrace() plus
		// fail(e.getMessage()) separated the stack trace from the failure and could pass a null message.
		throw new AssertionError("Optimizing or translating the union program failed: " + e, e);
	}
}

Example 9

Source File: LargePlanTest.java From flink with Apache License 2.0

5 votes

/**
 * Stacks {@code branches} branch stages on top of {@code input} and returns the mapped union of
 * all branch results.
 *
 * <p>Each iteration derives a branch from {@code input}: every record is paired with the constant
 * key {@code 0} and suffixed with the iteration index, the pairs are grouped by that key, the
 * minimum by the string field is kept, and the string is extracted again. The branch is then
 * unioned into {@code stats} (or becomes {@code stats} if none exists yet). Once {@code stats} is
 * non-null, {@code input} is first passed through an identity map that receives a mapped view of
 * {@code stats} as the broadcast set named {@code "stats"}.
 *
 * <p>Note: when {@code stats} is {@code null} and {@code branches <= 0}, the loop never assigns
 * {@code stats} and the final {@code map} call dereferences {@code null}.
 *
 * @param input data set the branches are derived from
 * @param stats previously accumulated statistics, or {@code null} to start from the first branch
 * @param branches number of branch stages to add
 * @return the unioned statistics, mapped to {@code "(" + s + ").stats"}
 */
private static DataSet<String> analyze(DataSet<String> input, DataSet<String> stats, int branches) {
	for (int i = 0; i < branches; i++) {
		final int ii = i;

		if (stats != null) {
			input = input.map(
				new RichMapFunction<String, String>() {
					@Override
					public String map(String value) {
						return value;
					}
			}).withBroadcastSet(stats.map(s -> "(" + s + ").map"), "stats");
		}

		// The lambda emits Tuple2<Integer, String> (f0 = 0, f1 = s + ii), so the declared type
		// information must list INT first and STRING second; the reverse order describes the
		// fields with the wrong types.
		DataSet<String> branch = input
			.map(s -> new Tuple2<>(0, s + ii)).returns(Types.TUPLE(Types.INT, Types.STRING))
			.groupBy(0)
			.minBy(1)
			.map(kv -> kv.f1).returns(Types.STRING);
		if (stats == null) {
			stats = branch;
		} else {
			stats = stats.union(branch);
		}
	}
	return stats.map(s -> "(" + s + ").stats");
}

Example 10

Source File: BootstrapDatasetUnion.java From stateful-functions with Apache License 2.0

5 votes

/**
 * Folds the given data sets into a single one by unioning them in iteration order.
 *
 * @return the union of all data sets, or {@code null} if the list is empty
 */
private static DataSet<TaggedBootstrapData> unionTaggedBootstrapDataSets(
    List<DataSet<TaggedBootstrapData>> taggedBootstrapDatasets) {
  DataSet<TaggedBootstrapData> merged = null;
  for (DataSet<TaggedBootstrapData> next : taggedBootstrapDatasets) {
    merged = (merged == null) ? next : merged.union(next);
  }
  return merged;
}

Example 11

Source File: BootstrapDatasetUnion.java From flink-statefun with Apache License 2.0

5 votes

/**
 * Combines every data set of the list into one union, starting with the first data set and
 * unioning each following one onto the accumulated result.
 *
 * @return the accumulated union; {@code null} when the list contains no data set
 */
private static DataSet<TaggedBootstrapData> unionTaggedBootstrapDataSets(
    List<DataSet<TaggedBootstrapData>> taggedBootstrapDatasets) {
  DataSet<TaggedBootstrapData> accumulated = null;
  for (DataSet<TaggedBootstrapData> dataSet : taggedBootstrapDatasets) {
    if (accumulated == null) {
      // nothing accumulated so far: this data set becomes the starting point
      accumulated = dataSet;
      continue;
    }
    accumulated = accumulated.union(dataSet);
  }
  return accumulated;
}

Example 12

Source File: FlinkBatchPortablePipelineTranslator.java From beam with Apache License 2.0

5 votes

/**
 * Translates a flatten transform: the data sets of all inputs are unioned (or an empty stand-in
 * is created when there are no inputs) and the result is registered under the transform's only
 * output.
 */
private static <T> void translateFlatten(
    PTransformNode transform, RunnerApi.Pipeline pipeline, BatchTranslationContext context) {
  Map<String, String> inputIds = transform.getTransform().getInputsMap();
  DataSet<WindowedValue<T>> merged = null;

  if (!inputIds.isEmpty()) {
    for (String pCollectionId : inputIds.values()) {
      DataSet<WindowedValue<T>> next = context.getDataSetOrThrow(pCollectionId);
      merged = (merged == null) ? next : merged.union(next);
    }
  } else {
    // Flink offers no empty source, so downstream operations are satisfied by a one-element
    // placeholder source whose element is dropped by a flatMap that never emits.
    DataSource<String> placeholder = context.getExecutionEnvironment().fromElements("dummy");
    merged =
        placeholder
            .<WindowedValue<T>>flatMap(
                (element, out) -> {
                  // intentionally emits nothing
                })
            .returns(
                new CoderTypeInformation<>(
                    WindowedValue.getFullCoder(
                        (Coder<T>) VoidCoder.of(), GlobalWindow.Coder.INSTANCE)));
  }

  // Pass-through filter: without it Flink produces duplicate elements after the union in some
  // cases.
  merged = merged.filter(value -> true).name("UnionFixFilter");
  context.addDataSet(
      Iterables.getOnlyElement(transform.getTransform().getOutputsMap().values()), merged);
}

Example 13

Source File: WritableSavepoint.java From flink with Apache License 2.0

5 votes

/**
 * Adds the already existing operator states to the data set of new operator states.
 *
 * @return {@code newOperatorStates} unchanged when there are no existing operators; otherwise its
 *     union with a data set created from {@code existingOperators} in the same environment
 */
private DataSet<OperatorState> unionOperatorStates(DataSet<OperatorState> newOperatorStates, List<OperatorState> existingOperators) {
	if (existingOperators.isEmpty()) {
		return newOperatorStates;
	}

	// wrap the existing states in a data set of the same execution environment
	DataSet<OperatorState> existingStates = newOperatorStates
		.getExecutionEnvironment()
		.fromCollection(existingOperators);

	return newOperatorStates.union(existingStates);
}

Example 14

Source File: FlinkBatchTransformTranslators.java From flink-dataflow with Apache License 2.0

5 votes

/**
 * Unions the data sets of all input collections of the flatten transform and stores the result
 * as the transform's output data set. Without any input the stored data set is {@code null}.
 */
@Override
public void translateNode(Flatten.FlattenPCollectionList<T> transform, FlinkBatchTranslationContext context) {
	List<PCollection<T>> inputs = context.getInput(transform).getAll();

	DataSet<T> unioned = null;
	for (PCollection<T> input : inputs) {
		DataSet<T> next = context.getInputDataSet(input);
		unioned = (unioned == null) ? next : unioned.union(next);
	}

	context.setOutputDataSet(context.getOutput(transform), unioned);
}

Example 15

Source File: LargePlanTest.java From flink with Apache License 2.0

5 votes

/**
 * Adds {@code branches} branch stages to the plan and returns the mapped union of their results.
 *
 * <p>Per iteration, a branch is built from {@code input} by pairing each record (suffixed with
 * the iteration index) with the constant key {@code 0}, grouping by the key, keeping the minimum
 * by the string field and unwrapping the string. The branch is unioned into {@code stats}, or
 * replaces it when {@code stats} is still {@code null}. As soon as {@code stats} is non-null,
 * {@code input} is routed through an identity map that is given a mapped view of {@code stats}
 * as the broadcast set {@code "stats"}.
 *
 * <p>Note: with {@code stats == null} and {@code branches <= 0} nothing is ever assigned to
 * {@code stats}, so the final {@code map} call dereferences {@code null}.
 *
 * @param input data set every branch is derived from
 * @param stats statistics accumulated so far, or {@code null} if there are none yet
 * @param branches number of branch stages to build
 * @return the unioned statistics, each mapped to {@code "(" + s + ").stats"}
 */
private static DataSet<String> analyze(DataSet<String> input, DataSet<String> stats, int branches) {
	for (int i = 0; i < branches; i++) {
		final int ii = i;

		if (stats != null) {
			input = input.map(
				new RichMapFunction<String, String>() {
					@Override
					public String map(String value) {
						return value;
					}
			}).withBroadcastSet(stats.map(s -> "(" + s + ").map"), "stats");
		}

		// Tuple2<>(0, s + ii) is a Tuple2<Integer, String>; the type hint therefore has to be
		// (INT, STRING). The former (STRING, INT) hint declared both fields with the wrong type.
		DataSet<String> branch = input
			.map(s -> new Tuple2<>(0, s + ii)).returns(Types.TUPLE(Types.INT, Types.STRING))
			.groupBy(0)
			.minBy(1)
			.map(kv -> kv.f1).returns(Types.STRING);
		if (stats == null) {
			stats = branch;
		} else {
			stats = stats.union(branch);
		}
	}
	return stats.map(s -> "(" + s + ").stats");
}

Example 16

Source File: UnionReplacementTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Checks the ship strategies chosen around a union whose producers and consumers run with
 * different parallelisms.
 *
 * <pre>
 * Src1 - Map(fullP) -\-/- Union - Map(fullP) - Out
 *                     X
 * Src2 - Map(halfP) -/-\- Union - Map(halfP) - Out
 * </pre>
 *
 * <p>Every union is expected to run with the parallelism of the map consuming it and to be
 * connected to that map with FORWARD. On the input side of the unions, connections that keep
 * the parallelism are expected to be FORWARD and connections that change it PARTITION_RANDOM.
 */
@Test
public void testUnionInputOutputDifferentDOP() throws Exception {
	final int fullDop = DEFAULT_PARALLELISM;
	final int halfDop = DEFAULT_PARALLELISM / 2;

	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	DataSet<Tuple2<Long, Long>> fullInput = env.fromElements(new Tuple2<>(0L, 0L))
		.map(new IdentityMapper<>()).setParallelism(fullDop).name("inDopFull");
	DataSet<Tuple2<Long, Long>> halfInput = env.fromElements(new Tuple2<>(0L, 0L))
		.map(new IdentityMapper<>()).setParallelism(halfDop).name("inDopHalf");

	DataSet<Tuple2<Long, Long>> unioned = fullInput.union(halfInput);

	DataSet<Tuple2<Long, Long>> fullOutput = unioned
		.map(new IdentityMapper<>()).setParallelism(fullDop).name("outDopFull");
	DataSet<Tuple2<Long, Long>> halfOutput = unioned
		.map(new IdentityMapper<>()).setParallelism(halfDop).name("outDopHalf");

	fullOutput.output(new DiscardingOutputFormat<>());
	halfOutput.output(new DiscardingOutputFormat<>());

	// ---- verify the optimized plan ----

	OptimizerPlanNodeResolver resolver =
		getOptimizerPlanNodeResolver(compileNoStats(env.createProgramPlan()));

	SingleInputPlanNode inDopFull = resolver.getNode("inDopFull");
	SingleInputPlanNode inDopHalf = resolver.getNode("inDopHalf");
	SingleInputPlanNode outDopFull = resolver.getNode("outDopFull");
	SingleInputPlanNode outDopHalf = resolver.getNode("outDopHalf");
	NAryUnionPlanNode unionDopFull = (NAryUnionPlanNode) outDopFull.getInput().getSource();
	NAryUnionPlanNode unionDopHalf = (NAryUnionPlanNode) outDopHalf.getInput().getSource();

	// input maps: two outgoing channels each, configured parallelism
	assertEquals(2, inDopFull.getOutgoingChannels().size());
	assertEquals(2, inDopHalf.getOutgoingChannels().size());
	assertEquals(fullDop, inDopFull.getParallelism());
	assertEquals(halfDop, inDopHalf.getParallelism());

	// unions take the parallelism of their consumer
	assertEquals(fullDop, unionDopFull.getParallelism());
	assertEquals(halfDop, unionDopHalf.getParallelism());

	// output maps keep the configured parallelism
	assertEquals(fullDop, outDopFull.getParallelism());
	assertEquals(halfDop, outDopHalf.getParallelism());

	// union -> output map connections
	assertEquals(ShipStrategyType.FORWARD, outDopHalf.getInput().getShipStrategy());
	assertEquals(ShipStrategyType.FORWARD, outDopFull.getInput().getShipStrategy());

	// input map -> union connections: find out which outgoing channel leads to the full-DOP union
	int fullToFullIdx = inDopFull.getOutgoingChannels().get(0).getTarget() == unionDopFull ? 0 : 1;
	Channel fullFull = inDopFull.getOutgoingChannels().get(fullToFullIdx);
	Channel fullHalf = inDopFull.getOutgoingChannels().get(1 - fullToFullIdx);

	int halfToFullIdx = inDopHalf.getOutgoingChannels().get(0).getTarget() == unionDopFull ? 0 : 1;
	Channel halfFull = inDopHalf.getOutgoingChannels().get(halfToFullIdx);
	Channel halfHalf = inDopHalf.getOutgoingChannels().get(1 - halfToFullIdx);

	assertEquals(ShipStrategyType.FORWARD, fullFull.getShipStrategy());
	assertEquals(ShipStrategyType.FORWARD, halfHalf.getShipStrategy());
	assertEquals(ShipStrategyType.PARTITION_RANDOM, fullHalf.getShipStrategy());
	assertEquals(ShipStrategyType.PARTITION_RANDOM, halfFull.getShipStrategy());
}

Example 17

Source File: UnionReplacementTest.java From flink with Apache License 2.0

4 votes

/**
 * Tests the input and output shipping strategies of union operators whose input and output
 * operators have different parallelisms.
 *
 * <pre>
 * Src1 - Map(fullP) -\-/- Union - Map(fullP) - Out
 *                     X
 * Src2 - Map(halfP) -/-\- Union - Map(halfP) - Out
 * </pre>
 *
 * <p>Expected: a union has the parallelism of its successor and forwards to it (FORWARD);
 * union inputs are FORWARD when the parallelism is preserved and PARTITION_RANDOM when it
 * changes.
 */
@Test
public void testUnionInputOutputDifferentDOP() throws Exception {

	int fullDop = DEFAULT_PARALLELISM;
	int halfDop = DEFAULT_PARALLELISM / 2;

	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	DataSet<Tuple2<Long, Long>> srcFull = env.fromElements(new Tuple2<>(0L, 0L))
		.map(new IdentityMapper<>()).setParallelism(fullDop).name("inDopFull");
	DataSet<Tuple2<Long, Long>> srcHalf = env.fromElements(new Tuple2<>(0L, 0L))
		.map(new IdentityMapper<>()).setParallelism(halfDop).name("inDopHalf");

	DataSet<Tuple2<Long, Long>> both = srcFull.union(srcHalf);

	DataSet<Tuple2<Long, Long>> sinkInputFull = both
		.map(new IdentityMapper<>()).setParallelism(fullDop).name("outDopFull");
	DataSet<Tuple2<Long, Long>> sinkInputHalf = both
		.map(new IdentityMapper<>()).setParallelism(halfDop).name("outDopHalf");

	sinkInputFull.output(new DiscardingOutputFormat<>());
	sinkInputHalf.output(new DiscardingOutputFormat<>());

	// -----------------------------------------------------------------------------------------
	// Inspect the optimized plan
	// -----------------------------------------------------------------------------------------

	OptimizedPlan optimizedPlan = compileNoStats(env.createProgramPlan());
	OptimizerPlanNodeResolver resolver = getOptimizerPlanNodeResolver(optimizedPlan);

	SingleInputPlanNode inDopFull = resolver.getNode("inDopFull");
	SingleInputPlanNode inDopHalf = resolver.getNode("inDopHalf");
	SingleInputPlanNode outDopFull = resolver.getNode("outDopFull");
	SingleInputPlanNode outDopHalf = resolver.getNode("outDopHalf");
	NAryUnionPlanNode unionDopFull = (NAryUnionPlanNode) outDopFull.getInput().getSource();
	NAryUnionPlanNode unionDopHalf = (NAryUnionPlanNode) outDopHalf.getInput().getSource();

	// source-side maps
	assertEquals(2, inDopFull.getOutgoingChannels().size());
	assertEquals(2, inDopHalf.getOutgoingChannels().size());
	assertEquals(fullDop, inDopFull.getParallelism());
	assertEquals(halfDop, inDopHalf.getParallelism());

	// unions
	assertEquals(fullDop, unionDopFull.getParallelism());
	assertEquals(halfDop, unionDopHalf.getParallelism());

	// sink-side maps
	assertEquals(fullDop, outDopFull.getParallelism());
	assertEquals(halfDop, outDopHalf.getParallelism());

	// union -> sink-side map
	assertEquals(ShipStrategyType.FORWARD, outDopHalf.getInput().getShipStrategy());
	assertEquals(ShipStrategyType.FORWARD, outDopFull.getInput().getShipStrategy());

	// source-side map -> union: does the first outgoing channel lead to the full-DOP union?
	boolean fullFirstHitsFull = inDopFull.getOutgoingChannels().get(0).getTarget() == unionDopFull;
	Channel fullFull = inDopFull.getOutgoingChannels().get(fullFirstHitsFull ? 0 : 1);
	Channel fullHalf = inDopFull.getOutgoingChannels().get(fullFirstHitsFull ? 1 : 0);

	boolean halfFirstHitsFull = inDopHalf.getOutgoingChannels().get(0).getTarget() == unionDopFull;
	Channel halfFull = inDopHalf.getOutgoingChannels().get(halfFirstHitsFull ? 0 : 1);
	Channel halfHalf = inDopHalf.getOutgoingChannels().get(halfFirstHitsFull ? 1 : 0);

	assertEquals(ShipStrategyType.FORWARD, fullFull.getShipStrategy());
	assertEquals(ShipStrategyType.FORWARD, halfHalf.getShipStrategy());
	assertEquals(ShipStrategyType.PARTITION_RANDOM, fullHalf.getShipStrategy());
	assertEquals(ShipStrategyType.PARTITION_RANDOM, halfFull.getShipStrategy());
}

Example 18

Source File: WordCount.java From flink with Apache License 2.0

4 votes

/**
 * Runs the WordCount example. Text is read from every file passed via {@code --input} (unioned
 * into one data set) or from the built-in default data; the word counts are written as CSV to
 * {@code --output} or printed to stdout.
 */
public static void main(String[] args) throws Exception {
	final MultipleParameterTool params = MultipleParameterTool.fromArgs(args);
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

	// expose the parameters in the web interface
	env.getConfig().setGlobalJobParameters(params);

	final DataSet<String> text;
	if (!params.has("input")) {
		// fall back to the default test text
		System.out.println("Executing WordCount example with default input data set.");
		System.out.println("Use --input to specify file input.");
		text = WordCountData.getDefaultTextLineDataSet(env);
	} else {
		// read every given file and union them into one data set
		DataSet<String> unioned = null;
		for (String path : params.getMultiParameterRequired("input")) {
			DataSet<String> lines = env.readTextFile(path);
			unioned = (unioned == null) ? lines : unioned.union(lines);
		}
		Preconditions.checkNotNull(unioned, "Input DataSet should not be null.");
		text = unioned;
	}

	// tokenize into (word, 1) pairs, group by tuple field 0 and sum up tuple field 1
	DataSet<Tuple2<String, Integer>> counts = text
		.flatMap(new Tokenizer())
		.groupBy(0)
		.sum(1);

	if (!params.has("output")) {
		System.out.println("Printing result to stdout. Use --output to specify output path.");
		counts.print();
		return;
	}

	counts.writeAsCsv(params.get("output"), "\n", " ");
	// a file sink needs an explicit execute call
	env.execute("WordCount Example");
}

Example 19

Source File: UnionReplacementTest.java From flink with Apache License 2.0

4 votes

/**
 * Verifies parallelism and ship strategies of a plan in which two maps with different
 * parallelisms are unioned and the union is consumed by two maps with different parallelisms.
 *
 * <pre>
 * Src1 - Map(fullP) -\-/- Union - Map(fullP) - Out
 *                     X
 * Src2 - Map(halfP) -/-\- Union - Map(halfP) - Out
 * </pre>
 *
 * <p>The union must always share the parallelism of its successor and be connected to it with
 * FORWARD. The union's input connections should be FORWARD where the parallelism stays the same
 * and PARTITION_RANDOM where it changes.
 */
@Test
public void testUnionInputOutputDifferentDOP() throws Exception {
	final int fullDop = DEFAULT_PARALLELISM;
	final int halfDop = DEFAULT_PARALLELISM / 2;

	ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(DEFAULT_PARALLELISM);

	DataSet<Tuple2<Long, Long>> mappedFull = env.fromElements(new Tuple2<>(0L, 0L))
		.map(new IdentityMapper<>()).setParallelism(fullDop).name("inDopFull");
	DataSet<Tuple2<Long, Long>> mappedHalf = env.fromElements(new Tuple2<>(0L, 0L))
		.map(new IdentityMapper<>()).setParallelism(halfDop).name("inDopHalf");

	DataSet<Tuple2<Long, Long>> merged = mappedFull.union(mappedHalf);

	DataSet<Tuple2<Long, Long>> consumerFull = merged
		.map(new IdentityMapper<>()).setParallelism(fullDop).name("outDopFull");
	DataSet<Tuple2<Long, Long>> consumerHalf = merged
		.map(new IdentityMapper<>()).setParallelism(halfDop).name("outDopHalf");

	consumerFull.output(new DiscardingOutputFormat<>());
	consumerHalf.output(new DiscardingOutputFormat<>());

	// optimize the program and look up the named nodes
	OptimizedPlan optimizedPlan = compileNoStats(env.createProgramPlan());
	OptimizerPlanNodeResolver resolver = getOptimizerPlanNodeResolver(optimizedPlan);

	SingleInputPlanNode inDopFull = resolver.getNode("inDopFull");
	SingleInputPlanNode inDopHalf = resolver.getNode("inDopHalf");
	SingleInputPlanNode outDopFull = resolver.getNode("outDopFull");
	SingleInputPlanNode outDopHalf = resolver.getNode("outDopHalf");
	NAryUnionPlanNode unionDopFull = (NAryUnionPlanNode) outDopFull.getInput().getSource();
	NAryUnionPlanNode unionDopHalf = (NAryUnionPlanNode) outDopHalf.getInput().getSource();

	// producing maps
	assertEquals(2, inDopFull.getOutgoingChannels().size());
	assertEquals(2, inDopHalf.getOutgoingChannels().size());
	assertEquals(fullDop, inDopFull.getParallelism());
	assertEquals(halfDop, inDopHalf.getParallelism());

	// union nodes
	assertEquals(fullDop, unionDopFull.getParallelism());
	assertEquals(halfDop, unionDopHalf.getParallelism());

	// consuming maps
	assertEquals(fullDop, outDopFull.getParallelism());
	assertEquals(halfDop, outDopHalf.getParallelism());

	// union -> consuming map
	assertEquals(ShipStrategyType.FORWARD, outDopHalf.getInput().getShipStrategy());
	assertEquals(ShipStrategyType.FORWARD, outDopFull.getInput().getShipStrategy());

	// producing map -> union: assume channel 0 targets the full-DOP union, swap if it does not
	Channel fullFull = inDopFull.getOutgoingChannels().get(0);
	Channel fullHalf = inDopFull.getOutgoingChannels().get(1);
	if (fullFull.getTarget() != unionDopFull) {
		Channel swap = fullFull;
		fullFull = fullHalf;
		fullHalf = swap;
	}

	Channel halfFull = inDopHalf.getOutgoingChannels().get(0);
	Channel halfHalf = inDopHalf.getOutgoingChannels().get(1);
	if (halfFull.getTarget() != unionDopFull) {
		Channel swap = halfFull;
		halfFull = halfHalf;
		halfHalf = swap;
	}

	assertEquals(ShipStrategyType.FORWARD, fullFull.getShipStrategy());
	assertEquals(ShipStrategyType.FORWARD, halfHalf.getShipStrategy());
	assertEquals(ShipStrategyType.PARTITION_RANDOM, fullHalf.getShipStrategy());
	assertEquals(ShipStrategyType.PARTITION_RANDOM, halfFull.getShipStrategy());
}

Example 20

Source File: FlinkFlowStep.java From cascading-flink with Apache License 2.0

4 votes

/**
 * Maps every input data set to {@code Tuple3<Tuple, Integer, Tuple>} records via
 * {@code BufferJoinKeyExtractor} and unions the mapped data sets into one co-group input.
 *
 * <p>Side effect: an entry of {@code keyFields} for which {@code isNone()} holds is replaced in
 * the caller's array by {@code new Fields("defaultKey")}.
 *
 * @param inputs data sets to prepare, processed in list order
 * @param node flow node whose ID is used in the operator name ("coGroup-" + ID)
 * @param inputFields fields of each input, indexed like {@code inputs}
 * @param keyFields key fields of each input, indexed like {@code inputs}; may be modified (see above)
 * @param flinkKeys not read by this method
 * @param dop not read by this method
 * @return the union of all mapped inputs, or {@code null} if {@code inputs} is empty
 */
private DataSet<Tuple3<Tuple, Integer, Tuple>> prepareBufferCoGroupInput(List<DataSet<Tuple>> inputs,
					FlowNode node, Fields[] inputFields, Fields[] keyFields, String[][] flinkKeys, int dop) {

	// accumulated union; stays null until the first input has been mapped
	DataSet<Tuple3<Tuple, Integer, Tuple>> coGroupInput = null;

	for(int i=0; i<inputs.size(); i++) {

		// get Flink DataSet
		DataSet<Tuple> input = inputs.get(i);

		// get key positions; computed from the key fields as passed in, i.e. before the
		// default-key substitution below
		int[] keyPos = inputFields[i].getPos(keyFields[i]);

		if(keyFields[i].isNone()) {
			// set default key (overwrites the entry in the caller's array)
			keyFields[i] = new Fields("defaultKey");
		}

		// type info of the key part: selected key fields if the input fields are defined,
		// otherwise Fields.UNKNOWN
		TupleTypeInfo keysTypeInfo = inputFields[i].isDefined() ?
				new TupleTypeInfo(inputFields[i].select(keyFields[i])) :
				new TupleTypeInfo(Fields.UNKNOWN);

		// result type of the mapper: (key type info, int, type info of the input fields)
		TypeInformation<Tuple3<Tuple, Integer, Tuple>> keyedType =
				new org.apache.flink.api.java.typeutils.TupleTypeInfo<>(
						keysTypeInfo,
						BasicTypeInfo.INT_TYPE_INFO,
						new TupleTypeInfo(inputFields[i])
		);

		// NOTE(review): the cast assumes every input data set is an Operator — confirm with callers
		int inputDop = ((Operator)input).getParallelism();

		// add mapper; it runs with the same parallelism as its input
		DataSet<Tuple3<Tuple, Integer, Tuple>> keyedInput = input
				.map(new BufferJoinKeyExtractor(i, keyPos))
				.returns(keyedType)
				.setParallelism(inputDop)
				.name("coGroup-" + node.getID());

		// add to groupByInput
		if(coGroupInput == null) {
			coGroupInput = keyedInput;
		}
		else {
			coGroupInput = coGroupInput
					.union(keyedInput);
		}
	}

	return coGroupInput;
}