org.apache.flink.api.common.functions.RichMapPartitionFunction Java Examples
The following examples show how to use
org.apache.flink.api.common.functions.RichMapPartitionFunction.
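Before the project examples, here is a minimal sketch of the class in action. The pipeline, class name, and element values below are illustrative assumptions, not drawn from the projects that follow. A RichMapPartitionFunction receives all records of one parallel partition as a single Iterable and, unlike the plain MapPartitionFunction, can access the RuntimeContext and override open()/close().

import org.apache.flink.api.common.functions.RichMapPartitionFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.util.Collector;

public class RichMapPartitionSketch {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSet<String> words = env.fromElements("a", "b", "c", "d");

        // Emit one record per parallel partition: "subtask index: element count".
        DataSet<String> counts = words.mapPartition(new RichMapPartitionFunction<String, String>() {
            @Override
            public void mapPartition(Iterable<String> values, Collector<String> out) {
                long count = 0;
                for (String ignored : values) {
                    count++;
                }
                out.collect(getRuntimeContext().getIndexOfThisSubtask() + ": " + count);
            }
        });

        counts.print();
    }
}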
Example #1
Source File: ExecutionEnvironmentITCase.java (from Flink-CEPplus, Apache License 2.0)
/**
 * Ensure that the user can pass a custom configuration object to the LocalEnvironment.
 */
@Test
public void testLocalEnvironmentWithConfig() throws Exception {
    Configuration conf = new Configuration();
    conf.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, PARALLELISM);

    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
    env.setParallelism(ExecutionConfig.PARALLELISM_AUTO_MAX);
    env.getConfig().disableSysoutLogging();

    DataSet<Integer> result = env.createInput(new ParallelismDependentInputFormat())
            .rebalance()
            .mapPartition(new RichMapPartitionFunction<Integer, Integer>() {
                @Override
                public void mapPartition(Iterable<Integer> values, Collector<Integer> out) throws Exception {
                    out.collect(getRuntimeContext().getIndexOfThisSubtask());
                }
            });
    List<Integer> resultCollection = result.collect();
    assertEquals(PARALLELISM, resultCollection.size());
}
Example #2
Source File: ExecutionEnvironmentITCase.java (from flink, Apache License 2.0)
/**
 * Ensure that the user can pass a custom configuration object to the LocalEnvironment.
 */
@Test
public void testLocalEnvironmentWithConfig() throws Exception {
    Configuration conf = new Configuration();
    conf.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, PARALLELISM);

    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);
    env.getConfig().disableSysoutLogging();

    DataSet<Integer> result = env.createInput(new ParallelismDependentInputFormat())
            .rebalance()
            .mapPartition(new RichMapPartitionFunction<Integer, Integer>() {
                @Override
                public void mapPartition(Iterable<Integer> values, Collector<Integer> out) throws Exception {
                    out.collect(getRuntimeContext().getIndexOfThisSubtask());
                }
            });
    List<Integer> resultCollection = result.collect();
    assertEquals(PARALLELISM, resultCollection.size());
}
Example #3
Source File: ExecutionEnvironmentITCase.java (from flink, Apache License 2.0)
/**
 * Ensure that the user can pass a custom configuration object to the LocalEnvironment.
 */
@Test
public void testLocalEnvironmentWithConfig() throws Exception {
    Configuration conf = new Configuration();
    conf.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, PARALLELISM);

    final ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(conf);

    DataSet<Integer> result = env.createInput(new ParallelismDependentInputFormat())
            .rebalance()
            .mapPartition(new RichMapPartitionFunction<Integer, Integer>() {
                @Override
                public void mapPartition(Iterable<Integer> values, Collector<Integer> out) throws Exception {
                    out.collect(getRuntimeContext().getIndexOfThisSubtask());
                }
            });
    List<Integer> resultCollection = result.collect();
    assertEquals(PARALLELISM, resultCollection.size());
}
Example #4
Source File: FunctionCompiler.java (from rheem, Apache License 2.0)
public <I, O> RichMapPartitionFunction<I, O> compile(MapPartitionsDescriptor<I, O> descriptor, FlinkExecutionContext fex) {
    FunctionDescriptor.ExtendedSerializableFunction<Iterable<I>, Iterable<O>> function =
            (FunctionDescriptor.ExtendedSerializableFunction<Iterable<I>, Iterable<O>>) descriptor.getJavaImplementation();

    return new RichMapPartitionFunction<I, O>() {
        @Override
        public void mapPartition(Iterable<I> iterable, Collector<O> collector) throws Exception {
            function.apply(iterable).forEach(collector::collect);
        }

        @Override
        public void open(Configuration parameters) throws Exception {
            function.open(fex);
        }
    };
}
Example #5
Source File: DataSetUtils.java (from Flink-CEPplus, Apache License 2.0)
/**
 * Method that goes over all the elements in each partition in order to retrieve
 * the total number of elements.
 *
 * @param input the DataSet received as input
 * @return a data set containing tuples of subtask index, number of elements mappings.
 */
public static <T> DataSet<Tuple2<Integer, Long>> countElementsPerPartition(DataSet<T> input) {
    return input.mapPartition(new RichMapPartitionFunction<T, Tuple2<Integer, Long>>() {
        @Override
        public void mapPartition(Iterable<T> values, Collector<Tuple2<Integer, Long>> out) throws Exception {
            long counter = 0;
            for (T value : values) {
                counter++;
            }
            out.collect(new Tuple2<>(getRuntimeContext().getIndexOfThisSubtask(), counter));
        }
    });
}
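For context, a short usage sketch (the sequence source and print sink are illustrative assumptions): each parallel subtask emits exactly one (subtask index, count) tuple, so the result has one record per partition.

ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSet<Long> numbers = env.generateSequence(1, 100);

// One Tuple2<subtask index, element count> per parallel partition.
DataSet<Tuple2<Integer, Long>> counts = DataSetUtils.countElementsPerPartition(numbers);
counts.print();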
Example #6
Source File: RemoteEnvironmentITCase.java (from Flink-CEPplus, Apache License 2.0)
/**
 * Ensure that the program parallelism can be set even if the configuration is supplied.
 */
@Test
public void testUserSpecificParallelism() throws Exception {
    Configuration config = new Configuration();
    config.setString(AkkaOptions.STARTUP_TIMEOUT, VALID_STARTUP_TIMEOUT);

    final URI restAddress = MINI_CLUSTER_RESOURCE.getRestAddres();
    final String hostname = restAddress.getHost();
    final int port = restAddress.getPort();

    final ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment(
            hostname, port, config);
    env.setParallelism(USER_DOP);
    env.getConfig().disableSysoutLogging();

    DataSet<Integer> result = env.createInput(new ParallelismDependentInputFormat())
            .rebalance()
            .mapPartition(new RichMapPartitionFunction<Integer, Integer>() {
                @Override
                public void mapPartition(Iterable<Integer> values, Collector<Integer> out) throws Exception {
                    out.collect(getRuntimeContext().getIndexOfThisSubtask());
                }
            });
    List<Integer> resultCollection = result.collect();
    assertEquals(USER_DOP, resultCollection.size());
}
Example #7
Source File: DataSetUtils.java (from Flink-CEPplus, Apache License 2.0)
/**
 * Method that assigns a unique {@link Long} value to all elements in the input data set as described below.
 * <ul>
 *  <li> a map function is applied to the input data set
 *  <li> each map task holds a counter c which is increased for each record
 *  <li> c is shifted by n bits where n = log2(number of parallel tasks)
 *  <li> to create a unique ID among all tasks, the task id is added to the counter
 *  <li> for each record, the resulting counter is collected
 * </ul>
 *
 * @param input the input data set
 * @return a data set of tuple 2 consisting of ids and initial values.
 */
public static <T> DataSet<Tuple2<Long, T>> zipWithUniqueId(DataSet<T> input) {
    return input.mapPartition(new RichMapPartitionFunction<T, Tuple2<Long, T>>() {

        long maxBitSize = getBitSize(Long.MAX_VALUE);
        long shifter = 0;
        long start = 0;
        long taskId = 0;
        long label = 0;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            shifter = getBitSize(getRuntimeContext().getNumberOfParallelSubtasks() - 1);
            taskId = getRuntimeContext().getIndexOfThisSubtask();
        }

        @Override
        public void mapPartition(Iterable<T> values, Collector<Tuple2<Long, T>> out) throws Exception {
            for (T value : values) {
                label = (start << shifter) + taskId;

                if (getBitSize(start) + shifter < maxBitSize) {
                    out.collect(new Tuple2<>(label, value));
                    start++;
                } else {
                    throw new Exception("Exceeded Long value range while generating labels");
                }
            }
        }
    });
}
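To make the bit-shifting concrete: with parallelism 4 the shifter is 2 bits (getBitSize(3) = 2), so subtask 3 labels its records 3, 7, 11, ...; for example, counter value 5 yields (5 << 2) + 3 = 23. The ids are unique across subtasks but not consecutive. A usage sketch, with source and sink as illustrative assumptions:

ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSet<String> input = env.fromElements("a", "b", "c");

// Unique but non-consecutive ids: per-task counter shifted left, task id added.
DataSet<Tuple2<Long, String>> withIds = DataSetUtils.zipWithUniqueId(input);
withIds.print();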
Example #8
Source File: DataSetUtils.java (from flink, Apache License 2.0)
/**
 * Method that assigns a unique {@link Long} value to all elements in the input data set as described below.
 * <ul>
 *  <li> a map function is applied to the input data set
 *  <li> each map task holds a counter c which is increased for each record
 *  <li> c is shifted by n bits where n = log2(number of parallel tasks)
 *  <li> to create a unique ID among all tasks, the task id is added to the counter
 *  <li> for each record, the resulting counter is collected
 * </ul>
 *
 * @param input the input data set
 * @return a data set of tuple 2 consisting of ids and initial values.
 */
public static <T> DataSet<Tuple2<Long, T>> zipWithUniqueId(DataSet<T> input) {
    return input.mapPartition(new RichMapPartitionFunction<T, Tuple2<Long, T>>() {

        long maxBitSize = getBitSize(Long.MAX_VALUE);
        long shifter = 0;
        long start = 0;
        long taskId = 0;
        long label = 0;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            shifter = getBitSize(getRuntimeContext().getNumberOfParallelSubtasks() - 1);
            taskId = getRuntimeContext().getIndexOfThisSubtask();
        }

        @Override
        public void mapPartition(Iterable<T> values, Collector<Tuple2<Long, T>> out) throws Exception {
            for (T value : values) {
                label = (start << shifter) + taskId;

                if (getBitSize(start) + shifter < maxBitSize) {
                    out.collect(new Tuple2<>(label, value));
                    start++;
                } else {
                    throw new Exception("Exceeded Long value range while generating labels");
                }
            }
        }
    });
}
Example #9
Source File: RemoteEnvironmentITCase.java (from flink, Apache License 2.0)
/**
 * Ensure that the program parallelism can be set even if the configuration is supplied.
 */
@Test
public void testUserSpecificParallelism() throws Exception {
    Configuration config = new Configuration();
    config.setString(AkkaOptions.STARTUP_TIMEOUT, VALID_STARTUP_TIMEOUT);

    final URI restAddress = MINI_CLUSTER_RESOURCE.getRestAddres();
    final String hostname = restAddress.getHost();
    final int port = restAddress.getPort();

    final ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment(
            hostname, port, config);
    env.setParallelism(USER_DOP);
    env.getConfig().disableSysoutLogging();

    DataSet<Integer> result = env.createInput(new ParallelismDependentInputFormat())
            .rebalance()
            .mapPartition(new RichMapPartitionFunction<Integer, Integer>() {
                @Override
                public void mapPartition(Iterable<Integer> values, Collector<Integer> out) throws Exception {
                    out.collect(getRuntimeContext().getIndexOfThisSubtask());
                }
            });
    List<Integer> resultCollection = result.collect();
    assertEquals(USER_DOP, resultCollection.size());
}
Example #10
Source File: DataSetUtils.java (from flink, Apache License 2.0)
/**
 * Method that goes over all the elements in each partition in order to retrieve
 * the total number of elements.
 *
 * @param input the DataSet received as input
 * @return a data set containing tuples of subtask index, number of elements mappings.
 */
public static <T> DataSet<Tuple2<Integer, Long>> countElementsPerPartition(DataSet<T> input) {
    return input.mapPartition(new RichMapPartitionFunction<T, Tuple2<Integer, Long>>() {
        @Override
        public void mapPartition(Iterable<T> values, Collector<Tuple2<Integer, Long>> out) throws Exception {
            long counter = 0;
            for (T value : values) {
                counter++;
            }
            out.collect(new Tuple2<>(getRuntimeContext().getIndexOfThisSubtask(), counter));
        }
    });
}
Example #11
Source File: RemoteEnvironmentITCase.java (from flink, Apache License 2.0)
/**
 * Ensure that the program parallelism can be set even if the configuration is supplied.
 */
@Test
public void testUserSpecificParallelism() throws Exception {
    Configuration config = new Configuration();
    config.setString(AkkaOptions.STARTUP_TIMEOUT, VALID_STARTUP_TIMEOUT);

    final URI restAddress = MINI_CLUSTER_RESOURCE.getRestAddres();
    final String hostname = restAddress.getHost();
    final int port = restAddress.getPort();

    final ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment(
            hostname, port, config);
    env.setParallelism(USER_DOP);

    DataSet<Integer> result = env.createInput(new ParallelismDependentInputFormat())
            .rebalance()
            .mapPartition(new RichMapPartitionFunction<Integer, Integer>() {
                @Override
                public void mapPartition(Iterable<Integer> values, Collector<Integer> out) throws Exception {
                    out.collect(getRuntimeContext().getIndexOfThisSubtask());
                }
            });
    List<Integer> resultCollection = result.collect();
    assertEquals(USER_DOP, resultCollection.size());
}
Example #12
Source File: PartitionMapOperatorTest.java (from flink, Apache License 2.0)
@Test
public void testMapPartitionWithRuntimeContext() {
    try {
        final String taskName = "Test Task";
        final AtomicBoolean opened = new AtomicBoolean();
        final AtomicBoolean closed = new AtomicBoolean();

        final MapPartitionFunction<String, Integer> parser = new RichMapPartitionFunction<String, Integer>() {

            @Override
            public void open(Configuration parameters) throws Exception {
                opened.set(true);
                RuntimeContext ctx = getRuntimeContext();
                assertEquals(0, ctx.getIndexOfThisSubtask());
                assertEquals(1, ctx.getNumberOfParallelSubtasks());
                assertEquals(taskName, ctx.getTaskName());
            }

            @Override
            public void mapPartition(Iterable<String> values, Collector<Integer> out) {
                for (String s : values) {
                    out.collect(Integer.parseInt(s));
                }
            }

            @Override
            public void close() throws Exception {
                closed.set(true);
            }
        };

        MapPartitionOperatorBase<String, Integer, MapPartitionFunction<String, Integer>> op =
                new MapPartitionOperatorBase<String, Integer, MapPartitionFunction<String, Integer>>(
                        parser,
                        new UnaryOperatorInformation<String, Integer>(BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO),
                        taskName);

        List<String> input = new ArrayList<String>(asList("1", "2", "3", "4", "5", "6"));

        final TaskInfo taskInfo = new TaskInfo(taskName, 1, 0, 1, 0);

        ExecutionConfig executionConfig = new ExecutionConfig();

        executionConfig.disableObjectReuse();
        List<Integer> resultMutableSafe = op.executeOnCollections(input,
                new RuntimeUDFContext(taskInfo, null, executionConfig,
                        new HashMap<String, Future<Path>>(),
                        new HashMap<String, Accumulator<?, ?>>(),
                        new UnregisteredMetricsGroup()),
                executionConfig);

        executionConfig.enableObjectReuse();
        List<Integer> resultRegular = op.executeOnCollections(input,
                new RuntimeUDFContext(taskInfo, null, executionConfig,
                        new HashMap<String, Future<Path>>(),
                        new HashMap<String, Accumulator<?, ?>>(),
                        new UnregisteredMetricsGroup()),
                executionConfig);

        assertEquals(asList(1, 2, 3, 4, 5, 6), resultMutableSafe);
        assertEquals(asList(1, 2, 3, 4, 5, 6), resultRegular);

        assertTrue(opened.get());
        assertTrue(closed.get());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Example #13
Source File: DataSetUtils.java (from flink, Apache License 2.0)
/**
 * Method that assigns a unique {@link Long} value to all elements in the input data set. The generated values are
 * consecutive.
 *
 * @param input the input data set
 * @return a data set of tuple 2 consisting of consecutive ids and initial values.
 */
public static <T> DataSet<Tuple2<Long, T>> zipWithIndex(DataSet<T> input) {

    DataSet<Tuple2<Integer, Long>> elementCount = countElementsPerPartition(input);

    return input.mapPartition(new RichMapPartitionFunction<T, Tuple2<Long, T>>() {

        long start = 0;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);

            List<Tuple2<Integer, Long>> offsets = getRuntimeContext().getBroadcastVariableWithInitializer(
                    "counts",
                    new BroadcastVariableInitializer<Tuple2<Integer, Long>, List<Tuple2<Integer, Long>>>() {
                        @Override
                        public List<Tuple2<Integer, Long>> initializeBroadcastVariable(Iterable<Tuple2<Integer, Long>> data) {
                            // sort the list by task id to calculate the correct offset
                            List<Tuple2<Integer, Long>> sortedData = new ArrayList<>();
                            for (Tuple2<Integer, Long> datum : data) {
                                sortedData.add(datum);
                            }
                            Collections.sort(sortedData, new Comparator<Tuple2<Integer, Long>>() {
                                @Override
                                public int compare(Tuple2<Integer, Long> o1, Tuple2<Integer, Long> o2) {
                                    return o1.f0.compareTo(o2.f0);
                                }
                            });
                            return sortedData;
                        }
                    });

            // compute the offset for each partition
            for (int i = 0; i < getRuntimeContext().getIndexOfThisSubtask(); i++) {
                start += offsets.get(i).f1;
            }
        }

        @Override
        public void mapPartition(Iterable<T> values, Collector<Tuple2<Long, T>> out) throws Exception {
            for (T value : values) {
                out.collect(new Tuple2<>(start++, value));
            }
        }
    }).withBroadcastSet(elementCount, "counts");
}
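In contrast to zipWithUniqueId, this variant produces consecutive ids at the cost of an extra pass: countElementsPerPartition runs first, and its result is broadcast so each subtask can offset its counter by the totals of all lower-indexed subtasks. A usage sketch, with source and sink as illustrative assumptions:

ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSet<String> input = env.fromElements("a", "b", "c");

// Consecutive ids 0, 1, 2, ... across all partitions.
DataSet<Tuple2<Long, String>> indexed = DataSetUtils.zipWithIndex(input);
indexed.print();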
Example #14
Source File: ParallelPrefixSpan.java (from Alink, Apache License 2.0)
/**
 * Generate frequent sequence patterns using PrefixSpan algorithm.
 *
 * @return Frequent sequence patterns and their supports.
 */
public DataSet<Tuple2<int[], Integer>> run() {
    final int parallelism = BatchOperator.getExecutionEnvironmentFromDataSets(sequences).getParallelism();
    DataSet<Tuple2<Integer, int[]>> partitionedSequence = partitionSequence(sequences, itemCounts, parallelism);
    final int maxLength = maxPatternLength;

    return partitionedSequence
        .partitionCustom(new Partitioner<Integer>() {
            @Override
            public int partition(Integer key, int numPartitions) {
                return key % numPartitions;
            }
        }, 0)
        .mapPartition(new RichMapPartitionFunction<Tuple2<Integer, int[]>, Tuple2<int[], Integer>>() {
            @Override
            public void mapPartition(Iterable<Tuple2<Integer, int[]>> values, Collector<Tuple2<int[], Integer>> out) throws Exception {
                List<Long> bc1 = getRuntimeContext().getBroadcastVariable("minSupportCnt");
                List<Tuple2<Integer, Integer>> bc2 = getRuntimeContext().getBroadcastVariable("itemCounts");
                int taskId = getRuntimeContext().getIndexOfThisSubtask();
                long minSuppCnt = bc1.get(0);
                List<int[]> allSeq = new ArrayList<>();
                values.forEach(t -> allSeq.add(t.f1));
                List<Postfix> initialPostfixes = new ArrayList<>(allSeq.size());
                for (int i = 0; i < allSeq.size(); i++) {
                    initialPostfixes.add(new Postfix(i));
                }
                bc2.forEach(itemCount -> {
                    int item = itemCount.f0;
                    if (item % parallelism == taskId) {
                        generateFreqPattern(allSeq, initialPostfixes, item, minSuppCnt, maxLength, out);
                    }
                });
            }
        })
        .withBroadcastSet(this.minSupportCnt, "minSupportCnt")
        .withBroadcastSet(this.itemCounts, "itemCounts")
        .name("generate_freq_pattern");
}
Example #15
Source File: AlsTrainBatchOp.java (from Alink, Apache License 2.0)
/**
 * Matrix decomposition using ALS algorithm.
 *
 * @param inputs a dataset of user-item-rating tuples
 * @return user factors and item factors.
 */
@Override
public AlsTrainBatchOp linkFrom(BatchOperator<?>... inputs) {
    BatchOperator<?> in = checkAndGetFirst(inputs);

    final String userColName = getUserCol();
    final String itemColName = getItemCol();
    final String rateColName = getRateCol();

    final double lambda = getLambda();
    final int rank = getRank();
    final int numIter = getNumIter();
    final boolean nonNegative = getNonnegative();
    final boolean implicitPrefs = getImplicitPrefs();
    final double alpha = getAlpha();
    final int numMiniBatches = getNumBlocks();

    final int userColIdx = TableUtil.findColIndexWithAssertAndHint(in.getColNames(), userColName);
    final int itemColIdx = TableUtil.findColIndexWithAssertAndHint(in.getColNames(), itemColName);
    final int rateColIdx = TableUtil.findColIndexWithAssertAndHint(in.getColNames(), rateColName);

    // tuple3: userId, itemId, rating
    DataSet<Tuple3<Long, Long, Float>> alsInput = in.getDataSet()
        .map(new MapFunction<Row, Tuple3<Long, Long, Float>>() {
            @Override
            public Tuple3<Long, Long, Float> map(Row value) {
                return new Tuple3<>(((Number) value.getField(userColIdx)).longValue(),
                    ((Number) value.getField(itemColIdx)).longValue(),
                    ((Number) value.getField(rateColIdx)).floatValue());
            }
        });

    AlsTrain als = new AlsTrain(rank, numIter, lambda, implicitPrefs, alpha, numMiniBatches, nonNegative);
    DataSet<Tuple3<Byte, Long, float[]>> factors = als.fit(alsInput);

    DataSet<Row> output = factors.mapPartition(new RichMapPartitionFunction<Tuple3<Byte, Long, float[]>, Row>() {
        @Override
        public void mapPartition(Iterable<Tuple3<Byte, Long, float[]>> values, Collector<Row> out) {
            new AlsModelDataConverter(userColName, itemColName).save(values, out);
        }
    });

    this.setOutput(output, new AlsModelDataConverter(userColName, itemColName).getModelSchema());
    return this;
}
Example #16
Source File: BaseTuning.java (from Alink, Apache License 2.0)
private DataSet<Tuple2<Integer, Row>> split(BatchOperator<?> data, int k) {
    DataSet<Row> input = shuffle(data.getDataSet());
    DataSet<Tuple2<Integer, Long>> counts = DataSetUtils.countElementsPerPartition(input);

    return input
        .mapPartition(new RichMapPartitionFunction<Row, Tuple2<Integer, Row>>() {
            long taskStart = 0L;
            long totalNumInstance = 0L;

            @Override
            public void open(Configuration parameters) throws Exception {
                List<Tuple2<Integer, Long>> counts1 = getRuntimeContext().getBroadcastVariable("counts");
                int taskId = getRuntimeContext().getIndexOfThisSubtask();
                for (Tuple2<Integer, Long> cnt : counts1) {
                    // this subtask's global start offset is the total count of all lower-indexed subtasks
                    if (cnt.f0 < taskId) {
                        taskStart += cnt.f1;
                    }
                    totalNumInstance += cnt.f1;
                }
            }

            @Override
            public void mapPartition(Iterable<Row> values, Collector<Tuple2<Integer, Row>> out) throws Exception {
                DistributedInfo distributedInfo = new DefaultDistributedInfo();
                Tuple2<Integer, Long> split1 = new Tuple2<>(-1, -1L);
                long lcnt = taskStart;

                // locate the fold this subtask starts in, and the global position where that fold ends
                for (int i = 0; i <= k; ++i) {
                    long sp = distributedInfo.startPos(i, k, totalNumInstance);
                    long lrc = distributedInfo.localRowCnt(i, k, totalNumInstance);
                    if (taskStart < sp) {
                        split1.f0 = i - 1;
                        split1.f1 = distributedInfo.startPos(i - 1, k, totalNumInstance)
                            + distributedInfo.localRowCnt(i - 1, k, totalNumInstance);
                        break;
                    }
                    if (taskStart == sp) {
                        split1.f0 = i;
                        split1.f1 = sp + lrc;
                        break;
                    }
                }

                for (Row val : values) {
                    if (lcnt >= split1.f1) {
                        split1.f0 += 1;
                        split1.f1 = distributedInfo.localRowCnt(split1.f0, k, totalNumInstance) + lcnt;
                    }
                    out.collect(Tuple2.of(split1.f0, val));
                    lcnt++;
                }
            }
        }).withBroadcastSet(counts, "counts");
}
Example #17
Source File: PartitionMapOperatorTest.java (from Flink-CEPplus, Apache License 2.0)
@Test
public void testMapPartitionWithRuntimeContext() {
    try {
        final String taskName = "Test Task";
        final AtomicBoolean opened = new AtomicBoolean();
        final AtomicBoolean closed = new AtomicBoolean();

        final MapPartitionFunction<String, Integer> parser = new RichMapPartitionFunction<String, Integer>() {

            @Override
            public void open(Configuration parameters) throws Exception {
                opened.set(true);
                RuntimeContext ctx = getRuntimeContext();
                assertEquals(0, ctx.getIndexOfThisSubtask());
                assertEquals(1, ctx.getNumberOfParallelSubtasks());
                assertEquals(taskName, ctx.getTaskName());
            }

            @Override
            public void mapPartition(Iterable<String> values, Collector<Integer> out) {
                for (String s : values) {
                    out.collect(Integer.parseInt(s));
                }
            }

            @Override
            public void close() throws Exception {
                closed.set(true);
            }
        };

        MapPartitionOperatorBase<String, Integer, MapPartitionFunction<String, Integer>> op =
                new MapPartitionOperatorBase<String, Integer, MapPartitionFunction<String, Integer>>(
                        parser,
                        new UnaryOperatorInformation<String, Integer>(BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO),
                        taskName);

        List<String> input = new ArrayList<String>(asList("1", "2", "3", "4", "5", "6"));

        final TaskInfo taskInfo = new TaskInfo(taskName, 1, 0, 1, 0);

        ExecutionConfig executionConfig = new ExecutionConfig();

        executionConfig.disableObjectReuse();
        List<Integer> resultMutableSafe = op.executeOnCollections(input,
                new RuntimeUDFContext(taskInfo, null, executionConfig,
                        new HashMap<String, Future<Path>>(),
                        new HashMap<String, Accumulator<?, ?>>(),
                        new UnregisteredMetricsGroup()),
                executionConfig);

        executionConfig.enableObjectReuse();
        List<Integer> resultRegular = op.executeOnCollections(input,
                new RuntimeUDFContext(taskInfo, null, executionConfig,
                        new HashMap<String, Future<Path>>(),
                        new HashMap<String, Accumulator<?, ?>>(),
                        new UnregisteredMetricsGroup()),
                executionConfig);

        assertEquals(asList(1, 2, 3, 4, 5, 6), resultMutableSafe);
        assertEquals(asList(1, 2, 3, 4, 5, 6), resultRegular);

        assertTrue(opened.get());
        assertTrue(closed.get());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Example #18
Source File: DataSetUtils.java (from Flink-CEPplus, Apache License 2.0)
/**
 * Method that assigns a unique {@link Long} value to all elements in the input data set. The generated values are
 * consecutive.
 *
 * @param input the input data set
 * @return a data set of tuple 2 consisting of consecutive ids and initial values.
 */
public static <T> DataSet<Tuple2<Long, T>> zipWithIndex(DataSet<T> input) {

    DataSet<Tuple2<Integer, Long>> elementCount = countElementsPerPartition(input);

    return input.mapPartition(new RichMapPartitionFunction<T, Tuple2<Long, T>>() {

        long start = 0;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);

            List<Tuple2<Integer, Long>> offsets = getRuntimeContext().getBroadcastVariableWithInitializer(
                    "counts",
                    new BroadcastVariableInitializer<Tuple2<Integer, Long>, List<Tuple2<Integer, Long>>>() {
                        @Override
                        public List<Tuple2<Integer, Long>> initializeBroadcastVariable(Iterable<Tuple2<Integer, Long>> data) {
                            // sort the list by task id to calculate the correct offset
                            List<Tuple2<Integer, Long>> sortedData = new ArrayList<>();
                            for (Tuple2<Integer, Long> datum : data) {
                                sortedData.add(datum);
                            }
                            Collections.sort(sortedData, new Comparator<Tuple2<Integer, Long>>() {
                                @Override
                                public int compare(Tuple2<Integer, Long> o1, Tuple2<Integer, Long> o2) {
                                    return o1.f0.compareTo(o2.f0);
                                }
                            });
                            return sortedData;
                        }
                    });

            // compute the offset for each partition
            for (int i = 0; i < getRuntimeContext().getIndexOfThisSubtask(); i++) {
                start += offsets.get(i).f1;
            }
        }

        @Override
        public void mapPartition(Iterable<T> values, Collector<Tuple2<Long, T>> out) throws Exception {
            for (T value : values) {
                out.collect(new Tuple2<>(start++, value));
            }
        }
    }).withBroadcastSet(elementCount, "counts");
}