org.apache.flink.api.common.functions.MapPartitionFunction Java Examples

The following examples show how to use org.apache.flink.api.common.functions.MapPartitionFunction. Each example comes from an open-source project; the source file and license are noted above the code.
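MapPartitionFunction is the partition-at-a-time counterpart of MapFunction: the runtime calls mapPartition once per parallel partition, handing it an Iterable over all records of that partition and a Collector that may emit zero or more results. Before the project examples, here is a minimal, self-contained sketch; the class and variable names are illustrative, not taken from any project below.

import org.apache.flink.api.common.functions.MapPartitionFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.util.Collector;

public class MapPartitionExample {
	public static void main(String[] args) throws Exception {
		ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
		DataSet<String> words = env.fromElements("a", "b", "c");

		// Emit a single element per partition: the number of records it held.
		DataSet<Long> partitionSizes = words.mapPartition(
			new MapPartitionFunction<String, Long>() {
				@Override
				public void mapPartition(Iterable<String> values, Collector<Long> out) {
					long count = 0;
					for (String ignored : values) {
						count++;
					}
					out.collect(count);
				}
			});

		partitionSizes.print();
	}
}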
Example #1
Source File: DataSetUtils.java    From Flink-CEPplus with Apache License 2.0
/**
 * Summarize a DataSet of Tuples by collecting single pass statistics for all columns.
 *
 * <p>Example usage:
 * <pre>
 * {@code
 * DataSet<Tuple3<Double, String, Boolean>> input = // [...]
 * Tuple3<NumericColumnSummary, StringColumnSummary, BooleanColumnSummary> summary = DataSetUtils.summarize(input);
 *
 * summary.f0.getStandardDeviation()
 * summary.f1.getMaxLength()
 * }
 * </pre>
 * @return the summary as a Tuple the same width as input rows
 */
public static <R extends Tuple, T extends Tuple> R summarize(DataSet<T> input) throws Exception {
	if (!input.getType().isTupleType()) {
		throw new IllegalArgumentException("summarize() is only implemented for DataSet's of Tuples");
	}
	final TupleTypeInfoBase<?> inType = (TupleTypeInfoBase<?>) input.getType();
	DataSet<TupleSummaryAggregator<R>> result = input.mapPartition(new MapPartitionFunction<T, TupleSummaryAggregator<R>>() {
		@Override
		public void mapPartition(Iterable<T> values, Collector<TupleSummaryAggregator<R>> out) throws Exception {
			TupleSummaryAggregator<R> aggregator = SummaryAggregatorFactory.create(inType);
			for (Tuple value : values) {
				aggregator.aggregate(value);
			}
			out.collect(aggregator);
		}
	}).reduce(new ReduceFunction<TupleSummaryAggregator<R>>() {
		@Override
		public TupleSummaryAggregator<R> reduce(TupleSummaryAggregator<R> agg1, TupleSummaryAggregator<R> agg2) throws Exception {
			agg1.combine(agg2);
			return agg1;
		}
	});
	return result.collect().get(0).result();
}
 
Example #2
Source File: DataSetUtil.java    From Alink with Apache License 2.0
/**
 * Count the number of records in the dataset.
 *
 * @return a dataset of one record, containing the number of records of {@code dataSet}.
 */
public static <T> DataSet<Long> count(DataSet<T> dataSet) {
    return dataSet
        .mapPartition(new MapPartitionFunction<T, Long>() {
            @Override
            public void mapPartition(Iterable<T> values, Collector<Long> out) throws Exception {
                long cnt = 0L;
                for (T v : values) {
                    cnt++;
                }
                out.collect(cnt);
            }
        })
        .name("count_dataset")
        .returns(Types.LONG)
        .reduce(new ReduceFunction<Long>() {
            @Override
            public Long reduce(Long value1, Long value2) throws Exception {
                return value1 + value2;
            }
        });
}
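The two stages mirror a classic map-reduce: mapPartition emits one partial count per partition instead of one record per element, and the reduce sums the partials, so only a handful of Long values ever cross the network. A hedged usage sketch, assuming an existing DataSet<Row> named rows:

DataSet<Long> total = DataSetUtil.count(rows); // a single-element DataSet holding the global count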
 
Example #3
Source File: BaseComQueue.java    From Alink with Apache License 2.0
private <T> void createRelationshipAndCachedData(DataSet<T> data, final String key) {
	final int localSessionId = sessionId;
	if (cacheDataRel == null) {
		cacheDataRel = clearObjs(
			BatchOperator
				.getExecutionEnvironmentFromDataSets(data)
				.fromElements(new byte[0])
				.mapPartition(new MapPartitionFunction<byte[], byte[]>() {
					@Override
					public void mapPartition(Iterable<byte[]> values, Collector<byte[]> out) throws Exception {
						//pass
					}
				})
		);
	}

	DataSet<Tuple2<Integer, Long>> rowCount = DataSetUtils.countElementsPerPartition(data);

	cacheDataRel = data.mapPartition(new PutCachedData<T>(key, localSessionId))
		.withBroadcastSet(cacheDataRel, "rel")
		.withBroadcastSet(rowCount, "rowCount")
		.name("cachedDataRel@" + key);

	cacheDataObjNames.add(key);
}
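DataSetUtils.countElementsPerPartition is a stock Flink utility that is itself built on mapPartition. A sketch of equivalent logic, hedged, using a RichMapPartitionFunction so the subtask index is available:

DataSet<Tuple2<Integer, Long>> perPartition = data.mapPartition(
	new RichMapPartitionFunction<T, Tuple2<Integer, Long>>() {
		@Override
		public void mapPartition(Iterable<T> values, Collector<Tuple2<Integer, Long>> out) {
			long counter = 0;
			for (T value : values) {
				counter++;
			}
			// pair the partition's subtask index with its record count
			out.collect(new Tuple2<>(getRuntimeContext().getIndexOfThisSubtask(), counter));
		}
	});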
 
Example #4
Source File: BaseComQueue.java    From Alink with Apache License 2.0
private DataSet<byte[]> clearObjs(DataSet<byte[]> raw) {
	final int localSessionId = sessionId;
	DataSet<byte[]> clear = expandDataSet2MaxParallelism(
		BatchOperator
			.getExecutionEnvironmentFromDataSets(raw)
			.fromElements(0))
		.mapPartition(new MapPartitionFunction<Integer, byte[]>() {
			@Override
			public void mapPartition(Iterable<Integer> values, Collector<byte[]> out) {
				SessionSharedObjs.clear(localSessionId);
			}
		});
	return raw
		.map(new MapFunction<byte[], byte[]>() {
			@Override
			public byte[] map(byte[] value) {
				return value;
			}
		})
		.withBroadcastSet(clear, "barrier")
		.name("clearReturn");

}
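The identity map exists only to carry the broadcast dependency: a DataSet operator cannot start until all of its broadcast inputs have been fully produced, so the side effect inside the empty mapPartition (SessionSharedObjs.clear) is guaranteed to run before "clearReturn" emits anything. Stripped of the Alink plumbing, the barrier idiom looks roughly like this (env and payload are assumed):

DataSet<byte[]> barrier = env.fromElements(0)
	.mapPartition(new MapPartitionFunction<Integer, byte[]>() {
		@Override
		public void mapPartition(Iterable<Integer> values, Collector<byte[]> out) {
			// side effect goes here; nothing is emitted
		}
	});

DataSet<byte[]> gated = payload
	.map(new MapFunction<byte[], byte[]>() {
		@Override
		public byte[] map(byte[] value) {
			return value; // identity, but blocked on the "barrier" broadcast set
		}
	})
	.withBroadcastSet(barrier, "barrier");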
 
Example #5
Source File: BaseComQueue.java    From Alink with Apache License 2.0
private DataSet<byte[]> loopStartDataSet(ExecutionEnvironment env) {
	MapPartitionOperator<Integer, byte[]> initial = env
		.fromElements(1)
		.rebalance()
		.mapPartition(new MapPartitionFunction<Integer, byte[]>() {
			@Override
			public void mapPartition(Iterable<Integer> values, Collector<byte[]> out) {
				//pass
			}
		}).name("iterInitialize");

	if (cacheDataRel != null) {
		initial = initial.withBroadcastSet(cacheDataRel, "rel");
	}

	return initial;
}
 
Example #6
Source File: FpGrowthBatchOp.java    From Alink with Apache License 2.0
/**
 * Count the number of records in the dataset.
 *
 * @return a dataset of one record, containing the number of records of {@code dataSet}.
 */
private static <T> DataSet<Long> count(DataSet<T> dataSet) {
    return dataSet
        .mapPartition(new MapPartitionFunction<T, Long>() {
            @Override
            public void mapPartition(Iterable<T> values, Collector<Long> out) throws Exception {
                long cnt = 0L;
                for (T v : values) {
                    cnt++;
                }
                out.collect(cnt);
            }
        })
        .name("count_dataset")
        .returns(Types.LONG)
        .reduce(new ReduceFunction<Long>() {
            @Override
            public Long reduce(Long value1, Long value2) throws Exception {
                return value1 + value2;
            }
        });
}
 
Example #7
Source File: PartitionITCase.java    From flink with Apache License 2.0
@Test
public void testRangePartitionerOnSequenceData() throws Exception {
	final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
	DataSource<Long> dataSource = env.generateSequence(0, 10000);
	KeySelector<Long, Long> keyExtractor = new ObjectSelfKeySelector();

	MapPartitionFunction<Long, Tuple2<Long, Long>> minMaxSelector = new MinMaxSelector<>(new LongComparator(true));

	Comparator<Tuple2<Long, Long>> tuple2Comparator = new Tuple2Comparator(new LongComparator(true));

	List<Tuple2<Long, Long>> collected = dataSource.partitionByRange(keyExtractor).mapPartition(minMaxSelector).collect();
	Collections.sort(collected, tuple2Comparator);

	long previousMax = -1;
	for (Tuple2<Long, Long> tuple2 : collected) {
		if (previousMax == -1) {
			previousMax = tuple2.f1;
		} else {
			long currentMin = tuple2.f0;
			assertTrue(tuple2.f0 < tuple2.f1);
			assertEquals(previousMax + 1, currentMin);
			previousMax = tuple2.f1;
		}
	}
}
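MinMaxSelector comes from Flink's test sources and is not reproduced on this page. A plausible sketch, assuming it reduces each partition to its (min, max) pair under the supplied Comparator, after which the sorted pairs must form contiguous, non-overlapping ranges:

public class MinMaxSelector<T> implements MapPartitionFunction<T, Tuple2<T, T>> {

	private final Comparator<T> comparator; // must be Serializable in practice

	public MinMaxSelector(Comparator<T> comparator) {
		this.comparator = comparator;
	}

	@Override
	public void mapPartition(Iterable<T> values, Collector<Tuple2<T, T>> out) {
		Iterator<T> it = values.iterator();
		if (!it.hasNext()) {
			return; // empty partition: emit nothing
		}
		T min = it.next();
		T max = min;
		while (it.hasNext()) {
			T value = it.next();
			if (comparator.compare(value, min) < 0) {
				min = value;
			}
			if (comparator.compare(value, max) > 0) {
				max = value;
			}
		}
		out.collect(new Tuple2<>(min, max));
	}
}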
 
Example #8
Source File: MapPartitionOperatorBase.java    From flink with Apache License 2.0
@Override
protected List<OUT> executeOnCollections(List<IN> inputData, RuntimeContext ctx, ExecutionConfig executionConfig) throws Exception {
	MapPartitionFunction<IN, OUT> function = this.userFunction.getUserCodeObject();
	
	FunctionUtils.setFunctionRuntimeContext(function, ctx);
	FunctionUtils.openFunction(function, this.parameters);
	
	ArrayList<OUT> result = new ArrayList<OUT>(inputData.size() / 4);

	TypeSerializer<IN> inSerializer = getOperatorInfo().getInputType().createSerializer(executionConfig);
	TypeSerializer<OUT> outSerializer = getOperatorInfo().getOutputType().createSerializer(executionConfig);

	CopyingIterator<IN> source = new CopyingIterator<IN>(inputData.iterator(), inSerializer);
	CopyingListCollector<OUT> resultCollector = new CopyingListCollector<OUT>(result, outSerializer);

	function.mapPartition(source, resultCollector);

	result.trimToSize();
	FunctionUtils.closeFunction(function);
	return result;
}
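The copying iterator and collector exist so that collection-based (local) execution sees the same serialization and object-reuse behavior as a real cluster run. A plausible sketch of the collector side, assuming it deep-copies every record through the TypeSerializer:

public class CopyingListCollector<T> implements Collector<T> {

	private final List<T> list;
	private final TypeSerializer<T> serializer;

	public CopyingListCollector(List<T> list, TypeSerializer<T> serializer) {
		this.list = list;
		this.serializer = serializer;
	}

	@Override
	public void collect(T record) {
		list.add(serializer.copy(record)); // defensive deep copy, as on a cluster
	}

	@Override
	public void close() {
		// nothing to release
	}
}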
 
Example #9
Source File: MapPartitionOperator.java    From flink with Apache License 2.0
@Override
protected MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>> translateToDataFlow(Operator<IN> input) {

	String name = getName() != null ? getName() : "MapPartition at " + defaultName;
	// create operator
	MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>> po = new MapPartitionOperatorBase<IN, OUT, MapPartitionFunction<IN, OUT>>(function, new UnaryOperatorInformation<IN, OUT>(getInputType(), getResultType()), name);
	// set input
	po.setInput(input);
	// set parallelism
	if (this.getParallelism() > 0) {
		// use specified parallelism
		po.setParallelism(this.getParallelism());
	} else {
		// if no parallelism has been specified, use parallelism of input operator to enable chaining
		po.setParallelism(input.getParallelism());
	}

	return po;
}
 
Example #10
Source File: TypeExtractor.java    From flink with Apache License 2.0
@PublicEvolving
public static <IN, OUT> TypeInformation<OUT> getMapPartitionReturnTypes(MapPartitionFunction<IN, OUT> mapPartitionInterface, TypeInformation<IN> inType,
		String functionName, boolean allowMissing)
{
	return getUnaryOperatorReturnType(
		(Function) mapPartitionInterface,
		MapPartitionFunction.class,
		0,
		1,
		new int[]{1, 0},
		inType,
		functionName,
		allowMissing);
}
 
Example #11
Source File: Preprocessing.java    From Alink with Apache License 2.0
public static BatchOperator<?> generateQuantileDiscretizerModel(
	BatchOperator<?> input,
	Params params) {
	String[] continuousColNames = ArrayUtils.removeElements(
		params.get(HasFeatureCols.FEATURE_COLS),
		params.get(HasCategoricalCols.CATEGORICAL_COLS)
	);
	BatchOperator<?> quantileDiscretizerModel;
	if (continuousColNames != null && continuousColNames.length > 0) {
		quantileDiscretizerModel = sample(input, params)
			.linkTo(new QuantileDiscretizerTrainBatchOp()
				.setMLEnvironmentId(input.getMLEnvironmentId())
				.setSelectedCols(continuousColNames)
				.setNumBuckets(params.get(HasMaxBins.MAX_BINS))
			);
	} else {
		QuantileDiscretizerModelDataConverter emptyModel = new QuantileDiscretizerModelDataConverter();

		quantileDiscretizerModel = new DataSetWrapperBatchOp(
			MLEnvironmentFactory
				.get(input.getMLEnvironmentId())
				.getExecutionEnvironment()
				.fromElements(1)
				.mapPartition(new MapPartitionFunction<Integer, Row>() {
					@Override
					public void mapPartition(Iterable<Integer> values, Collector<Row> out) throws Exception {
						//pass
					}
				}),
			emptyModel.getModelSchema().getFieldNames(),
			emptyModel.getModelSchema().getFieldTypes()
		).setMLEnvironmentId(input.getMLEnvironmentId());
	}

	return quantileDiscretizerModel;
}
 
Example #12
Source File: Preprocessing.java    From Alink with Apache License 2.0
public static BatchOperator<?> generateStringIndexerModel(BatchOperator<?> input, Params params) {
	String[] categoricalColNames = null;
	if (params.contains(HasCategoricalCols.CATEGORICAL_COLS)) {
		categoricalColNames = params.get(HasCategoricalCols.CATEGORICAL_COLS);
	}
	BatchOperator<?> stringIndexerModel;
	if (categoricalColNames == null || categoricalColNames.length == 0) {
		MultiStringIndexerModelDataConverter emptyModel = new MultiStringIndexerModelDataConverter();

		stringIndexerModel = new DataSetWrapperBatchOp(
			MLEnvironmentFactory
				.get(input.getMLEnvironmentId())
				.getExecutionEnvironment()
				.fromElements(1)
				.mapPartition(new MapPartitionFunction<Integer, Row>() {
					@Override
					public void mapPartition(Iterable<Integer> values, Collector<Row> out) throws Exception {
						//pass
					}
				}),
			emptyModel.getModelSchema().getFieldNames(),
			emptyModel.getModelSchema().getFieldTypes()
		).setMLEnvironmentId(input.getMLEnvironmentId());
	} else {
		stringIndexerModel = new MultiStringIndexerTrainBatchOp()
			.setMLEnvironmentId(input.getMLEnvironmentId())
			.setSelectedCols(categoricalColNames)
			.linkFrom(input);
	}

	return stringIndexerModel;
}
 
Example #13
Source File: Preprocessing.java    From Alink with Apache License 2.0
public static DataSet<Object[]> generateLabels(
	BatchOperator<?> input,
	Params params,
	boolean isRegression) {
	DataSet<Object[]> labels;
	if (!isRegression) {
		final String labelColName = params.get(HasLabelCol.LABEL_COL);
		DataSet<Row> labelDataSet = select(input, labelColName).getDataSet();

		labels = distinctLabels(labelDataSet
			.map(new MapFunction<Row, Object>() {
				@Override
				public Object map(Row value) throws Exception {
					return value.getField(0);
				}
			})
		);

	} else {
		labels = MLEnvironmentFactory.get(input.getMLEnvironmentId()).getExecutionEnvironment().fromElements(1)
			.mapPartition(new MapPartitionFunction<Integer, Object[]>() {
				@Override
				public void mapPartition(Iterable<Integer> values, Collector<Object[]> out) throws Exception {
					//pass
				}
			});
	}

	return labels;
}
 
Example #14
Source File: DocCountVectorizerTrainBatchOp.java    From Alink with Apache License 2.0
@Override
public DocCountVectorizerTrainBatchOp linkFrom(BatchOperator<?>... inputs) {
    BatchOperator<?> in = checkAndGetFirst(inputs);
    DataSet<DocCountVectorizerModelData> resDocCountModel = generateDocCountModel(getParams(), in);

    DataSet<Row> res = resDocCountModel.mapPartition(new MapPartitionFunction<DocCountVectorizerModelData, Row>() {
        @Override
        public void mapPartition(Iterable<DocCountVectorizerModelData> modelDataList, Collector<Row> collector) {
            new DocCountVectorizerModelDataConverter().save(modelDataList.iterator().next(), collector);
        }
    });
    this.setOutput(res, new DocCountVectorizerModelDataConverter().getModelSchema());

    return this;
}
 
Example #15
Source File: QuantileDiscretizerTrainBatchOp.java    From Alink with Apache License 2.0
public static DataSet<Row> transformFeatureBorderToModel(DataSet<FeatureBorder> featureBorderDataSet) {
	return featureBorderDataSet.mapPartition(new MapPartitionFunction<FeatureBorder, Row>() {
		@Override
		public void mapPartition(Iterable<FeatureBorder> values, Collector<Row> out) throws Exception {
			transformFeatureBorderToModel(values, out);
		}
	}).setParallelism(1);
}
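Setting the operator's parallelism to 1 funnels every FeatureBorder into a single partition, so the one mapPartition call sees the complete set; this is the standard way to turn mapPartition into a global aggregation. A hedged sketch of the general pattern:

DataSet<Long> globalCount = rows
	.mapPartition(new MapPartitionFunction<Row, Long>() {
		@Override
		public void mapPartition(Iterable<Row> values, Collector<Long> out) {
			long n = 0;
			for (Row ignored : values) {
				n++;
			}
			out.collect(n); // exactly one result, since there is one partition
		}
	})
	.setParallelism(1);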
 
Example #16
Source File: DataSetUtil.java    From Alink with Apache License 2.0
/**
 * Returns an empty dataset of the same type as {@code dataSet}.
 */
public static <T> DataSet<T> empty(DataSet<T> dataSet) {
    return dataSet
        .mapPartition(new MapPartitionFunction<T, T>() {
            @Override
            public void mapPartition(Iterable<T> values, Collector<T> out) throws Exception {
            }
        })
        .returns(dataSet.getType());
}
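The explicit .returns(dataSet.getType()) call is what makes this generic helper work: the anonymous function's output type T is erased at compile time, so the result type is re-attached from the input. A hedged usage sketch, assuming an existing DataSet<Row> named rows:

DataSet<Row> noRows = DataSetUtil.empty(rows); // same type information, zero records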
 
Example #17
Source File: LambdaExtractionTest.java    From flink with Apache License 2.0
@Test
public void testMapPartitionLambda() {
	MapPartitionFunction<Tuple2<Tuple1<Integer>, Boolean>, Tuple2<Tuple1<Integer>, String>> f = (i, o) -> {};

	TypeInformation<?> ti = TypeExtractor.getMapPartitionReturnTypes(f, NESTED_TUPLE_BOOLEAN_TYPE, null, true);
	if (!(ti instanceof MissingTypeInfo)) {
		assertTrue(ti.isTupleType());
		assertEquals(2, ti.getArity());
		assertTrue(((TupleTypeInfo<?>) ti).getTypeAt(0).isTupleType());
		assertEquals(((TupleTypeInfo<?>) ti).getTypeAt(1), BasicTypeInfo.STRING_TYPE_INFO);
	}
}
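The MissingTypeInfo guard is needed because Java erases a lambda's generic parameters, so type extraction can legitimately fail depending on the compiler. In application code the usual remedy is an explicit hint; a hedged sketch, assuming a DataSet<Long> named input:

DataSet<Tuple2<Long, Long>> result = input
	.mapPartition((Iterable<Long> values, Collector<Tuple2<Long, Long>> out) -> {
		// body omitted in this sketch
	})
	.returns(Types.TUPLE(Types.LONG, Types.LONG));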
 
Example #18
Source File: MapPartitionOperator.java    From flink with Apache License 2.0
public MapPartitionOperator(DataSet<IN> input, TypeInformation<OUT> resultType, MapPartitionFunction<IN, OUT> function, String defaultName) {
	super(input, resultType);

	this.function = function;
	this.defaultName = defaultName;
}