Java Code Examples for org.apache.flink.api.java.DataSet#getType()

The following examples show how to use org.apache.flink.api.java.DataSet#getType(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: AggregateOperator.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Non grouped aggregation.
 *
 * <p>Creates the first aggregation on a plain (ungrouped) data set. The input's type
 * information is reused as the result type.
 *
 * @param input the data set to aggregate; checked with {@code Preconditions.checkNotNull}
 * @param function the aggregation to apply; checked with {@code Preconditions.checkNotNull}
 * @param field position of the tuple field to aggregate
 * @param aggregateLocationName stored as-is in {@code aggregateLocationName}
 * @throws InvalidProgramException if the input type is not a tuple type
 * @throws IllegalArgumentException if {@code field} is negative or not below the tuple arity
 */
public AggregateOperator(DataSet<IN> input, Aggregations function, int field, String aggregateLocationName) {
	super(Preconditions.checkNotNull(input), input.getType());
	Preconditions.checkNotNull(function);

	this.aggregateLocationName = aggregateLocationName;

	// field positions only make sense for tuple types
	if (!input.getType().isTupleType()) {
		throw new InvalidProgramException("Aggregating on field positions is only possible on tuple data types.");
	}
	TupleTypeInfoBase<?> tupleType = (TupleTypeInfoBase<?>) input.getType();

	boolean fieldInRange = field >= 0 && field < tupleType.getArity();
	if (!fieldInRange) {
		throw new IllegalArgumentException("Aggregation field position is out of range.");
	}

	// build the aggregation function for the class of the selected field
	AggregationFunctionFactory functionFactory = function.getFactory();
	AggregationFunction<?> aggregation =
			functionFactory.createAggregationFunction(tupleType.getTypeAt(field).getTypeClass());

	// this is the first aggregation operator after a regular data set (non grouped aggregation)
	this.grouping = null;
	this.aggregationFunctions.add(aggregation);
	this.fields.add(field);
}
 
Example 2
Source File: DataSetUtils.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Summarize a DataSet of Tuples by collecting single pass statistics for all columns.
 *
 * <p>Each partition folds its tuples into one {@code TupleSummaryAggregator}; the
 * per-partition aggregators are then combined pairwise, and the result of the first
 * collected aggregator is returned.
 *
 * <p>Example usage:
 * <pre>
 * {@code
 * DataSet<Tuple3<Double, String, Boolean>> input = // [...]
 * Tuple3<NumericColumnSummary,StringColumnSummary, BooleanColumnSummary> summary = DataSetUtils.summarize(input)
 *
 * summary.f0.getStandardDeviation()
 * summary.f1.getMaxLength()
 * }
 * </pre>
 *
 * @param input the data set to summarize; its type must be a tuple type
 * @return the summary as a Tuple the same width as input rows
 * @throws IllegalArgumentException if the input type is not a tuple type
 */
public static <R extends Tuple, T extends Tuple> R summarize(DataSet<T> input) throws Exception {
	if (!input.getType().isTupleType()) {
		throw new IllegalArgumentException("summarize() is only implemented for DataSet's of Tuples");
	}
	final TupleTypeInfoBase<?> inType = (TupleTypeInfoBase<?>) input.getType();

	// stage 1: one aggregator per partition, fed with every tuple of that partition
	DataSet<TupleSummaryAggregator<R>> perPartition = input.mapPartition(new MapPartitionFunction<T, TupleSummaryAggregator<R>>() {
		@Override
		public void mapPartition(Iterable<T> values, Collector<TupleSummaryAggregator<R>> out) throws Exception {
			TupleSummaryAggregator<R> partitionAggregator = SummaryAggregatorFactory.create(inType);
			for (T tuple : values) {
				partitionAggregator.aggregate(tuple);
			}
			out.collect(partitionAggregator);
		}
	});

	// stage 2: merge the per-partition aggregators into the left-hand one
	DataSet<TupleSummaryAggregator<R>> merged = perPartition.reduce(new ReduceFunction<TupleSummaryAggregator<R>>() {
		@Override
		public TupleSummaryAggregator<R> reduce(TupleSummaryAggregator<R> left, TupleSummaryAggregator<R> right) throws Exception {
			left.combine(right);
			return left;
		}
	});

	return merged.collect().get(0).result();
}
 
Example 3
Source File: ProjectOperator.java    From flink with Apache License 2.0 6 votes vote down vote up
public Projection(DataSet<T> ds, int[] fieldIndexes) {

			if (!(ds.getType() instanceof TupleTypeInfo)) {
				throw new UnsupportedOperationException("project() can only be applied to DataSets of Tuples.");
			}

			if (fieldIndexes.length == 0) {
				throw new IllegalArgumentException("project() needs to select at least one (1) field.");
			} else if (fieldIndexes.length > Tuple.MAX_ARITY - 1) {
				throw new IllegalArgumentException(
					"project() may select only up to (" + (Tuple.MAX_ARITY - 1) + ") fields.");
			}

			int maxFieldIndex = ds.getType().getArity();
			for (int fieldIndexe : fieldIndexes) {
				Preconditions.checkElementIndex(fieldIndexe, maxFieldIndex);
			}

			this.ds = ds;
			this.fieldIndexes = fieldIndexes;
		}
 
Example 4
Source File: PartitionOperator.java    From flink with Apache License 2.0 6 votes vote down vote up
private <P> PartitionOperator(DataSet<T> input, PartitionMethod pMethod, Keys<T> pKeys, Partitioner<P> customPartitioner,
		TypeInformation<P> partitionerTypeInfo, DataDistribution distribution, String partitionLocationName) {
	super(input, input.getType());

	Preconditions.checkNotNull(pMethod);
	Preconditions.checkArgument(pKeys != null || pMethod == PartitionMethod.REBALANCE, "Partitioning requires keys");
	Preconditions.checkArgument(pMethod != PartitionMethod.CUSTOM || customPartitioner != null, "Custom partioning requires a partitioner.");
	Preconditions.checkArgument(distribution == null || pMethod == PartitionMethod.RANGE, "Customized data distribution is only neccessary for range partition.");

	if (distribution != null) {
		Preconditions.checkArgument(pKeys.getNumberOfKeyFields() <= distribution.getNumberOfFields(), "The distribution must provide at least as many fields as flat key fields are specified.");
		Preconditions.checkArgument(Arrays.equals(pKeys.getKeyFieldTypes(), Arrays.copyOfRange(distribution.getKeyTypes(), 0, pKeys.getNumberOfKeyFields())),
				"The types of the flat key fields must be equal to the types of the fields of the distribution.");
	}

	if (customPartitioner != null) {
		pKeys.validateCustomPartitioner(customPartitioner, partitionerTypeInfo);
	}

	this.pMethod = pMethod;
	this.pKeys = pKeys;
	this.partitionLocationName = partitionLocationName;
	this.customPartitioner = customPartitioner;
	this.distribution = distribution;
}
 
Example 5
Source File: PythonPlanBinder.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a co-group of the data sets registered under {@code info.parentID} and
 * {@code info.otherID} and registers the result under {@code info.setID}.
 *
 * @param info operation description supplying the set ids, key fields, name and parallelism
 * @param type type information of the co-group result
 */
private <IN1, IN2, OUT> void createCoGroupOperation(PythonOperationInfo info, TypeInformation<OUT> type) {
	DataSet<IN1> first = sets.getDataSet(info.parentID);
	DataSet<IN2> second = sets.getDataSet(info.otherID);

	// key fields are given by name and resolved against each input's type
	String[] firstKeyFields = info.keys1.toArray(new String[info.keys1.size()]);
	Keys.ExpressionKeys<IN1> firstKeys = new Keys.ExpressionKeys<>(firstKeyFields, first.getType());
	String[] secondKeyFields = info.keys2.toArray(new String[info.keys2.size()]);
	Keys.ExpressionKeys<IN2> secondKeys = new Keys.ExpressionKeys<>(secondKeyFields, second.getType());

	PythonCoGroup<IN1, IN2, OUT> coGroupFunction = new PythonCoGroup<>(operatorConfig, info.envID, info.setID, type);
	CoGroupRawOperator<IN1, IN2, OUT> coGroup =
			new CoGroupRawOperator<>(first, second, firstKeys, secondKeys, coGroupFunction, type, info.name);
	sets.add(info.setID, coGroup.setParallelism(info.parallelism));
}
 
Example 6
Source File: SortPartitionOperator.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the operator with empty key and order lists. The input's type information
 * is reused as the result type.
 *
 * @param dataSet the data set whose partitions are sorted
 * @param sortLocationName stored as-is in {@code sortLocationName}
 */
private SortPartitionOperator(DataSet<T> dataSet, String sortLocationName) {
	super(dataSet, dataSet.getType());

	this.sortLocationName = sortLocationName;
	// both lists start empty
	this.keys = new ArrayList<>();
	this.orders = new ArrayList<>();
}
 
Example 7
Source File: FlinkBatchTransformTranslators.java    From flink-dataflow with Apache License 2.0 5 votes vote down vote up
/**
 * Translates a {@code GroupByKey.GroupByKeyOnly} transform into a group-reduce on the
 * input data set, grouped by the {@code "key"} field, and registers the result as the
 * transform's output data set.
 */
@Override
public void translateNode(GroupByKey.GroupByKeyOnly<K, V> transform, FlinkBatchTranslationContext context) {
	DataSet<KV<K, V>> input = context.getInputDataSet(context.getInput(transform));
	GroupReduceFunction<KV<K, V>, KV<K, Iterable<V>>> listAggregation = new FlinkKeyedListAggregationFunction<>();

	TypeInformation<KV<K, Iterable<V>>> outputType = context.getTypeInfo(context.getOutput(transform));

	// group on the "key" field of the KV elements
	Keys.ExpressionKeys<KV<K, V>> keyField = new Keys.ExpressionKeys<>(new String[]{"key"}, input.getType());
	Grouping<KV<K, V>> groupedByKey = new UnsortedGrouping<>(input, keyField);

	GroupReduceOperator<KV<K, V>, KV<K, Iterable<V>>> grouped =
			new GroupReduceOperator<>(groupedByKey, outputType, listAggregation, transform.getName());
	context.setOutputDataSet(context.getOutput(transform), grouped);
}
 
Example 8
Source File: DeltaIteration.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Stores the initial solution set and workset and creates a placeholder for each,
 * typed with the corresponding data set's type information.
 *
 * @param context environment handed to both placeholders
 * @param type not read by this constructor body
 * @param solutionSet the initial solution set
 * @param workset the initial workset
 * @param keys stored as-is in {@code keys}
 * @param maxIterations stored as-is in {@code maxIterations}
 */
public DeltaIteration(ExecutionEnvironment context, TypeInformation<ST> type, DataSet<ST> solutionSet, DataSet<WT> workset, Keys<ST> keys, int maxIterations) {
	this.initialSolutionSet = solutionSet;
	this.initialWorkset = workset;

	// the solution set placeholder receives a reference to this iteration; order is kept as-is
	this.solutionSetPlaceholder = new SolutionSetPlaceHolder<>(context, solutionSet.getType(), this);
	this.worksetPlaceholder = new WorksetPlaceHolder<>(context, workset.getType());

	this.keys = keys;
	this.maxIterations = maxIterations;
}
 
Example 9
Source File: DistinctOperator.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a distinct operator. The input's type information is reused as the result type.
 *
 * @param input the data set to de-duplicate
 * @param keys the keys to use; when null, expression keys built from the
 *        input type are used instead (distinction is done on all fields)
 * @param distinctLocationName stored as-is in {@code distinctLocationName}
 */
public DistinctOperator(DataSet<T> input, Keys<T> keys, String distinctLocationName) {
	super(input, input.getType());

	this.distinctLocationName = distinctLocationName;

	// if keys is null distinction is done on all fields
	this.keys = (keys != null) ? keys : new Keys.ExpressionKeys<>(input.getType());
}
 
Example 10
Source File: DeltaIteration.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Stores the initial solution set and workset and creates a placeholder for each,
 * typed with the corresponding data set's type information.
 *
 * @param context environment handed to both placeholders
 * @param type not read by this constructor body
 * @param solutionSet the initial solution set
 * @param workset the initial workset
 * @param keys stored as-is in {@code keys}
 * @param maxIterations stored as-is in {@code maxIterations}
 */
public DeltaIteration(ExecutionEnvironment context, TypeInformation<ST> type, DataSet<ST> solutionSet, DataSet<WT> workset, Keys<ST> keys, int maxIterations) {
	this.initialSolutionSet = solutionSet;
	this.initialWorkset = workset;

	// the solution set placeholder receives a reference to this iteration; order is kept as-is
	this.solutionSetPlaceholder = new SolutionSetPlaceHolder<>(context, solutionSet.getType(), this);
	this.worksetPlaceholder = new WorksetPlaceHolder<>(context, workset.getType());

	this.keys = keys;
	this.maxIterations = maxIterations;
}
 
Example 11
Source File: SortPartitionOperator.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the operator with empty key and order lists. The input's type information
 * is reused as the result type.
 *
 * @param dataSet the data set whose partitions are sorted
 * @param sortLocationName stored as-is in {@code sortLocationName}
 */
private SortPartitionOperator(DataSet<T> dataSet, String sortLocationName) {
	super(dataSet, dataSet.getType());

	this.sortLocationName = sortLocationName;
	// both lists start empty
	this.keys = new ArrayList<>();
	this.orders = new ArrayList<>();
}
 
Example 12
Source File: DataSetUtils.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Generate a sample of DataSet which contains fixed size elements.
 *
 * <p>Sampling runs in two steps: {@code SampleInPartition} is applied per partition, then
 * {@code SampleInCoordinator} is applied to the combined output in a group-reduce.
 *
 * <p><strong>NOTE:</strong> Sample with fixed size is not as efficient as sample with fraction, use sample with
 * fraction unless you need exact precision.
 *
 * @param input           The DataSet to sample from.
 * @param withReplacement Whether element can be selected more than once.
 * @param numSamples      The expected sample size.
 * @param seed            Random number generator seed.
 * @return The sampled DataSet
 */
public static <T> DataSet<T> sampleWithSize(
	DataSet<T> input,
	final boolean withReplacement,
	final int numSamples,
	final long seed) {

	// step 1: sample inside every partition (raw operator type kept as in the original code)
	SampleInPartition<T> partitionSampler = new SampleInPartition<>(withReplacement, numSamples, seed);
	MapPartitionOperator partitionSamples = input.mapPartition(partitionSampler);

	// step 2: There is no previous group, so the parallelism of GroupReduceOperator is always 1.
	String location = Utils.getCallLocationName();
	SampleInCoordinator<T> coordinatorSampler = new SampleInCoordinator<>(withReplacement, numSamples, seed);
	return new GroupReduceOperator<>(partitionSamples, input.getType(), coordinatorSampler, location);
}
 
Example 13
Source File: ReduceOperator.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * This is the case for a reduce-all case (in contrast to the reduce-per-group case).
 *
 * <p>No grouper and no hint are set; the input's type information is reused as the result type.
 *
 * @param input the data set to reduce
 * @param function the reduce function, stored as-is in {@code function}
 * @param defaultName stored as-is in {@code defaultName}
 */
public ReduceOperator(DataSet<IN> input, ReduceFunction<IN> function, String defaultName) {
	super(input, input.getType());

	this.defaultName = defaultName;
	this.function = function;

	// reduce-all: neither a grouping nor a hint applies
	this.grouper = null;
	this.hint = null;
}
 
Example 14
Source File: DataSetUtils.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Generate a sample of DataSet which contains fixed size elements.
 *
 * <p>Sampling runs in two steps: {@code SampleInPartition} is applied per partition, then
 * {@code SampleInCoordinator} is applied to the combined output in a group-reduce.
 *
 * <p><strong>NOTE:</strong> Sample with fixed size is not as efficient as sample with fraction, use sample with
 * fraction unless you need exact precision.
 *
 * @param input           The DataSet to sample from.
 * @param withReplacement Whether element can be selected more than once.
 * @param numSamples      The expected sample size.
 * @param seed            Random number generator seed.
 * @return The sampled DataSet
 */
public static <T> DataSet<T> sampleWithSize(
	DataSet<T> input,
	final boolean withReplacement,
	final int numSamples,
	final long seed) {

	// step 1: sample inside every partition (raw operator type kept as in the original code)
	SampleInPartition<T> partitionSampler = new SampleInPartition<>(withReplacement, numSamples, seed);
	MapPartitionOperator partitionSamples = input.mapPartition(partitionSampler);

	// step 2: There is no previous group, so the parallelism of GroupReduceOperator is always 1.
	String location = Utils.getCallLocationName();
	SampleInCoordinator<T> coordinatorSampler = new SampleInCoordinator<>(withReplacement, numSamples, seed);
	return new GroupReduceOperator<>(partitionSamples, input.getType(), coordinatorSampler, location);
}
 
Example 15
Source File: DataSetUtils.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Range-partitions a DataSet on the specified fields.
 *
 * @param input the data set to partition
 * @param distribution the data distribution handed to the partition operator
 * @param fields the key fields, resolved as expression keys against the input type
 * @return a partition operator using {@code PartitionMethod.RANGE}
 */
public static <T> PartitionOperator<T> partitionByRange(DataSet<T> input, DataDistribution distribution, String... fields) {
	Keys.ExpressionKeys<T> rangeKeys = new Keys.ExpressionKeys<>(fields, input.getType());
	String location = Utils.getCallLocationName();
	return new PartitionOperator<>(input, PartitionOperatorBase.PartitionMethod.RANGE, rangeKeys, distribution, location);
}
 
Example 16
Source File: FilterOperator.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a filter operator. The input's type information is reused as the result type.
 *
 * @param input the data set to filter
 * @param function the filter function, stored as-is in {@code function}
 * @param defaultName stored as-is in {@code defaultName}
 */
public FilterOperator(DataSet<T> input, FilterFunction<T> function, String defaultName) {
	super(input, input.getType());

	this.defaultName = defaultName;
	this.function = function;
}
 
Example 17
Source File: DataSetUtils.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Range-partitions a DataSet on the specified fields.
 *
 * @param input the data set to partition
 * @param distribution the data distribution handed to the partition operator
 * @param fields the key fields, resolved as expression keys against the input type
 * @return a partition operator using {@code PartitionMethod.RANGE}
 */
public static <T> PartitionOperator<T> partitionByRange(DataSet<T> input, DataDistribution distribution, String... fields) {
	Keys.ExpressionKeys<T> rangeKeys = new Keys.ExpressionKeys<>(fields, input.getType());
	String location = Utils.getCallLocationName();
	return new PartitionOperator<>(input, PartitionOperatorBase.PartitionMethod.RANGE, rangeKeys, distribution, location);
}
 
Example 18
Source File: CrossOperator.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Sets up a projection on exactly one of the two cross inputs.
 *
 * <p>Exactly one of the two index arrays must be non-null; it selects which input is
 * projected. An empty index array means the whole input is included as a single object,
 * recorded as field index {@code -1}.
 *
 * @param ds1 the first cross input
 * @param ds2 the second cross input
 * @param firstFieldIndexes field positions in the first input, or null
 * @param secondFieldIndexes field positions in the second input, or null
 * @param hint stored as-is in {@code hint}
 * @throws IllegalArgumentException if both or neither index array is given, if indexes are
 *         given for a non-tuple input, or if more than 22 indexes are given
 */
public CrossProjection(DataSet<I1> ds1, DataSet<I2> ds2, int[] firstFieldIndexes, int[] secondFieldIndexes, CrossHint hint) {
	this.ds1 = ds1;
	this.ds2 = ds2;
	this.hint = hint;

	// a non-tuple input counts as a single field
	final boolean isFirstTuple = ds1.getType() instanceof TupleTypeInfo;
	numFieldsDs1 = isFirstTuple ? ((TupleTypeInfo<?>) ds1.getType()).getArity() : 1;

	final boolean isSecondTuple = ds2.getType() instanceof TupleTypeInfo;
	numFieldsDs2 = isSecondTuple ? ((TupleTypeInfo<?>) ds2.getType()).getArity() : 1;

	final boolean hasFirstIndexes = firstFieldIndexes != null;
	final boolean hasSecondIndexes = secondFieldIndexes != null;
	if (!hasFirstIndexes && !hasSecondIndexes) {
		throw new IllegalArgumentException("You must provide at least one field index array.");
	}
	if (hasFirstIndexes && hasSecondIndexes) {
		throw new IllegalArgumentException("You must provide at most one field index array.");
	}

	// exactly one index array is present from here on
	final boolean firstInput = hasFirstIndexes;
	this.fieldIndexes = firstInput ? firstFieldIndexes : secondFieldIndexes;
	final int selectedCount = this.fieldIndexes.length;

	// no indexes provided, treat tuple as regular object
	final boolean isTuple = selectedCount != 0 && (firstInput ? isFirstTuple : isSecondTuple);

	if (!isTuple && selectedCount != 0) {
		// field index provided for non-Tuple input
		throw new IllegalArgumentException("Input is not a Tuple. Call projectFirst() (or projectSecond()) without arguments to include it.");
	}
	if (selectedCount > 22) {
		throw new IllegalArgumentException("You may select only up to twenty-two (22) fields.");
	}

	if (!isTuple) {
		// whole input is included as one object
		this.isFieldInFirst = new boolean[]{firstInput};
		this.fieldIndexes = new int[]{-1};
	} else {
		this.isFieldInFirst = new boolean[selectedCount];

		// check field indexes and adapt to position in tuple
		final int maxFieldIndex = firstInput ? numFieldsDs1 : numFieldsDs2;
		for (int i = 0; i < selectedCount; i++) {
			Preconditions.checkElementIndex(this.fieldIndexes[i], maxFieldIndex);
			this.isFieldInFirst[i] = firstInput;
		}
	}
}
 
Example 19
Source File: DataSetUtils.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Range-partitions a DataSet using the specified key selector function.
 *
 * @param input the data set to partition
 * @param distribution the data distribution handed to the partition operator
 * @param keyExtractor the key selector; passed through {@code input.clean} before use
 * @return a partition operator using {@code PartitionMethod.RANGE}
 */
public static <T, K extends Comparable<K>> PartitionOperator<T> partitionByRange(DataSet<T> input, DataDistribution distribution, KeySelector<T, K> keyExtractor) {
	// key type is derived from the selector and the input type
	final TypeInformation<K> keyType = TypeExtractor.getKeySelectorTypes(keyExtractor, input.getType());

	Keys.SelectorFunctionKeys<T, K> selectorKeys =
			new Keys.SelectorFunctionKeys<>(input.clean(keyExtractor), input.getType(), keyType);
	String location = Utils.getCallLocationName();

	return new PartitionOperator<>(input, PartitionOperatorBase.PartitionMethod.RANGE, selectorKeys, distribution, location);
}
 
Example 20
Source File: EdgeList.java    From flink with Apache License 2.0 3 votes vote down vote up
/**
 * Check whether the edge type of the {@link DataSet} is {@link NullValue}.
 *
 * <p>The data set's type information is treated as a tuple type and the type at
 * position 2 is compared with {@code ValueTypeInfo.NULL_VALUE_TYPE_INFO}.
 *
 * @param edges data set for introspection
 * @param <T> graph ID type
 * @param <ET> edge value type
 * @return whether the edge type of the {@link DataSet} is {@link NullValue}
 */
private static <T, ET> boolean hasNullValueEdges(DataSet<Edge<T, ET>> edges) {
	TypeInformation<?> edgeType = edges.getType();
	// wildcard cast: only the type at position 2 is inspected, so no unchecked cast is needed
	TupleTypeInfo<?> edgeTupleType = (TupleTypeInfo<?>) edgeType;

	return edgeTupleType.getTypeAt(2).equals(ValueTypeInfo.NULL_VALUE_TYPE_INFO);
}