/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.api.dag; import org.apache.flink.annotation.Internal; import org.apache.flink.api.common.functions.InvalidTypesException; import org.apache.flink.api.common.operators.ResourceSpec; import org.apache.flink.api.common.operators.util.OperatorValidationUtils; import org.apache.flink.api.common.typeinfo.TypeInformation; import org.apache.flink.api.java.typeutils.MissingTypeInfo; import org.apache.flink.util.Preconditions; import javax.annotation.Nullable; import java.util.Collection; import static org.apache.flink.util.Preconditions.checkArgument; import static org.apache.flink.util.Preconditions.checkNotNull; /** * A {@code Transformation} represents the operation that creates a * DataStream. Every DataStream has an underlying * {@code Transformation} that is the origin of said DataStream. * * <p>API operations such as DataStream#map create * a tree of {@code Transformation}s underneath. When the stream program is to be executed * this graph is translated to a StreamGraph using StreamGraphGenerator. * * <p>A {@code Transformation} does not necessarily correspond to a physical operation * at runtime. Some operations are only logical concepts. Examples of this are union, * split/select data stream, partitioning. * * <p>The following graph of {@code Transformations}: * <pre>{@code * Source Source * + + * | | * v v * Rebalance HashPartition * + + * | | * | | * +------>Union<------+ * + * | * v * Split * + * | * v * Select * + * v * Map * + * | * v * Sink * }</pre> * * <p>Would result in this graph of operations at runtime: * <pre>{@code * Source Source * + + * | | * | | * +------->Map<-------+ * + * | * v * Sink * }</pre> * * <p>The information about partitioning, union, split/select end up being encoded in the edges * that connect the sources to the map operation. * * @param <T> The type of the elements that result from this {@code Transformation} */ @Internal public abstract class Transformation<T> { // Has to be equal to StreamGraphGenerator.UPPER_BOUND_MAX_PARALLELISM public static final int UPPER_BOUND_MAX_PARALLELISM = 1 << 15; public static final int DEFAULT_MANAGED_MEMORY_WEIGHT = 1; // This is used to assign a unique ID to every Transformation protected static Integer idCounter = 0; public static int getNewNodeId() { idCounter++; return idCounter; } protected final int id; protected String name; protected TypeInformation<T> outputType; // This is used to handle MissingTypeInfo. As long as the outputType has not been queried // it can still be changed using setOutputType(). Afterwards an exception is thrown when // trying to change the output type. protected boolean typeUsed; private int parallelism; /** * The maximum parallelism for this stream transformation. It defines the upper limit for * dynamic scaling and the number of key groups used for partitioned state. */ private int maxParallelism = -1; /** * The minimum resources for this stream transformation. It defines the lower limit for * dynamic resources resize in future plan. */ private ResourceSpec minResources = ResourceSpec.DEFAULT; /** * The preferred resources for this stream transformation. It defines the upper limit for * dynamic resource resize in future plan. */ private ResourceSpec preferredResources = ResourceSpec.DEFAULT; /** * This weight indicates how much this transformation relies on managed memory, so that * transformation highly relies on managed memory would be able to acquire more managed * memory in runtime (linear association). Note that it only works in cases of UNKNOWN * resources. */ private int managedMemoryWeight = DEFAULT_MANAGED_MEMORY_WEIGHT; /** * User-specified ID for this transformation. This is used to assign the * same operator ID across job restarts. There is also the automatically * generated {@link #id}, which is assigned from a static counter. That * field is independent from this. */ private String uid; private String userProvidedNodeHash; protected long bufferTimeout = -1; private String slotSharingGroup; @Nullable private String coLocationGroupKey; /** * Creates a new {@code Transformation} with the given name, output type and parallelism. * * @param name The name of the {@code Transformation}, this will be shown in Visualizations and the Log * @param outputType The output type of this {@code Transformation} * @param parallelism The parallelism of this {@code Transformation} */ public Transformation(String name, TypeInformation<T> outputType, int parallelism) { this.id = getNewNodeId(); this.name = Preconditions.checkNotNull(name); this.outputType = outputType; this.parallelism = parallelism; this.slotSharingGroup = null; } /** * Returns the unique ID of this {@code Transformation}. */ public int getId() { return id; } /** * Changes the name of this {@code Transformation}. */ public void setName(String name) { this.name = name; } /** * Returns the name of this {@code Transformation}. */ public String getName() { return name; } /** * Returns the parallelism of this {@code Transformation}. */ public int getParallelism() { return parallelism; } /** * Sets the parallelism of this {@code Transformation}. * * @param parallelism The new parallelism to set on this {@code Transformation}. */ public void setParallelism(int parallelism) { OperatorValidationUtils.validateParallelism(parallelism); this.parallelism = parallelism; } /** * Gets the maximum parallelism for this stream transformation. * * @return Maximum parallelism of this transformation. */ public int getMaxParallelism() { return maxParallelism; } /** * Sets the maximum parallelism for this stream transformation. * * @param maxParallelism Maximum parallelism for this stream transformation. */ public void setMaxParallelism(int maxParallelism) { OperatorValidationUtils.validateMaxParallelism(maxParallelism, UPPER_BOUND_MAX_PARALLELISM); this.maxParallelism = maxParallelism; } /** * Sets the minimum and preferred resources for this stream transformation. * * @param minResources The minimum resource of this transformation. * @param preferredResources The preferred resource of this transformation. */ public void setResources(ResourceSpec minResources, ResourceSpec preferredResources) { OperatorValidationUtils.validateResourceRequirements(minResources, preferredResources, managedMemoryWeight); this.minResources = checkNotNull(minResources); this.preferredResources = checkNotNull(preferredResources); } /** * Gets the minimum resource of this stream transformation. * * @return The minimum resource of this transformation. */ public ResourceSpec getMinResources() { return minResources; } /** * Gets the preferred resource of this stream transformation. * * @return The preferred resource of this transformation. */ public ResourceSpec getPreferredResources() { return preferredResources; } /** * Set the managed memory weight which indicates how much this transformation relies * on managed memory, so that a transformation highly relies on managed memory would * be able to acquire more managed memory in runtime (linear association). The default * weight value is 1. Note that currently it's only allowed to set the weight in cases * of UNKNOWN resources. * * @param managedMemoryWeight The managed memory weight of this transformation * * @throws IllegalArgumentException Thrown, if non-UNKNOWN resources are already set to this transformation */ public void setManagedMemoryWeight(int managedMemoryWeight) { OperatorValidationUtils.validateResourceRequirements(minResources, preferredResources, managedMemoryWeight); this.managedMemoryWeight = managedMemoryWeight; } /** * Get the managed memory weight which indicates how much this transformation relies * on managed memory, so that a transformation highly relies on managed memory would * be able to acquire more managed memory in runtime (linear association). The default * weight value is 1. Note that it only works in cases of UNKNOWN resources. * * @return The managed memory weight of this transformation */ public int getManagedMemoryWeight() { return managedMemoryWeight; } /** * Sets an user provided hash for this operator. This will be used AS IS the create the * JobVertexID. * * <p>The user provided hash is an alternative to the generated hashes, that is considered when * identifying an operator through the default hash mechanics fails (e.g. because of changes * between Flink versions). * * <p><strong>Important</strong>: this should be used as a workaround or for trouble shooting. * The provided hash needs to be unique per transformation and job. Otherwise, job submission * will fail. Furthermore, you cannot assign user-specified hash to intermediate nodes in an * operator chain and trying so will let your job fail. * * <p>A use case for this is in migration between Flink versions or changing the jobs in a way * that changes the automatically generated hashes. In this case, providing the previous hashes * directly through this method (e.g. obtained from old logs) can help to reestablish a lost * mapping from states to their target operator. * * @param uidHash The user provided hash for this operator. This will become the JobVertexID, which is shown in the * logs and web ui. */ public void setUidHash(String uidHash) { Preconditions.checkNotNull(uidHash); Preconditions.checkArgument(uidHash.matches("^[0-9A-Fa-f]{32}$"), "Node hash must be a 32 character String that describes a hex code. Found: " + uidHash); this.userProvidedNodeHash = uidHash; } /** * Gets the user provided hash. * * @return The user provided hash. */ public String getUserProvidedNodeHash() { return userProvidedNodeHash; } /** * Sets an ID for this {@link Transformation}. This is will later be hashed to a uidHash which is then used to * create the JobVertexID (that is shown in logs and the web ui). * * <p>The specified ID is used to assign the same operator ID across job * submissions (for example when starting a job from a savepoint). * * <p><strong>Important</strong>: this ID needs to be unique per * transformation and job. Otherwise, job submission will fail. * * @param uid The unique user-specified ID of this transformation. */ public void setUid(String uid) { this.uid = uid; } /** * Returns the user-specified ID of this transformation. * * @return The unique user-specified ID of this transformation. */ public String getUid() { return uid; } /** * Returns the slot sharing group of this transformation. * * @see #setSlotSharingGroup(String) */ public String getSlotSharingGroup() { return slotSharingGroup; } /** * Sets the slot sharing group of this transformation. Parallel instances of operations that * are in the same slot sharing group will be co-located in the same TaskManager slot, if * possible. * * <p>Initially, an operation is in the default slot sharing group. This can be explicitly * set using {@code setSlotSharingGroup("default")}. * * @param slotSharingGroup The slot sharing group name. */ public void setSlotSharingGroup(String slotSharingGroup) { this.slotSharingGroup = slotSharingGroup; } /** * <b>NOTE:</b> This is an internal undocumented feature for now. It is not * clear whether this will be supported and stable in the long term. * * <p>Sets the key that identifies the co-location group. * Operators with the same co-location key will have their corresponding subtasks * placed into the same slot by the scheduler. * * <p>Setting this to null means there is no co-location constraint. */ public void setCoLocationGroupKey(@Nullable String coLocationGroupKey) { this.coLocationGroupKey = coLocationGroupKey; } /** * <b>NOTE:</b> This is an internal undocumented feature for now. It is not * clear whether this will be supported and stable in the long term. * * <p>Gets the key that identifies the co-location group. * Operators with the same co-location key will have their corresponding subtasks * placed into the same slot by the scheduler. * * <p>If this is null (which is the default), it means there is no co-location constraint. */ @Nullable public String getCoLocationGroupKey() { return coLocationGroupKey; } /** * Tries to fill in the type information. Type information can be filled in * later when the program uses a type hint. This method checks whether the * type information has ever been accessed before and does not allow * modifications if the type was accessed already. This ensures consistency * by making sure different parts of the operation do not assume different * type information. * * @param outputType The type information to fill in. * * @throws IllegalStateException Thrown, if the type information has been accessed before. */ public void setOutputType(TypeInformation<T> outputType) { if (typeUsed) { throw new IllegalStateException( "TypeInformation cannot be filled in for the type after it has been used. " + "Please make sure that the type info hints are the first call after" + " the transformation function, " + "before any access to types or semantic properties, etc."); } this.outputType = outputType; } /** * Returns the output type of this {@code Transformation} as a {@link TypeInformation}. Once * this is used once the output type cannot be changed anymore using {@link #setOutputType}. * * @return The output type of this {@code Transformation} */ public TypeInformation<T> getOutputType() { if (outputType instanceof MissingTypeInfo) { MissingTypeInfo typeInfo = (MissingTypeInfo) this.outputType; throw new InvalidTypesException( "The return type of function '" + typeInfo.getFunctionName() + "' could not be determined automatically, due to type erasure. " + "You can give type information hints by using the returns(...) " + "method on the result of the transformation call, or by letting " + "your function implement the 'ResultTypeQueryable' " + "interface.", typeInfo.getTypeException()); } typeUsed = true; return this.outputType; } /** * Set the buffer timeout of this {@code Transformation}. The timeout defines how long data * may linger in a partially full buffer before being sent over the network. * * <p>Lower timeouts lead to lower tail latencies, but may affect throughput. * For Flink 1.5+, timeouts of 1ms are feasible for jobs with high parallelism. * * <p>A value of -1 means that the default buffer timeout should be used. A value * of zero indicates that no buffering should happen, and all records/events should be * immediately sent through the network, without additional buffering. */ public void setBufferTimeout(long bufferTimeout) { checkArgument(bufferTimeout >= -1); this.bufferTimeout = bufferTimeout; } /** * Returns the buffer timeout of this {@code Transformation}. * * @see #setBufferTimeout(long) */ public long getBufferTimeout() { return bufferTimeout; } /** * Returns all transitive predecessor {@code Transformation}s of this {@code Transformation}. This * is, for example, used when determining whether a feedback edge of an iteration * actually has the iteration head as a predecessor. * * @return The list of transitive predecessors. */ public abstract Collection<Transformation<?>> getTransitivePredecessors(); @Override public String toString() { return getClass().getSimpleName() + "{" + "id=" + id + ", name='" + name + '\'' + ", outputType=" + outputType + ", parallelism=" + parallelism + '}'; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (!(o instanceof Transformation)) { return false; } Transformation<?> that = (Transformation<?>) o; if (bufferTimeout != that.bufferTimeout) { return false; } if (id != that.id) { return false; } if (parallelism != that.parallelism) { return false; } if (!name.equals(that.name)) { return false; } return outputType != null ? outputType.equals(that.outputType) : that.outputType == null; } @Override public int hashCode() { int result = id; result = 31 * result + name.hashCode(); result = 31 * result + (outputType != null ? outputType.hashCode() : 0); result = 31 * result + parallelism; result = 31 * result + (int) (bufferTimeout ^ (bufferTimeout >>> 32)); return result; } }