Java Code Examples for org.apache.flink.runtime.jobgraph.JobStatus#FAILING

The following examples show how to use org.apache.flink.runtime.jobgraph.JobStatus#FAILING. You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File: ExecutionGraph.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Callback used during cancellation/failover; it is invoked once every task has
 * reached a terminal state (cancelled/failed/finished). The job status is read
 * repeatedly until one of the branches below finishes the handling.
 *
 * @param expectedGlobalVersionForRestart value forwarded to {@code tryRestartOrFail}
 *        when the job is found in the FAILING state
 */
private void allVerticesInTerminalState(long expectedGlobalVersionForRestart) {

	assertRunningInJobMasterMainThread();

	// we are done, move the job on to its follow-up state
	for (;;) {
		final JobStatus observed = this.state;

		if (observed == JobStatus.RUNNING) {
			// fail globally, then loop around and read the status again
			failGlobal(new Exception("ExecutionGraph went into allVerticesInTerminalState() from RUNNING"));
			continue;
		}

		if (observed == JobStatus.CANCELLING) {
			if (transitionState(observed, JobStatus.CANCELED)) {
				onTerminalState(JobStatus.CANCELED);
				return;
			}
			// the transition was not performed, check the status again
			continue;
		}

		if (observed == JobStatus.FAILING) {
			if (tryRestartOrFail(expectedGlobalVersionForRestart)) {
				return;
			}
			// concurrent job status change, let's check again
			continue;
		}

		if (observed.isGloballyTerminalState()) {
			LOG.warn("Job has entered globally terminal state without waiting for all " +
				"job vertices to reach final state.");
			return;
		}

		// every remaining status is handled by failing globally
		failGlobal(new Exception("ExecutionGraph went into final state from state " + observed));
		return;
	}
}
 
Example 2
Source File: ExecutionGraph.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Invoked as a callback in the course of cancellation/failover, after all tasks
 * have arrived in a terminal state (cancelled/failed/finished). Re-reads the job
 * status in a loop until the handling for the observed status has completed.
 *
 * @param expectedGlobalVersionForRestart value passed on to {@code tryRestartOrFail}
 *        if the observed status is FAILING
 */
private void allVerticesInTerminalState(long expectedGlobalVersionForRestart) {

	assertRunningInJobMasterMainThread();

	// stays false for as long as the status has to be examined again
	boolean handled = false;

	while (!handled) {
		final JobStatus snapshot = this.state;

		if (snapshot == JobStatus.RUNNING) {
			// not finished after this call: the status is examined once more
			failGlobal(new Exception("ExecutionGraph went into allVerticesInTerminalState() from RUNNING"));
		}
		else if (snapshot == JobStatus.CANCELLING) {
			if (transitionState(snapshot, JobStatus.CANCELED)) {
				onTerminalState(JobStatus.CANCELED);
				handled = true;
			}
		}
		else if (snapshot == JobStatus.FAILING) {
			// a 'false' result signals a concurrent job status change, so we check again
			handled = tryRestartOrFail(expectedGlobalVersionForRestart);
		}
		else if (snapshot.isGloballyTerminalState()) {
			LOG.warn("Job has entered globally terminal state without waiting for all " +
				"job vertices to reach final state.");
			handled = true;
		}
		else {
			failGlobal(new Exception("ExecutionGraph went into final state from state " + snapshot));
			handled = true;
		}
	}
	// the state transition handling is complete
}
 
Example 3
Source File: ExecutionGraph.java    From Flink-CEPplus with Apache License 2.0 4 votes vote down vote up
/**
 * Requests cancellation of the job. What happens depends on the job status that is
 * observed when the request is processed:
 *
 * <ul>
 *   <li>RUNNING or CREATED: switches to CANCELLING, increments the global modification
 *       version, cancels an ongoing scheduling future (if any), cancels all job vertices
 *       and registers a callback that runs once all vertex cancellation futures are done.
 *   <li>FAILING: switches to CANCELLING and returns.
 *   <li>RESTARTING: switches directly to CANCELED while holding {@code progressLock} and
 *       calls {@code onTerminalState(JobStatus.CANCELED)}.
 *   <li>Any other status: returns without doing anything.
 * </ul>
 *
 * <p>Whenever an attempted status transition returns {@code false}, the status is read
 * again and the decision above is repeated.
 */
public void cancel() {

		assertRunningInJobMasterMainThread();

		// retry loop: every pass works on a fresh read of the job status
		while (true) {
			JobStatus current = state;

			if (current == JobStatus.RUNNING || current == JobStatus.CREATED) {
				if (transitionState(current, JobStatus.CANCELLING)) {

					// make sure no concurrent local actions interfere with the cancellation
					final long globalVersionForRestart = incrementGlobalModVersion();

					// take a local copy of the field so that the null check and the cancel call use the same reference
					final CompletableFuture<Void> ongoingSchedulingFuture = schedulingFuture;

					// cancel ongoing scheduling action
					if (ongoingSchedulingFuture != null) {
						ongoingSchedulingFuture.cancel(false);
					}

					// one cancellation future per job vertex
					final ArrayList<CompletableFuture<?>> futures = new ArrayList<>(verticesInCreationOrder.size());

					// cancel all tasks (that still need cancelling)
					for (ExecutionJobVertex ejv : verticesInCreationOrder) {
						futures.add(ejv.cancelWithFuture());
					}

					// we build a future that is complete once all vertices have reached a terminal state
					final ConjunctFuture<Void> allTerminal = FutureUtils.waitForAll(futures);
					allTerminal.whenComplete(
						(Void value, Throwable throwable) -> {
							if (throwable != null) {
								// the combined future completed exceptionally: attempt CANCELLING -> FAILED
								transitionState(
									JobStatus.CANCELLING,
									JobStatus.FAILED,
									new FlinkException(
										"Could not cancel job " + getJobName() + " because not all execution job vertices could be cancelled.",
										throwable));
							} else {
								// cancellations may currently be overridden by failures which trigger
								// restarts, so we need to pass a proper restart global version here
								allVerticesInTerminalState(globalVersionForRestart);
							}
						});

					return;
				}
				// transition was not performed: fall through to the next loop pass
			}
			// Executions are being canceled. Go into cancelling and wait for
			// all vertices to be in their final state.
			else if (current == JobStatus.FAILING) {
				// only the status is switched in this branch; no vertex cancellation is issued here
				if (transitionState(current, JobStatus.CANCELLING)) {
					return;
				}
			}
			// All vertices have been cancelled and it's safe to directly go
			// into the canceled state.
			else if (current == JobStatus.RESTARTING) {
				// the transition and the terminal-state handling both happen under progressLock
				synchronized (progressLock) {
					if (transitionState(current, JobStatus.CANCELED)) {
						onTerminalState(JobStatus.CANCELED);

						LOG.info("Canceled during restart.");
						return;
					}
				}
			}
			else {
				// no need to treat other states
				return;
			}
		}
	}
 
Example 4
Source File: ExecutionGraph.java    From Flink-CEPplus with Apache License 2.0 4 votes vote down vote up
/**
 * Triggers a global failure of the execution graph. Such a failure is not handled by a
 * specific failover strategy; instead it results in a full restart of all tasks.
 *
 * <p>Intended for situations in which the consistency of the execution graph's state can no
 * longer be guaranteed (for example after catching unexpected exceptions that indicate a bug
 * or an unexpected call race), so that a full restart is the safe way to regain consistency.
 *
 * @param t The exception that caused the failure.
 */
public void failGlobal(Throwable t) {

	assertRunningInJobMasterMainThread();

	for (;;) {
		final JobStatus observed = state;

		// these states are left untouched
		final boolean keepCurrentState = observed == JobStatus.FAILING
			|| observed == JobStatus.SUSPENDED
			|| observed.isGloballyTerminalState();
		if (keepCurrentState) {
			return;
		}

		if (!transitionState(observed, JobStatus.FAILING, t)) {
			// concurrent change to execution state, retry
			continue;
		}

		initFailureCause(t);

		// make sure no concurrent local or global actions interfere with the failover
		final long restartVersion = incrementGlobalModVersion();

		// cancel a scheduling action that is still ongoing
		final CompletableFuture<Void> pendingScheduling = schedulingFuture;
		if (pendingScheduling != null) {
			pendingScheduling.cancel(false);
		}

		// cancel all tasks (that still need cancelling) and collect the resulting futures
		final ArrayList<CompletableFuture<?>> cancellations = new ArrayList<>(verticesInCreationOrder.size());
		for (ExecutionJobVertex vertex : verticesInCreationOrder) {
			cancellations.add(vertex.cancelWithFuture());
		}

		// this future is complete once all vertices have reached a terminal state
		final ConjunctFuture<Void> allCancelled = FutureUtils.waitForAll(cancellations);
		allCancelled.whenComplete(
			(Void unused, Throwable error) -> {
				if (error == null) {
					allVerticesInTerminalState(restartVersion);
				} else {
					transitionState(
						JobStatus.FAILING,
						JobStatus.FAILED,
						new FlinkException("Could not cancel all execution job vertices properly.", error));
				}
			});

		return;
	}
}
 
Example 5
Source File: ExecutionGraph.java    From Flink-CEPplus with Apache License 2.0 4 votes vote down vote up
/**
 * Attempts to restart the job. When a restart is not possible (e.g. no more restarts
 * allowed), attempts to fail the job instead. The operation is only carried out when the
 * current state is FAILING or RESTARTING.
 *
 * @param globalModVersionForRestart value handed to the restart callback that is created
 *        when the job is restarted
 * @return true if the operation could be executed; false if a concurrent job status change occurred
 */
private boolean tryRestartOrFail(long globalModVersionForRestart) {
	final JobStatus statusAtEntry = state;

	// this operation is only allowed in the state FAILING or RESTARTING
	if (statusAtEntry != JobStatus.FAILING && statusAtEntry != JobStatus.RESTARTING) {
		return false;
	}

	final Throwable cause = this.failureCause;

	synchronized (progressLock) {
		if (LOG.isDebugEnabled()) {
			LOG.debug("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID(), cause);
		} else {
			LOG.info("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID());
		}

		final boolean causePermitsRestart = !(cause instanceof SuppressRestartsException);
		final boolean strategyPermitsRestart = restartStrategy.canRestart();

		if (causePermitsRestart && strategyPermitsRestart) {
			if (!transitionState(statusAtEntry, JobStatus.RESTARTING)) {
				// we must have changed the state concurrently, thus we cannot complete this operation
				return false;
			}

			LOG.info("Restarting the job {} ({}).", getJobName(), getJobID());

			RestartCallback callback = new ExecutionGraphRestartCallback(this, globalModVersionForRestart);
			restartStrategy.restart(callback, getJobMasterMainThreadExecutor());

			return true;
		}

		// a restart is not possible, so try to move the job to FAILED
		if (!transitionState(statusAtEntry, JobStatus.FAILED, cause)) {
			// we must have changed the state concurrently, thus we cannot complete this operation
			return false;
		}

		// each reason is null unless it applies
		final String suppressedByCause = causePermitsRestart
			? null
			: "a type of SuppressRestartsException was thrown";
		final String suppressedByStrategy = strategyPermitsRestart
			? null
			: "the restart strategy prevented it";

		LOG.info("Could not restart the job {} ({}) because {}.", getJobName(), getJobID(),
			StringUtils.concatenateWithAnd(suppressedByCause, suppressedByStrategy), cause);
		onTerminalState(JobStatus.FAILED);

		return true;
	}
}
 
Example 6
Source File: ExecutionGraph.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Requests cancellation of the job. What happens depends on the job status that is
 * observed when the request is processed:
 *
 * <ul>
 *   <li>RUNNING or CREATED: switches to CANCELLING, increments the global modification
 *       version, cancels an ongoing scheduling future (if any), starts the vertex
 *       cancellation through {@code cancelVerticesAsync()} and registers a callback on the
 *       future returned by that call.
 *   <li>FAILING: switches to CANCELLING and returns.
 *   <li>RESTARTING: switches directly to CANCELED while holding {@code progressLock} and
 *       calls {@code onTerminalState(JobStatus.CANCELED)}.
 *   <li>Any other status: returns without doing anything.
 * </ul>
 *
 * <p>Whenever an attempted status transition returns {@code false}, the status is read
 * again and the decision above is repeated.
 */
public void cancel() {

		assertRunningInJobMasterMainThread();

		// retry loop: every pass works on a fresh read of the job status
		while (true) {
			JobStatus current = state;

			if (current == JobStatus.RUNNING || current == JobStatus.CREATED) {
				if (transitionState(current, JobStatus.CANCELLING)) {

					// make sure no concurrent local actions interfere with the cancellation
					final long globalVersionForRestart = incrementGlobalModVersion();

					// take a local copy of the field so that the null check and the cancel call use the same reference
					final CompletableFuture<Void> ongoingSchedulingFuture = schedulingFuture;

					// cancel ongoing scheduling action
					if (ongoingSchedulingFuture != null) {
						ongoingSchedulingFuture.cancel(false);
					}

					// NOTE(review): presumably completes once all vertices are terminal - confirm in cancelVerticesAsync()
					final ConjunctFuture<Void> allTerminal = cancelVerticesAsync();
					allTerminal.whenComplete(
						(Void value, Throwable throwable) -> {
							if (throwable != null) {
								// the combined future completed exceptionally: attempt CANCELLING -> FAILED
								transitionState(
									JobStatus.CANCELLING,
									JobStatus.FAILED,
									new FlinkException(
										"Could not cancel job " + getJobName() + " because not all execution job vertices could be cancelled.",
										throwable));
							} else {
								// cancellations may currently be overridden by failures which trigger
								// restarts, so we need to pass a proper restart global version here
								allVerticesInTerminalState(globalVersionForRestart);
							}
						});

					return;
				}
				// transition was not performed: fall through to the next loop pass
			}
			// Executions are being canceled. Go into cancelling and wait for
			// all vertices to be in their final state.
			else if (current == JobStatus.FAILING) {
				// only the status is switched in this branch; no vertex cancellation is issued here
				if (transitionState(current, JobStatus.CANCELLING)) {
					return;
				}
			}
			// All vertices have been cancelled and it's safe to directly go
			// into the canceled state.
			else if (current == JobStatus.RESTARTING) {
				// the transition and the terminal-state handling both happen under progressLock
				synchronized (progressLock) {
					if (transitionState(current, JobStatus.CANCELED)) {
						onTerminalState(JobStatus.CANCELED);

						LOG.info("Canceled during restart.");
						return;
					}
				}
			}
			else {
				// no need to treat other states
				return;
			}
		}
	}
 
Example 7
Source File: ExecutionGraph.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Triggers a global failure of the execution graph. Such a failure is not handled by a
 * specific failover strategy; instead it results in a full restart of all tasks.
 *
 * <p>Intended for situations in which the consistency of the execution graph's state can no
 * longer be guaranteed (for example after catching unexpected exceptions that indicate a bug
 * or an unexpected call race), so that a full restart is the safe way to regain consistency.
 *
 * @param t The exception that caused the failure.
 */
public void failGlobal(Throwable t) {

	assertRunningInJobMasterMainThread();

	for (;;) {
		final JobStatus observed = state;

		// these states are left untouched
		final boolean keepCurrentState = observed == JobStatus.FAILING
			|| observed == JobStatus.SUSPENDED
			|| observed.isGloballyTerminalState();
		if (keepCurrentState) {
			return;
		}

		if (!transitionState(observed, JobStatus.FAILING, t)) {
			// concurrent change to execution state, retry
			continue;
		}

		initFailureCause(t);

		// make sure no concurrent local or global actions interfere with the failover
		final long restartVersion = incrementGlobalModVersion();

		// cancel a scheduling action that is still ongoing
		final CompletableFuture<Void> pendingScheduling = schedulingFuture;
		if (pendingScheduling != null) {
			pendingScheduling.cancel(false);
		}

		// we build a future that is complete once all vertices have reached a terminal state
		final ConjunctFuture<Void> allCancelled = cancelVerticesAsync();
		FutureUtils.assertNoException(allCancelled.handle(
			(Void unused, Throwable error) -> {
				if (error == null) {
					allVerticesInTerminalState(restartVersion);
				} else {
					transitionState(
						JobStatus.FAILING,
						JobStatus.FAILED,
						new FlinkException("Could not cancel all execution job vertices properly.", error));
				}
				return null;
			}));

		return;
	}
}
 
Example 8
Source File: ExecutionGraph.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Attempts to restart the job. When a restart is not possible (e.g. no more restarts
 * allowed), attempts to fail the job instead. The operation is only carried out when the
 * current state is FAILING or RESTARTING.
 *
 * @param globalModVersionForRestart value handed to the restart callback that is created
 *        when the job is restarted
 * @return true if the operation could be executed; false if a concurrent job status change occurred
 */
private boolean tryRestartOrFail(long globalModVersionForRestart) {
	final JobStatus statusAtEntry = state;

	// this operation is only allowed in the state FAILING or RESTARTING
	if (statusAtEntry != JobStatus.FAILING && statusAtEntry != JobStatus.RESTARTING) {
		return false;
	}

	final Throwable cause = this.failureCause;

	synchronized (progressLock) {
		if (LOG.isDebugEnabled()) {
			LOG.debug("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID(), cause);
		} else {
			LOG.info("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID());
		}

		final boolean causePermitsRestart = !(cause instanceof SuppressRestartsException);
		final boolean strategyPermitsRestart = restartStrategy.canRestart();

		if (causePermitsRestart && strategyPermitsRestart) {
			if (!transitionState(statusAtEntry, JobStatus.RESTARTING)) {
				// we must have changed the state concurrently, thus we cannot complete this operation
				return false;
			}

			LOG.info("Restarting the job {} ({}).", getJobName(), getJobID());

			RestartCallback callback = new ExecutionGraphRestartCallback(this, globalModVersionForRestart);
			// an exceptionally completed restart future is turned into a global failure
			FutureUtils.assertNoException(
				restartStrategy
					.restart(callback, getJobMasterMainThreadExecutor())
					.exceptionally((restartError) -> {
						failGlobal(restartError);
						return null;
					}));
			return true;
		}

		// a restart is not possible, so try to move the job to FAILED
		if (!transitionState(statusAtEntry, JobStatus.FAILED, cause)) {
			// we must have changed the state concurrently, thus we cannot complete this operation
			return false;
		}

		// each reason is null unless it applies
		final String suppressedByCause = causePermitsRestart
			? null
			: "a type of SuppressRestartsException was thrown";
		final String suppressedByStrategy = strategyPermitsRestart
			? null
			: "the restart strategy prevented it";

		LOG.info("Could not restart the job {} ({}) because {}.", getJobName(), getJobID(),
			StringUtils.concatenateWithAnd(suppressedByCause, suppressedByStrategy), cause);
		onTerminalState(JobStatus.FAILED);

		return true;
	}
}