org.apache.flink.runtime.execution.SuppressRestartsException Java Examples

The following examples show how to use org.apache.flink.runtime.execution.SuppressRestartsException. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ExecutionGraphVariousFailuesTest.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Verifies that a {@link SuppressRestartsException} reported while the execution graph is in
 * state RESTARTING aborts the restart and leaves the graph in state FAILED.
 */
@Test
public void testSuppressRestartFailureWhileRestarting() throws Exception {
	final InfiniteDelayRestartStrategy restartStrategy = new InfiniteDelayRestartStrategy(10);
	final ExecutionGraph graph = ExecutionGraphTestUtils.createSimpleTestGraph(restartStrategy);
	graph.start(ComponentMainThreadExecutorServiceAdapter.forMainThread());
	graph.scheduleForExecution();

	// the job must be RUNNING before its vertices are switched to running
	assertEquals(JobStatus.RUNNING, graph.getState());
	ExecutionGraphTestUtils.switchAllVerticesToRunning(graph);

	// an ordinary global failure moves the job to FAILING ...
	final Exception ordinaryFailure = new Exception("test");
	graph.failGlobal(ordinaryFailure);
	assertEquals(JobStatus.FAILING, graph.getState());

	// ... and to RESTARTING once all vertices finished cancelling
	ExecutionGraphTestUtils.completeCancellingForAllVertices(graph);
	assertEquals(JobStatus.RESTARTING, graph.getState());

	// a failure that suppresses restarts must end the job instead
	final SuppressRestartsException restartSuppressingFailure =
		new SuppressRestartsException(new Exception("Test"));
	graph.failGlobal(restartSuppressingFailure);

	assertEquals(JobStatus.FAILED, graph.getState());
}
 
Example #2
Source File: ExecutionGraphRestartTest.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that failing a vertex with a {@link SuppressRestartsException} drives the job to
 * FAILED and that the fixed-delay restart strategy reports zero restart attempts afterwards.
 */
@Test
public void testNoRestartOnSuppressException() throws Exception {
	final FixedDelayRestartStrategy configuredStrategy =
		new FixedDelayRestartStrategy(Integer.MAX_VALUE, 0);
	final ExecutionGraph graph = createExecutionGraph(configuredStrategy).f0;

	// fail the first vertex with an exception that suppresses restarts
	final SuppressRestartsException unrecoverable =
		new SuppressRestartsException(new Exception("Test Exception"));
	graph.getAllExecutionVertices().iterator().next().fail(unrecoverable);

	assertEquals(JobStatus.FAILING, graph.getState());

	completeCanceling(graph);

	graph.waitUntilTerminal();
	assertEquals(JobStatus.FAILED, graph.getState());

	// the strategy must still be the fixed-delay one, with no recorded restart attempt
	final RestartStrategy strategyInUse = graph.getRestartStrategy();
	assertTrue(strategyInUse instanceof FixedDelayRestartStrategy);

	final FixedDelayRestartStrategy fixedDelayStrategy = (FixedDelayRestartStrategy) strategyInUse;
	assertEquals(0, fixedDelayStrategy.getCurrentRestartAttempt());
}
 
Example #3
Source File: ExecutionGraphVariousFailuesTest.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Ensures that reporting a {@link SuppressRestartsException} to an execution graph that is
 * currently RESTARTING stops the restart and switches the graph to FAILED.
 */
@Test
public void testSuppressRestartFailureWhileRestarting() throws Exception {
	final InfiniteDelayRestartStrategy delayedRestarts = new InfiniteDelayRestartStrategy(10);
	final ExecutionGraph executionGraph = ExecutionGraphTestUtils.createSimpleTestGraph(delayedRestarts);
	executionGraph.start(TestingComponentMainThreadExecutorServiceAdapter.forMainThread());
	executionGraph.scheduleForExecution();

	// scheduled job is RUNNING; now mark every vertex as running as well
	assertEquals(JobStatus.RUNNING, executionGraph.getState());
	ExecutionGraphTestUtils.switchAllVerticesToRunning(executionGraph);

	// first failure: a regular exception, job goes to FAILING
	final Exception firstFailure = new Exception("test");
	executionGraph.failGlobal(firstFailure);
	assertEquals(JobStatus.FAILING, executionGraph.getState());

	// after cancellation completes the job is RESTARTING
	ExecutionGraphTestUtils.completeCancellingForAllVertices(executionGraph);
	assertEquals(JobStatus.RESTARTING, executionGraph.getState());

	// second failure: suppresses restarts, so the job must fail for good
	final SuppressRestartsException secondFailure =
		new SuppressRestartsException(new Exception("Test"));
	executionGraph.failGlobal(secondFailure);

	assertEquals(JobStatus.FAILED, executionGraph.getState());
}
 
Example #4
Source File: ExecutionGraphRestartTest.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that a vertex failure carrying a {@link SuppressRestartsException} ends the job in
 * FAILED without the fixed-delay restart strategy recording any restart attempt.
 */
@Test
public void testNoRestartOnSuppressException() throws Exception {
	try (SlotPool pool = createSlotPoolImpl()) {
		final FixedDelayRestartStrategy configuredStrategy =
			new FixedDelayRestartStrategy(Integer.MAX_VALUE, 0);
		final ExecutionGraph graph = TestingExecutionGraphBuilder.newBuilder()
			.setRestartStrategy(configuredStrategy)
			.buildAndScheduleForExecution(pool);

		// fail the first vertex with an exception that suppresses restarts
		final SuppressRestartsException unrecoverable =
			new SuppressRestartsException(new Exception("Test Exception"));
		graph.getAllExecutionVertices().iterator().next().fail(unrecoverable);

		assertEquals(JobStatus.FAILING, graph.getState());

		completeCanceling(graph);

		graph.waitUntilTerminal();
		assertEquals(JobStatus.FAILED, graph.getState());

		// the strategy is unchanged and has not counted a restart attempt
		final RestartStrategy strategyInUse = graph.getRestartStrategy();
		assertTrue(strategyInUse instanceof FixedDelayRestartStrategy);

		final FixedDelayRestartStrategy fixedDelayStrategy = (FixedDelayRestartStrategy) strategyInUse;
		assertEquals(0, fixedDelayStrategy.getCurrentRestartAttempt());
	}

}
 
Example #5
Source File: ExecutionGraphVariousFailuesTest.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Confirms that a {@link SuppressRestartsException} arriving in state RESTARTING ends the
 * restart right away and puts the execution graph into state FAILED.
 */
@Test
public void testSuppressRestartFailureWhileRestarting() throws Exception {
	final InfiniteDelayRestartStrategy strategy = new InfiniteDelayRestartStrategy(10);
	final ExecutionGraph testGraph = ExecutionGraphTestUtils.createSimpleTestGraph(strategy);
	testGraph.start(ComponentMainThreadExecutorServiceAdapter.forMainThread());
	testGraph.scheduleForExecution();

	// step 1: job is RUNNING and all vertices are switched to running
	assertEquals(JobStatus.RUNNING, testGraph.getState());
	ExecutionGraphTestUtils.switchAllVerticesToRunning(testGraph);

	// step 2: a plain exception fails the job globally -> FAILING
	final Exception plainFailure = new Exception("test");
	testGraph.failGlobal(plainFailure);
	assertEquals(JobStatus.FAILING, testGraph.getState());

	// step 3: cancellation of all vertices completes -> RESTARTING
	ExecutionGraphTestUtils.completeCancellingForAllVertices(testGraph);
	assertEquals(JobStatus.RESTARTING, testGraph.getState());

	// step 4: a restart-suppressing failure -> FAILED
	final SuppressRestartsException suppressingFailure =
		new SuppressRestartsException(new Exception("Test"));
	testGraph.failGlobal(suppressingFailure);

	assertEquals(JobStatus.FAILED, testGraph.getState());
}
 
Example #6
Source File: ExecutionFailureHandlerTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Exercises {@code ExecutionFailureHandler.isUnrecoverableError} with a plain exception, a
 * {@link SuppressRestartsException}, and a {@link SuppressRestartsException} nested as a cause.
 */
@Test
public void testUnrecoverableErrorCheck() {
	// a plain exception is not reported as unrecoverable
	final Exception ordinaryError = new Exception();
	assertFalse(ExecutionFailureHandler.isUnrecoverableError(ordinaryError));

	// a SuppressRestartsException at the top level is unrecoverable
	final SuppressRestartsException directError = new SuppressRestartsException(new Exception());
	assertTrue(ExecutionFailureHandler.isUnrecoverableError(directError));

	// the same holds when it is only the cause of another exception
	final Exception wrappedError = new Exception(new SuppressRestartsException(new Exception()));
	assertTrue(ExecutionFailureHandler.isUnrecoverableError(wrappedError));
}
 
Example #7
Source File: ExecutionGraphRestartTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that a vertex failure carrying a {@link SuppressRestartsException} ends the job in
 * FAILED and that the fixed-delay restart strategy reports zero restart attempts.
 */
@Test
public void testNoRestartOnSuppressException() throws Exception {
	try (SlotPool pool = createSlotPoolImpl()) {
		final ExecutionGraph graph = TestingExecutionGraphBuilder
			.newBuilder()
			.setJobGraph(createJobGraph())
			.setRestartStrategy(new FixedDelayRestartStrategy(Integer.MAX_VALUE, 0))
			.setSlotProvider(createSchedulerWithSlots(pool))
			.build();

		startAndScheduleExecutionGraph(graph);

		// fail the first vertex with an exception that suppresses restarts
		final SuppressRestartsException unrecoverable =
			new SuppressRestartsException(new Exception("Test Exception"));
		graph.getAllExecutionVertices().iterator().next().fail(unrecoverable);

		assertEquals(JobStatus.FAILING, graph.getState());

		completeCanceling(graph);

		graph.waitUntilTerminal();
		assertEquals(JobStatus.FAILED, graph.getState());

		// the strategy is still the fixed-delay one and has counted no restart attempt
		final RestartStrategy strategyInUse = graph.getRestartStrategy();
		assertTrue(strategyInUse instanceof FixedDelayRestartStrategy);

		final FixedDelayRestartStrategy fixedDelayStrategy = (FixedDelayRestartStrategy) strategyInUse;
		assertEquals(0, fixedDelayStrategy.getCurrentRestartAttempt());
	}

}
 
Example #8
Source File: ExecutionGraphRestartTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Checks the handling of global failures while the job is RESTARTING: an ordinary exception
 * keeps the job in RESTARTING (with a new global mod version and failure cause), whereas a
 * {@link SuppressRestartsException} moves it to FAILED, where a later restart call has no effect.
 */
@Test
public void testFailWhileRestarting() throws Exception {
	try (SlotPool pool = createSlotPoolImpl()) {
		final TaskManagerLocation tmLocation = new LocalTaskManagerLocation();
		final ExecutionGraph graph = TestingExecutionGraphBuilder
			.newBuilder()
			.setJobGraph(createJobGraph())
			.setRestartStrategy(new InfiniteDelayRestartStrategy())
			.setSlotProvider(createSchedulerWithSlots(pool, tmLocation))
			.build();

		startAndScheduleExecutionGraph(graph);

		// releasing the TaskManager must bring the job into RESTARTING
		final Exception releaseCause = new Exception("Test Exception");
		pool.releaseTaskManager(tmLocation.getResourceID(), releaseCause);

		assertEquals(JobStatus.RESTARTING, graph.getState());

		// a regular failure during RESTARTING triggers another restart attempt
		final long modVersionBeforeFailure = graph.getGlobalModVersion();
		final Exception regularFailure = new Exception("Test exception");
		graph.failGlobal(regularFailure);

		assertNotEquals(modVersionBeforeFailure, graph.getGlobalModVersion());
		assertEquals(JobStatus.RESTARTING, graph.getState());
		// the failure cause must now be the most recent exception
		assertEquals(regularFailure, graph.getFailureCause());

		// a restart-suppressing failure, in contrast, fails the job
		final SuppressRestartsException suppressingFailure =
			new SuppressRestartsException(new Exception("Suppress restart exception"));
		graph.failGlobal(suppressingFailure);

		assertEquals(JobStatus.FAILED, graph.getState());

		// asking for a restart now must not change the state any more
		graph.restart(graph.getGlobalModVersion());

		assertEquals(JobStatus.FAILED, graph.getState());
	}
}
 
Example #9
Source File: ContinuousFileProcessingCheckpointITCase.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Records one received line, fails on duplicates, terminates the job once all expected lines
 * have arrived, and throws one artificial failure after at least two successful checkpoints.
 *
 * @param value the received line
 * @throws Exception a {@link SuppressRestartsException} wrapping a {@code SuccessException}
 *                   on completion, or a plain exception for the simulated task failure
 */
@Override
public void invoke(String value) throws Exception {
	final int sourceFileIndex = getFileIdx(value);

	// lazily create the set of lines collected for this file
	Set<String> linesOfFile = actualContent.get(sourceFileIndex);
	if (linesOfFile == null) {
		linesOfFile = new HashSet<>();
		actualContent.put(sourceFileIndex, linesOfFile);
	}

	// a line that is already in the set has been delivered twice
	final boolean isNewLine = linesOfFile.add(value + "\n");
	if (!isNewLine) {
		fail("Duplicate line: " + value);
		// NOTE(review): presumably unreachable if fail(...) always throws — confirm
		System.exit(0);
	}

	elementCounter++;

	// every expected line was received: publish the content and end the job without restart
	if (elementCounter >= NO_OF_FILES * LINES_PER_FILE) {
		actualCollectedContent = actualContent;
		throw new SuppressRestartsException(new SuccessException());
	}

	// slow down until two checkpoints have succeeded (only before the simulated failure)
	if (!hasRestoredAfterFailure && successfulCheckpoints < 2) {
		Thread.sleep(5);
	}

	// inject the task failure once enough checkpoints and elements have been seen
	if (!hasRestoredAfterFailure && successfulCheckpoints >= 2 && elementCounter >= elementsToFailure) {
		throw new Exception("Task Failure @ elem: " + elementCounter + " / " + elementsToFailure);
	}
}
 
Example #10
Source File: ThrowableClassifierTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Checks that {@code ThrowableClassifier} classifies a {@link SuppressRestartsException} and a
 * {@code NoResourceAvailableException} as {@code ThrowableType.NonRecoverableError}.
 */
@Test
public void testThrowableType_NonRecoverable() {
	final ThrowableType typeOfSuppressRestarts =
		ThrowableClassifier.getThrowableType(new SuppressRestartsException(new Exception("")));
	assertEquals(ThrowableType.NonRecoverableError, typeOfSuppressRestarts);

	final ThrowableType typeOfNoResource =
		ThrowableClassifier.getThrowableType(new NoResourceAvailableException());
	assertEquals(ThrowableType.NonRecoverableError, typeOfNoResource);
}
 
Example #11
Source File: ExecutionFailureHandlerTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Verifies which throwables {@code ExecutionFailureHandler.isUnrecoverableError} reports as
 * unrecoverable: not a plain exception, but a {@link SuppressRestartsException} both at the
 * top level and as a nested cause.
 */
@Test
public void testUnrecoverableErrorCheck() {
	final Exception plain = new Exception();
	final SuppressRestartsException topLevel = new SuppressRestartsException(new Exception());
	final Exception nested = new Exception(new SuppressRestartsException(new Exception()));

	// plain exception: recoverable
	assertFalse(ExecutionFailureHandler.isUnrecoverableError(plain));

	// SuppressRestartsException itself: unrecoverable
	assertTrue(ExecutionFailureHandler.isUnrecoverableError(topLevel));

	// SuppressRestartsException as cause: unrecoverable
	assertTrue(ExecutionFailureHandler.isUnrecoverableError(nested));
}
 
Example #12
Source File: ExecutionGraphRestartTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Checks the handling of global failures while the job is RESTARTING: an ordinary exception
 * keeps the job in RESTARTING (with a new global mod version and failure cause), whereas a
 * {@link SuppressRestartsException} moves it to FAILED, where a later restart call has no effect.
 */
@Test
public void testFailWhileRestarting() throws Exception {
	try (SlotPool pool = createSlotPoolImpl()) {
		final TaskManagerLocation tmLocation = new LocalTaskManagerLocation();
		final ExecutionGraph graph = TestingExecutionGraphBuilder.newBuilder()
			.setRestartStrategy(new InfiniteDelayRestartStrategy())
			.setTaskManagerLocation(tmLocation)
			.buildAndScheduleForExecution(pool);

		// releasing the TaskManager must bring the job into RESTARTING
		final Exception releaseCause = new Exception("Test Exception");
		pool.releaseTaskManager(tmLocation.getResourceID(), releaseCause);

		assertEquals(JobStatus.RESTARTING, graph.getState());

		// a regular failure during RESTARTING triggers another restart attempt
		final long modVersionBeforeFailure = graph.getGlobalModVersion();
		final Exception regularFailure = new Exception("Test exception");
		graph.failGlobal(regularFailure);

		assertNotEquals(modVersionBeforeFailure, graph.getGlobalModVersion());
		assertEquals(JobStatus.RESTARTING, graph.getState());
		// the failure cause must now be the most recent exception
		assertEquals(regularFailure, graph.getFailureCause());

		// a restart-suppressing failure, in contrast, fails the job
		final SuppressRestartsException suppressingFailure =
			new SuppressRestartsException(new Exception("Suppress restart exception"));
		graph.failGlobal(suppressingFailure);

		assertEquals(JobStatus.FAILED, graph.getState());

		// asking for a restart now must not change the state any more
		graph.restart(graph.getGlobalModVersion());

		assertEquals(JobStatus.FAILED, graph.getState());
	}
}
 
Example #13
Source File: ThrowableClassifierTest.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Asserts that both a {@link SuppressRestartsException} and a
 * {@code NoResourceAvailableException} are classified as
 * {@code ThrowableType.NonRecoverableError}.
 */
@Test
public void testThrowableType_NonRecoverable() {
	final SuppressRestartsException suppressRestarts = new SuppressRestartsException(new Exception(""));
	final ThrowableType suppressRestartsType = ThrowableClassifier.getThrowableType(suppressRestarts);
	assertEquals(ThrowableType.NonRecoverableError, suppressRestartsType);

	final NoResourceAvailableException noResource = new NoResourceAvailableException();
	final ThrowableType noResourceType = ThrowableClassifier.getThrowableType(noResource);
	assertEquals(ThrowableType.NonRecoverableError, noResourceType);
}
 
Example #14
Source File: ConcurrentFailoverStrategyExecutionGraphTest.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Checks that a terminal global failure which arrives while a local failover is still
 * pending leaves the graph in a properly failed state.
 */
@Test
public void testGlobalFailureConcurrentToLocalFailover() throws Exception {

	// Scenario:
	//  1. start a job
	//  2. fail one task; the local recovery action is held back by a blocker future
	//  3. fail the job globally
	//  4. release the local recovery action
	//  5. verify that the graph ends up terminally failed and all slots are returned

	final int parallelism = 2;
	final JobID jobId = new JobID();

	final SimpleSlotProvider slots = new SimpleSlotProvider(jobId, parallelism);

	final ExecutionGraph executionGraph = createSampleGraph(
		jobId,
		TestRestartPipelinedRegionStrategy::new,
		TestRestartStrategy.directExecuting(),
		slots,
		parallelism);

	executionGraph.start(mainThreadExecutor);
	final TestRestartPipelinedRegionStrategy failoverStrategy =
		(TestRestartPipelinedRegionStrategy) executionGraph.getFailoverStrategy();

	// the failover strategy stays blocked until this future is completed
	final CompletableFuture<?> recoveryGate = new CompletableFuture<>();
	failoverStrategy.setBlockerFuture(recoveryGate);

	final ExecutionJobVertex jobVertex = executionGraph.getVerticesTopologically().iterator().next();
	final ExecutionVertex[] subtasks = jobVertex.getTaskVertices();
	final ExecutionVertex failedVertex = subtasks[0];
	final ExecutionVertex canceledVertex = subtasks[1];

	executionGraph.scheduleForExecution();
	assertEquals(JobStatus.RUNNING, executionGraph.getState());

	// a single vertex failure starts a local recovery action
	final Exception localFailure = new Exception("test failure");
	failedVertex.getCurrentExecutionAttempt().fail(localFailure);
	assertEquals(ExecutionState.FAILED, failedVertex.getCurrentExecutionAttempt().getState());

	// the job itself keeps RUNNING while the recovery action is still pending
	assertEquals(JobStatus.RUNNING, executionGraph.getState());

	// fail the whole job with a cause that suppresses restarts
	final SuppressRestartsException globalFailure =
		new SuppressRestartsException(new Exception("test exception"));
	executionGraph.failGlobal(globalFailure);

	assertEquals(JobStatus.FAILING, executionGraph.getState());
	assertEquals(ExecutionState.FAILED, failedVertex.getCurrentExecutionAttempt().getState());
	assertEquals(ExecutionState.CANCELING, canceledVertex.getCurrentExecutionAttempt().getState());

	// unblock the held-back recovery action
	recoveryGate.complete(null);

	// acknowledge the cancellation of the second vertex
	canceledVertex.getCurrentExecutionAttempt().completeCancelling();

	assertEquals(JobStatus.FAILED, executionGraph.getState());
	assertTrue(failedVertex.getCurrentExecutionAttempt().getState().isTerminal());
	assertTrue(canceledVertex.getCurrentExecutionAttempt().getState().isTerminal());

	// every slot must be available again
	assertEquals(parallelism, slots.getNumberOfAvailableSlots());
}
 
Example #15
Source File: ExecutionGraph.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Try to restart the job. If we cannot restart the job (e.g. no more restarts allowed), then
 * try to fail the job. This operation is only permitted if the current state is FAILING or
 * RESTARTING.
 *
 * <p>A restart is attempted only if the recorded failure cause is not a
 * {@link SuppressRestartsException} and the restart strategy reports that it can restart;
 * otherwise the job is transitioned to FAILED.
 *
 * @param globalModVersionForRestart version passed to the {@code ExecutionGraphRestartCallback}
 *                                   that is handed to the restart strategy
 * @return true if the operation could be executed; false if the state is neither FAILING nor
 *         RESTARTING, or if a concurrent job status change occurred
 */
private boolean tryRestartOrFail(long globalModVersionForRestart) {
	// snapshot of the state; used as the expected source state for transitionState below
	JobStatus currentState = state;

	if (currentState == JobStatus.FAILING || currentState == JobStatus.RESTARTING) {
		// local snapshot of the failure cause, taken before entering the progressLock section
		final Throwable failureCause = this.failureCause;

		synchronized (progressLock) {
			// only the debug variant passes the failure cause to the logger
			if (LOG.isDebugEnabled()) {
				LOG.debug("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID(), failureCause);
			} else {
				LOG.info("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID());
			}

			// both conditions must hold for a restart: the cause does not suppress restarts
			// and the restart strategy still permits one
			final boolean isFailureCauseAllowingRestart = !(failureCause instanceof SuppressRestartsException);
			final boolean isRestartStrategyAllowingRestart = restartStrategy.canRestart();
			boolean isRestartable = isFailureCauseAllowingRestart && isRestartStrategyAllowingRestart;

			if (isRestartable && transitionState(currentState, JobStatus.RESTARTING)) {
				LOG.info("Restarting the job {} ({}).", getJobName(), getJobID());

				RestartCallback restarter = new ExecutionGraphRestartCallback(this, globalModVersionForRestart);
				// if the future returned by the restart strategy completes exceptionally,
				// the job is failed globally with that throwable
				FutureUtils.assertNoException(
					restartStrategy
						.restart(restarter, getJobMasterMainThreadExecutor())
						.exceptionally((throwable) -> {
							failGlobal(throwable);
							return null;
						}));
				return true;
			}
			else if (!isRestartable && transitionState(currentState, JobStatus.FAILED, failureCause)) {
				// each cause text is null when the corresponding condition did not block the restart
				final String cause1 = isFailureCauseAllowingRestart ? null :
					"a type of SuppressRestartsException was thrown";
				final String cause2 = isRestartStrategyAllowingRestart ? null :
					"the restart strategy prevented it";

				// NOTE(review): presumably concatenateWithAnd skips null arguments — confirm
				LOG.info("Could not restart the job {} ({}) because {}.", getJobName(), getJobID(),
					StringUtils.concatenateWithAnd(cause1, cause2), failureCause);
				onTerminalState(JobStatus.FAILED);

				return true;
			} else {
				// we must have changed the state concurrently, thus we cannot complete this operation
				return false;
			}
		}
	} else {
		// this operation is only allowed in the state FAILING or RESTARTING
		return false;
	}
}
 
Example #16
Source File: ExecutionGraph.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Try to restart the job. If we cannot restart the job (e.g. no more restarts allowed), then
 * try to fail the job. This operation is only permitted if the current state is FAILING or
 * RESTARTING.
 *
 * <p>When {@code isLegacyScheduling()} is false the method does nothing and returns true.
 * Otherwise a restart is attempted only if the recorded failure cause is not a
 * {@link SuppressRestartsException} and the restart strategy reports that it can restart;
 * if not, the job is transitioned to FAILED.
 *
 * @param globalModVersionForRestart version passed to the {@code ExecutionGraphRestartCallback}
 *                                   that is handed to the restart strategy
 * @return true if the operation could be executed (or legacy scheduling is not active); false
 *         if the state is neither FAILING nor RESTARTING, or if a concurrent job status change
 *         occurred
 */
@Deprecated
private boolean tryRestartOrFail(long globalModVersionForRestart) {
	// outside of legacy scheduling this method is a no-op that reports success
	if (!isLegacyScheduling()) {
		return true;
	}

	// snapshot of the state; used as the expected source state for transitionState below
	JobStatus currentState = state;

	if (currentState == JobStatus.FAILING || currentState == JobStatus.RESTARTING) {
		// local snapshot of the failure cause used for the decision and the log output
		final Throwable failureCause = this.failureCause;

		// only the debug variant passes the failure cause to the logger
		if (LOG.isDebugEnabled()) {
			LOG.debug("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID(), failureCause);
		} else {
			LOG.info("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID());
		}

		// both conditions must hold for a restart: the cause does not suppress restarts
		// and the restart strategy still permits one
		final boolean isFailureCauseAllowingRestart = !(failureCause instanceof SuppressRestartsException);
		final boolean isRestartStrategyAllowingRestart = restartStrategy.canRestart();
		boolean isRestartable = isFailureCauseAllowingRestart && isRestartStrategyAllowingRestart;

		if (isRestartable && transitionState(currentState, JobStatus.RESTARTING)) {
			LOG.info("Restarting the job {} ({}).", getJobName(), getJobID());

			RestartCallback restarter = new ExecutionGraphRestartCallback(this, globalModVersionForRestart);
			// if the future returned by the restart strategy completes exceptionally,
			// the job is failed globally with that throwable
			FutureUtils.assertNoException(
				restartStrategy
					.restart(restarter, getJobMasterMainThreadExecutor())
					.exceptionally((throwable) -> {
							failGlobal(throwable);
							return null;
						}));
			return true;
		}
		else if (!isRestartable && transitionState(currentState, JobStatus.FAILED, failureCause)) {
			// each cause text is null when the corresponding condition did not block the restart
			final String cause1 = isFailureCauseAllowingRestart ? null :
				"a type of SuppressRestartsException was thrown";
			final String cause2 = isRestartStrategyAllowingRestart ? null :
				"the restart strategy prevented it";

			// NOTE(review): presumably concatenateWithAnd skips null arguments — confirm
			LOG.info("Could not restart the job {} ({}) because {}.", getJobName(), getJobID(),
				StringUtils.concatenateWithAnd(cause1, cause2), failureCause);
			onTerminalState(JobStatus.FAILED);

			return true;
		} else {
			// we must have changed the state concurrently, thus we cannot complete this operation
			return false;
		}
	} else {
		// this operation is only allowed in the state FAILING or RESTARTING
		return false;
	}
}
 
Example #17
Source File: ThrowableClassifierTest.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Asserts that a {@link SuppressRestartsException} is classified as
 * {@code ThrowableType.NonRecoverableError}.
 */
@Test
public void testThrowableType_NonRecoverable() {
	final SuppressRestartsException suppressRestarts = new SuppressRestartsException(new Exception(""));
	final ThrowableType classifiedType = ThrowableClassifier.getThrowableType(suppressRestarts);

	assertEquals(ThrowableType.NonRecoverableError, classifiedType);
}
 
Example #18
Source File: ExecutionGraphRestartTest.java    From Flink-CEPplus with Apache License 2.0 4 votes vote down vote up
/**
 * Checks the handling of global failures while the job is RESTARTING: an ordinary exception
 * keeps the job in RESTARTING (with a new global mod version and failure cause), whereas a
 * {@link SuppressRestartsException} moves it to FAILED, where a later restart call has no effect.
 */
@Test
public void testFailWhileRestarting() throws Exception {
	final Scheduler slotScheduler = new Scheduler(TestingUtils.defaultExecutionContext());

	final ActorTaskManagerGateway taskManagerGateway = new ActorTaskManagerGateway(
		new SimpleActorGateway(TestingUtils.directExecutionContext()));
	final Instance taskManagerInstance = ExecutionGraphTestUtils.getInstance(taskManagerGateway, NUM_TASKS);

	slotScheduler.newInstanceAvailable(taskManagerInstance);

	// the infinite-delay strategy lets the test control restart and delay manually
	final ExecutionGraph graph = new ExecutionGraph(
		TestingUtils.defaultExecutor(),
		TestingUtils.defaultExecutor(),
		new JobID(),
		"TestJob",
		new Configuration(),
		new SerializedValue<>(new ExecutionConfig()),
		AkkaUtils.getDefaultTimeout(),
		new InfiniteDelayRestartStrategy(),
		slotScheduler);

	graph.start(TestingComponentMainThreadExecutorServiceAdapter.forMainThread());

	// a single vertex running NoOpInvokable with NUM_TASKS parallel subtasks
	final JobVertex noOpVertex = new JobVertex("NoOpInvokable");
	noOpVertex.setInvokableClass(NoOpInvokable.class);
	noOpVertex.setParallelism(NUM_TASKS);

	final JobGraph testJob = new JobGraph("TestJob", noOpVertex);

	graph.attachJobGraph(testJob.getVerticesSortedTopologicallyFromSources());

	assertEquals(JobStatus.CREATED, graph.getState());

	graph.scheduleForExecution();

	assertEquals(JobStatus.RUNNING, graph.getState());

	// marking the instance dead must bring the job into RESTARTING
	taskManagerInstance.markDead();

	assertEquals(JobStatus.RESTARTING, graph.getState());

	// a regular failure during RESTARTING triggers another restart attempt
	final long modVersionBeforeFailure = graph.getGlobalModVersion();
	final Exception regularFailure = new Exception("Test exception");
	graph.failGlobal(regularFailure);

	assertNotEquals(modVersionBeforeFailure, graph.getGlobalModVersion());
	assertEquals(JobStatus.RESTARTING, graph.getState());
	// the failure cause must now be the most recent exception
	assertEquals(regularFailure, graph.getFailureCause());

	// a restart-suppressing failure, in contrast, fails the job
	final SuppressRestartsException suppressingFailure =
		new SuppressRestartsException(new Exception("Suppress restart exception"));
	graph.failGlobal(suppressingFailure);

	assertEquals(JobStatus.FAILED, graph.getState());

	// asking for a restart now must not change the state any more
	graph.restart(graph.getGlobalModVersion());

	assertEquals(JobStatus.FAILED, graph.getState());
}
 
Example #19
Source File: ConcurrentFailoverStrategyExecutionGraphTest.java    From Flink-CEPplus with Apache License 2.0 4 votes vote down vote up
/**
 * Verifies that a terminal global failure which happens concurrently to a local failover
 * still results in a properly failed execution graph.
 */
@Test
public void testGlobalFailureConcurrentToLocalFailover() throws Exception {

	// Test outline:
	//  1. start a job
	//  2. fail one task; its local recovery action is held back by a blocker future
	//  3. fail the job globally
	//  4. let the local recovery action continue
	//  5. verify that the graph is terminally failed and all slots were returned

	final int parallelism = 2;
	final JobID jobId = new JobID();

	final SimpleSlotProvider availableSlots = new SimpleSlotProvider(jobId, parallelism);

	final ExecutionGraph eg = createSampleGraph(
		jobId,
		TestRestartPipelinedRegionStrategy::new,
		TestRestartStrategy.directExecuting(),
		availableSlots,
		parallelism);

	eg.start(mainThreadExecutor);
	final TestRestartPipelinedRegionStrategy regionStrategy =
		(TestRestartPipelinedRegionStrategy) eg.getFailoverStrategy();

	// completing this future releases the blocked failover strategy
	final CompletableFuture<?> failoverBlocker = new CompletableFuture<>();
	regionStrategy.setBlockerFuture(failoverBlocker);

	final ExecutionJobVertex firstJobVertex = eg.getVerticesTopologically().iterator().next();
	final ExecutionVertex[] taskVertices = firstJobVertex.getTaskVertices();
	final ExecutionVertex locallyFailedVertex = taskVertices[0];
	final ExecutionVertex otherVertex = taskVertices[1];

	eg.scheduleForExecution();
	assertEquals(JobStatus.RUNNING, eg.getState());

	// failing one vertex kicks off a local recovery action
	final Exception taskFailure = new Exception("test failure");
	locallyFailedVertex.getCurrentExecutionAttempt().fail(taskFailure);
	assertEquals(ExecutionState.FAILED, locallyFailedVertex.getCurrentExecutionAttempt().getState());

	// the job stays RUNNING because the recovery action has not run yet
	assertEquals(JobStatus.RUNNING, eg.getState());

	// global failure with a cause that suppresses restarts
	final SuppressRestartsException terminalFailure =
		new SuppressRestartsException(new Exception("test exception"));
	eg.failGlobal(terminalFailure);

	assertEquals(JobStatus.FAILING, eg.getState());
	assertEquals(ExecutionState.FAILED, locallyFailedVertex.getCurrentExecutionAttempt().getState());
	assertEquals(ExecutionState.CANCELING, otherVertex.getCurrentExecutionAttempt().getState());

	// release the blocked recovery action
	failoverBlocker.complete(null);

	// report the finished cancellation of the remaining vertex
	otherVertex.getCurrentExecutionAttempt().completeCancelling();

	assertEquals(JobStatus.FAILED, eg.getState());
	assertTrue(locallyFailedVertex.getCurrentExecutionAttempt().getState().isTerminal());
	assertTrue(otherVertex.getCurrentExecutionAttempt().getState().isTerminal());

	// no slot may remain occupied
	assertEquals(parallelism, availableSlots.getNumberOfAvailableSlots());
}
 
Example #20
Source File: ExecutionGraph.java    From Flink-CEPplus with Apache License 2.0 4 votes vote down vote up
/**
 * Try to restart the job. If we cannot restart the job (e.g. no more restarts allowed), then
 * try to fail the job. This operation is only permitted if the current state is FAILING or
 * RESTARTING.
 *
 * <p>A restart is attempted only if the recorded failure cause is not a
 * {@link SuppressRestartsException} and the restart strategy reports that it can restart;
 * otherwise the job is transitioned to FAILED.
 *
 * @param globalModVersionForRestart version passed to the {@code ExecutionGraphRestartCallback}
 *                                   that is handed to the restart strategy
 * @return true if the operation could be executed; false if the state is neither FAILING nor
 *         RESTARTING, or if a concurrent job status change occurred
 */
private boolean tryRestartOrFail(long globalModVersionForRestart) {
	// snapshot of the state; used as the expected source state for transitionState below
	JobStatus currentState = state;

	if (currentState == JobStatus.FAILING || currentState == JobStatus.RESTARTING) {
		// local snapshot of the failure cause, taken before entering the progressLock section
		final Throwable failureCause = this.failureCause;

		synchronized (progressLock) {
			// only the debug variant passes the failure cause to the logger
			if (LOG.isDebugEnabled()) {
				LOG.debug("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID(), failureCause);
			} else {
				LOG.info("Try to restart or fail the job {} ({}) if no longer possible.", getJobName(), getJobID());
			}

			// both conditions must hold for a restart: the cause does not suppress restarts
			// and the restart strategy still permits one
			final boolean isFailureCauseAllowingRestart = !(failureCause instanceof SuppressRestartsException);
			final boolean isRestartStrategyAllowingRestart = restartStrategy.canRestart();
			boolean isRestartable = isFailureCauseAllowingRestart && isRestartStrategyAllowingRestart;

			if (isRestartable && transitionState(currentState, JobStatus.RESTARTING)) {
				LOG.info("Restarting the job {} ({}).", getJobName(), getJobID());

				RestartCallback restarter = new ExecutionGraphRestartCallback(this, globalModVersionForRestart);
				// the result of restart(...) is not inspected here
				restartStrategy.restart(restarter, getJobMasterMainThreadExecutor());

				return true;
			}
			else if (!isRestartable && transitionState(currentState, JobStatus.FAILED, failureCause)) {
				// each cause text is null when the corresponding condition did not block the restart
				final String cause1 = isFailureCauseAllowingRestart ? null :
					"a type of SuppressRestartsException was thrown";
				final String cause2 = isRestartStrategyAllowingRestart ? null :
					"the restart strategy prevented it";

				// NOTE(review): presumably concatenateWithAnd skips null arguments — confirm
				LOG.info("Could not restart the job {} ({}) because {}.", getJobName(), getJobID(),
					StringUtils.concatenateWithAnd(cause1, cause2), failureCause);
				onTerminalState(JobStatus.FAILED);

				return true;
			} else {
				// we must have changed the state concurrently, thus we cannot complete this operation
				return false;
			}
		}
	} else {
		// this operation is only allowed in the state FAILING or RESTARTING
		return false;
	}
}