org.apache.flink.runtime.checkpoint.CheckpointCoordinator Java Examples

The following examples show how to use org.apache.flink.runtime.checkpoint.CheckpointCoordinator. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ExecutionGraph.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Releases the checkpointing resources once the job has reached a terminal state and then
 * completes the termination future with that state.
 *
 * <p>The coordinator timer is shut down in a {@code finally} block so that an exception
 * thrown by the coordinator shutdown does not leave the timer running.
 *
 * @param status the terminal job status; passed to the coordinator shutdown and used to
 *               complete the termination future
 */
private void onTerminalState(JobStatus status) {
	try {
		// detach the coordinator from the field before shutting it down
		CheckpointCoordinator coord = this.checkpointCoordinator;
		this.checkpointCoordinator = null;
		try {
			if (coord != null) {
				coord.shutdown(status);
			}
		}
		finally {
			// must run even if the coordinator shutdown failed
			if (checkpointCoordinatorTimer != null) {
				checkpointCoordinatorTimer.shutdownNow();
				checkpointCoordinatorTimer = null;
			}
		}
	}
	catch (Exception e) {
		LOG.error("Error while cleaning up after execution", e);
	}
	finally {
		// the termination future is completed regardless of cleanup errors
		terminationFuture.complete(status);
	}
}
 
Example #2
Source File: SchedulerBase.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a new {@link ExecutionGraph} and, if it has a checkpoint coordinator, restores its
 * state: first from the latest checkpoint and, when that restore returns {@code false}, via
 * the savepoint restore settings of the job graph.
 *
 * @return the newly created execution graph
 * @throws Exception if creating or restoring the execution graph fails
 */
private ExecutionGraph createAndRestoreExecutionGraph(
	JobManagerJobMetricGroup currentJobManagerJobMetricGroup,
	ShuffleMaster<?> shuffleMaster,
	JobMasterPartitionTracker partitionTracker) throws Exception {

	final ExecutionGraph restoredGraph = createExecutionGraph(currentJobManagerJobMetricGroup, shuffleMaster, partitionTracker);
	final CheckpointCoordinator coordinator = restoredGraph.getCheckpointCoordinator();

	// without a coordinator there is nothing to restore
	if (coordinator == null) {
		return restoredGraph;
	}

	// look for a valid checkpoint first
	final boolean restoredFromCheckpoint = coordinator.restoreLatestCheckpointedStateToAll(
		new HashSet<>(restoredGraph.getAllVertices().values()),
		false);

	if (!restoredFromCheckpoint) {
		// no checkpoint was restored, fall back to the savepoint settings
		tryRestoreExecutionGraphFromSavepoint(restoredGraph, jobGraph.getSavepointRestoreSettings());
	}

	return restoredGraph;
}
 
Example #3
Source File: FailoverRegionTest.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Sends an acknowledge message to the checkpoint coordinator for every subtask index of the
 * job vertex of each given execution vertex, so that the expected checkpoint can complete.
 */
private void acknowledgeAllCheckpoints(CheckpointCoordinator checkpointCoordinator, Iterator<ExecutionVertex> executionVertexes) throws IOException, CheckpointException {
	while (executionVertexes.hasNext()) {
		final ExecutionVertex vertex = executionVertexes.next();

		for (int subtask = 0; subtask < vertex.getJobVertex().getParallelism(); subtask++) {
			final JobVertexID vertexId = vertex.getJobvertexId();

			// wrap a generated operator state handle into a task state snapshot for this subtask
			final OperatorStateHandle stateHandle = CheckpointCoordinatorTest.generatePartitionableStateHandle(vertexId, subtask, 2, 8, false);
			final OperatorSubtaskState subtaskState = new OperatorSubtaskState(stateHandle, null, null, null);
			final TaskStateSnapshot snapshot = new TaskStateSnapshot();
			snapshot.putSubtaskStateByOperatorID(OperatorID.fromJobVertexID(vertexId), subtaskState);

			final AcknowledgeCheckpoint ack = new AcknowledgeCheckpoint(
				vertex.getJobId(),
				vertex.getJobVertex().getTaskVertices()[subtask].getCurrentExecutionAttempt().getAttemptId(),
				checkpointId,
				new CheckpointMetrics(),
				snapshot);

			checkpointCoordinator.receiveAcknowledgeMessage(ack, "Unknown location");
		}
	}
}
 
Example #4
Source File: SchedulerBase.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Reloads the latest checkpointed state into the executions of the given vertices. Does
 * nothing when the execution graph has no checkpoint coordinator.
 *
 * @param vertices the execution vertices whose job vertices are restored
 * @param isGlobalRecovery {@code true} to restore state to all tasks, {@code false} to
 *                         restore only to the involved subtasks
 * @throws Exception if aborting pending checkpoints or restoring state fails
 */
protected void restoreState(final Set<ExecutionVertexID> vertices, final boolean isGlobalRecovery) throws Exception {
	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();
	if (coordinator != null) {
		// Pending checkpoints are aborted first in order to
		//   i) allow new checkpoints to be triggered without waiting for the last one to expire, and
		//  ii) keep the EXACTLY_ONCE semantics where needed.
		final CheckpointException abortCause = new CheckpointException(CheckpointFailureReason.JOB_FAILOVER_REGION);
		coordinator.abortPendingCheckpoints(abortCause);

		final Set<ExecutionJobVertex> jobVerticesToRestore = getInvolvedExecutionJobVertices(vertices);
		if (!isGlobalRecovery) {
			coordinator.restoreLatestCheckpointedStateToSubtasks(jobVerticesToRestore);
		} else {
			coordinator.restoreLatestCheckpointedStateToAll(jobVerticesToRestore, true);
		}
	}
}
 
Example #5
Source File: LegacyScheduler.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Hands a decline-checkpoint message to the checkpoint coordinator on the I/O executor.
 * When the execution graph has no coordinator the message is only logged: at error level
 * while the job is RUNNING, at debug level otherwise.
 *
 * @param decline the decline message received for a checkpoint
 */
@Override
public void declineCheckpoint(final DeclineCheckpoint decline) {
	mainThreadExecutor.assertRunningInMainThread();

	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();
	final String locationInfo = retrieveTaskManagerLocation(decline.getTaskExecutionId());

	if (coordinator == null) {
		final String errorMessage = "Received DeclineCheckpoint message for job {} with no CheckpointCoordinator";
		if (executionGraph.getState() != JobStatus.RUNNING) {
			log.debug(errorMessage, jobGraph.getJobID());
		} else {
			log.error(errorMessage, jobGraph.getJobID());
		}
		return;
	}

	// the coordinator call is moved off the main thread
	ioExecutor.execute(() -> {
		try {
			coordinator.receiveDeclineMessage(decline, locationInfo);
		} catch (Exception e) {
			log.error("Error in CheckpointCoordinator while processing {}", decline, e);
		}
	});
}
 
Example #6
Source File: SchedulerBase.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Processes a declined checkpoint. With a checkpoint coordinator present, the decline message
 * is passed to it asynchronously through the I/O executor; without one, the situation is
 * logged (error level if the job is RUNNING, debug level otherwise).
 *
 * @param decline the decline message to process
 */
@Override
public void declineCheckpoint(final DeclineCheckpoint decline) {
	mainThreadExecutor.assertRunningInMainThread();

	final CheckpointCoordinator coord = executionGraph.getCheckpointCoordinator();
	final String tmLocation = retrieveTaskManagerLocation(decline.getTaskExecutionId());

	if (coord == null) {
		final String errorMessage = "Received DeclineCheckpoint message for job {} with no CheckpointCoordinator";
		final boolean jobIsRunning = executionGraph.getState() == JobStatus.RUNNING;
		if (jobIsRunning) {
			log.error(errorMessage, jobGraph.getJobID());
		} else {
			log.debug(errorMessage, jobGraph.getJobID());
		}
		return;
	}

	ioExecutor.execute(() -> {
		try {
			coord.receiveDeclineMessage(decline, tmLocation);
		} catch (Exception e) {
			// failures in the coordinator are logged, not propagated
			log.error("Error in CheckpointCoordinator while processing {}", decline, e);
		}
	});
}
 
Example #7
Source File: LegacyScheduler.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a new {@link ExecutionGraph} and, if it has a checkpoint coordinator, tries to
 * restore the latest checkpointed state into it. When that restore returns {@code false},
 * a restore via the job graph's savepoint restore settings is attempted instead.
 *
 * @return the newly created execution graph
 * @throws Exception if creating or restoring the execution graph fails
 */
private ExecutionGraph createAndRestoreExecutionGraph(
		JobManagerJobMetricGroup currentJobManagerJobMetricGroup,
		ShuffleMaster<?> shuffleMaster,
		PartitionTracker partitionTracker) throws Exception {

	final ExecutionGraph graph = createExecutionGraph(currentJobManagerJobMetricGroup, shuffleMaster, partitionTracker);
	final CheckpointCoordinator coordinator = graph.getCheckpointCoordinator();

	if (coordinator == null) {
		// nothing to restore without a coordinator
		return graph;
	}

	// check for a valid checkpoint first
	final boolean checkpointRestored = coordinator.restoreLatestCheckpointedState(
		graph.getAllVertices(),
		false,
		false);

	if (!checkpointRestored) {
		// otherwise check whether a savepoint can be restored
		tryRestoreExecutionGraphFromSavepoint(graph, jobGraph.getSavepointRestoreSettings());
	}

	return graph;
}
 
Example #8
Source File: SchedulerTestingUtils.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Acknowledges the single pending checkpoint of the given scheduler for all of its current
 * execution attempts.
 *
 * <p>Asserts that the coordinator has exactly one pending checkpoint, then polls until that
 * checkpoint reports no non-acknowledged operator coordinators before sending the
 * acknowledgements through the scheduler.
 *
 * @param scheduler the scheduler whose pending checkpoint is acknowledged
 */
public static void acknowledgeCurrentCheckpoint(DefaultScheduler scheduler) {
	final CheckpointCoordinator checkpointCoordinator = getCheckpointCoordinator(scheduler);
	assertEquals("Coordinator does not have exactly one pending checkpoint", 1, checkpointCoordinator.getNumberOfPendingCheckpoints());

	final PendingCheckpoint pc = checkpointCoordinator.getPendingCheckpoints().values().iterator().next();

	// Because of races against the async thread in the coordinator, we need to wait here until
	// the coordinator state is acknowledged. This can be removed once the CheckpointCoordinator
	// executes all actions in the Scheduler's main thread executor.
	while (pc.getNumberOfNonAcknowledgedOperatorCoordinators() > 0) {
		try {
			Thread.sleep(1);
		} catch (InterruptedException e) {
			// restore the interrupt flag before failing the test
			Thread.currentThread().interrupt();
			fail("interrupted");
		}
	}

	getAllCurrentExecutionAttempts(scheduler).forEach(
		(attemptId) -> scheduler.acknowledgeCheckpoint(pc.getJobId(), attemptId, pc.getCheckpointId(), new CheckpointMetrics(), null));
}
 
Example #9
Source File: DefaultSchedulerTest.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that the number of pending checkpoints drops from one to zero after the only task
 * is reported as FAILED and the scheduled restart tasks are triggered.
 */
@Test
public void abortPendingCheckpointsWhenRestartingTasks() throws Exception {
	final JobGraph jobGraph = singleNonParallelJobVertexJobGraph();
	enableCheckpointing(jobGraph);

	final CountDownLatch triggeredLatch = getCheckpointTriggeredLatch();

	final DefaultScheduler scheduler = createSchedulerAndStartScheduling(jobGraph);

	// report the single execution attempt as RUNNING
	final ArchivedExecutionVertex vertex = Iterables.getOnlyElement(scheduler.requestJob().getAllExecutionVertices());
	final ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
	final TaskExecutionState runningState = new TaskExecutionState(jobGraph.getJobID(), attemptId, ExecutionState.RUNNING);
	scheduler.updateTaskExecutionState(runningState);

	final CheckpointCoordinator coordinator = getCheckpointCoordinator(scheduler);

	// trigger one checkpoint and wait until it has been triggered
	coordinator.triggerCheckpoint(false);
	triggeredLatch.await();
	assertThat(coordinator.getNumberOfPendingCheckpoints(), is(equalTo(1)));

	// fail the task and run the scheduled restart
	final TaskExecutionState failedState = new TaskExecutionState(jobGraph.getJobID(), attemptId, ExecutionState.FAILED);
	scheduler.updateTaskExecutionState(failedState);
	taskRestartExecutor.triggerScheduledTasks();
	assertThat(coordinator.getNumberOfPendingCheckpoints(), is(equalTo(0)));
}
 
Example #10
Source File: JobMaster.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a new {@link ExecutionGraph} and, if it has a checkpoint coordinator, restores the
 * latest checkpointed state into it; when that restore returns {@code false}, a restore via
 * the job graph's savepoint restore settings is attempted.
 *
 * @param currentJobManagerJobMetricGroup metric group passed on to the graph creation
 * @return the newly created execution graph
 * @throws Exception if creating or restoring the execution graph fails
 */
private ExecutionGraph createAndRestoreExecutionGraph(JobManagerJobMetricGroup currentJobManagerJobMetricGroup) throws Exception {
	final ExecutionGraph graph = createExecutionGraph(currentJobManagerJobMetricGroup);
	final CheckpointCoordinator coordinator = graph.getCheckpointCoordinator();

	if (coordinator == null) {
		// nothing to restore without a coordinator
		return graph;
	}

	// check whether a valid checkpoint can be found
	final boolean checkpointRestored = coordinator.restoreLatestCheckpointedState(
		graph.getAllVertices(),
		false,
		false);

	if (!checkpointRestored) {
		// check whether a savepoint can be restored instead
		tryRestoreExecutionGraphFromSavepoint(graph, jobGraph.getSavepointRestoreSettings());
	}

	return graph;
}
 
Example #11
Source File: JobMaster.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Passes a decline-checkpoint message to the checkpoint coordinator via the RPC service's
 * executor. If the execution graph has no coordinator, the message is only logged: at error
 * level while the job is RUNNING, at debug level otherwise.
 *
 * @param decline the decline message to process
 */
@Override
public void declineCheckpoint(DeclineCheckpoint decline) {
	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();

	if (coordinator == null) {
		final String errorMessage = "Received DeclineCheckpoint message for job {} with no CheckpointCoordinator";
		if (executionGraph.getState() != JobStatus.RUNNING) {
			log.debug(errorMessage, jobGraph.getJobID());
		} else {
			log.error(errorMessage, jobGraph.getJobID());
		}
		return;
	}

	getRpcService().execute(() -> {
		try {
			coordinator.receiveDeclineMessage(decline);
		} catch (Exception e) {
			// coordinator failures are logged, not propagated
			log.error("Error in CheckpointCoordinator while processing {}", decline, e);
		}
	});
}
 
Example #12
Source File: SchedulerTestingUtils.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Triggers a checkpoint on the scheduler's coordinator, acknowledges it for the scheduler,
 * and returns the completed checkpoint.
 *
 * <p>Asserts that exactly one checkpoint is pending after triggering and that the checkpoint
 * has completed once the acknowledgements were sent.
 *
 * @param scheduler the scheduler to take a checkpoint for
 * @return the completed checkpoint
 */
public static CompletedCheckpoint takeCheckpoint(DefaultScheduler scheduler) throws Exception {
	final CheckpointCoordinator coordinator = getCheckpointCoordinator(scheduler);
	coordinator.triggerCheckpoint(false);

	assertEquals("test setup inconsistent", 1, coordinator.getNumberOfPendingCheckpoints());
	final PendingCheckpoint pending = coordinator.getPendingCheckpoints().values().iterator().next();
	final CompletableFuture<CompletedCheckpoint> completion = pending.getCompletionFuture();

	acknowledgePendingCheckpoint(scheduler, pending.getCheckpointId());

	// the future is read without blocking; null means it has not completed
	final CompletedCheckpoint result = completion.getNow(null);
	assertNotNull("checkpoint not complete", result);
	return result;
}
 
Example #13
Source File: SchedulerTestingUtils.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Sends an acknowledge message for the given checkpoint to the scheduler's checkpoint
 * coordinator, once for each current execution attempt of the scheduler.
 *
 * @param scheduler the scheduler whose coordinator receives the acknowledgements
 * @param checkpointId the ID of the checkpoint to acknowledge
 */
public static void acknowledgePendingCheckpoint(final DefaultScheduler scheduler, final long checkpointId) throws CheckpointException {
	final CheckpointCoordinator coordinator = getCheckpointCoordinator(scheduler);
	final JobID jobId = scheduler.getJobId();

	for (ExecutionAttemptID attempt : getAllCurrentExecutionAttempts(scheduler)) {
		coordinator.receiveAcknowledgeMessage(
			new AcknowledgeCheckpoint(jobId, attempt, checkpointId),
			"Unknown location");
	}
}
 
Example #14
Source File: DefaultSchedulerTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Completes one checkpoint with a master hook that fails on restore, fails the only task,
 * and checks which vertices get deployed before and after the hook stops failing.
 */
@Test
public void failGlobalWhenRestoringStateFails() throws Exception {
	final JobGraph jobGraph = singleNonParallelJobVertexJobGraph();
	final JobVertex jobVertex = getOnlyJobVertex(jobGraph);
	enableCheckpointing(jobGraph);

	final CountDownLatch triggeredLatch = getCheckpointTriggeredLatch();

	final DefaultScheduler scheduler = createSchedulerAndStartScheduling(jobGraph);

	// report the single execution attempt as RUNNING
	final ArchivedExecutionVertex vertex = Iterables.getOnlyElement(scheduler.requestJob().getAllExecutionVertices());
	final ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
	final TaskExecutionState runningState = new TaskExecutionState(jobGraph.getJobID(), attemptId, ExecutionState.RUNNING);
	scheduler.updateTaskExecutionState(runningState);

	final CheckpointCoordinator coordinator = getCheckpointCoordinator(scheduler);

	// register a master hook that makes the state restore fail
	final TestMasterHook failingHook = TestMasterHook.fromId("testHook");
	failingHook.enableFailOnRestore();
	coordinator.addMasterHook(failingHook);

	// complete one checkpoint that can be restored from
	coordinator.triggerCheckpoint(false);
	triggeredLatch.await();
	final long checkpointId = coordinator.getPendingCheckpoints().keySet().iterator().next();
	acknowledgePendingCheckpoint(scheduler, checkpointId);

	final TaskExecutionState failedState = new TaskExecutionState(jobGraph.getJobID(), attemptId, ExecutionState.FAILED);
	scheduler.updateTaskExecutionState(failedState);
	taskRestartExecutor.triggerScheduledTasks();
	final List<ExecutionVertexID> deployed = testExecutionVertexOperations.getDeployedVertices();

	// the first task failover should be skipped on state restore failure
	final ExecutionVertexID onlyVertexId = new ExecutionVertexID(jobVertex.getID(), 0);
	assertThat(deployed, contains(onlyVertexId));

	// a global failure should be triggered on state restore failure
	failingHook.disableFailOnRestore();
	taskRestartExecutor.triggerScheduledTasks();
	assertThat(deployed, contains(onlyVertexId, onlyVertexId));
}
 
Example #15
Source File: SchedulerBase.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Wraps the acknowledgement data in an {@link AcknowledgeCheckpoint} message and passes it
 * to the checkpoint coordinator on the I/O executor. If the execution graph has no
 * coordinator, this is only logged: at error level while the job is RUNNING, at debug level
 * otherwise.
 */
@Override
public void acknowledgeCheckpoint(final JobID jobID, final ExecutionAttemptID executionAttemptID, final long checkpointId, final CheckpointMetrics checkpointMetrics, final TaskStateSnapshot checkpointState) {
	mainThreadExecutor.assertRunningInMainThread();

	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();
	final AcknowledgeCheckpoint acknowledgement = new AcknowledgeCheckpoint(
		jobID,
		executionAttemptID,
		checkpointId,
		checkpointMetrics,
		checkpointState);

	final String locationInfo = retrieveTaskManagerLocation(executionAttemptID);

	if (coordinator == null) {
		final String errorMessage = "Received AcknowledgeCheckpoint message for job {} with no CheckpointCoordinator";
		if (executionGraph.getState() != JobStatus.RUNNING) {
			log.debug(errorMessage, jobGraph.getJobID());
		} else {
			log.error(errorMessage, jobGraph.getJobID());
		}
		return;
	}

	// the coordinator call is moved off the main thread
	ioExecutor.execute(() -> {
		try {
			coordinator.receiveAcknowledgeMessage(acknowledgement, locationInfo);
		} catch (Throwable t) {
			log.warn("Error while processing checkpoint acknowledgement message", t);
		}
	});
}
 
Example #16
Source File: SchedulerBase.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Starts the coordinator's checkpoint scheduler if periodic checkpointing is configured.
 * An {@link IllegalStateException} from the start call is ignored.
 *
 * @param checkpointCoordinator the coordinator whose scheduler is started
 */
private void startCheckpointScheduler(final CheckpointCoordinator checkpointCoordinator) {
	mainThreadExecutor.assertRunningInMainThread();

	if (!checkpointCoordinator.isPeriodicCheckpointingConfigured()) {
		return;
	}

	try {
		checkpointCoordinator.startCheckpointScheduler();
	} catch (IllegalStateException ignored) {
		// the coordinator was shut down concurrently
	}
}
 
Example #17
Source File: SchedulerBase.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Triggers a savepoint and returns a future with its external pointer. With
 * {@code cancelJob} set, the checkpoint scheduler is stopped first, the job is cancelled
 * after a successful savepoint, and the checkpoint scheduler is started again after a
 * failed one.
 *
 * @param targetDirectory the savepoint target directory; may be {@code null} if the
 *                        checkpoint storage has a default savepoint location
 * @param cancelJob whether to cancel the job once the savepoint has been stored
 * @return a future completed with the savepoint path, or exceptionally if the savepoint failed
 * @throws IllegalStateException if there is no checkpoint coordinator, or if neither a target
 *                               directory nor a default savepoint location is available
 */
@Override
public CompletableFuture<String> triggerSavepoint(final String targetDirectory, final boolean cancelJob) {
	mainThreadExecutor.assertRunningInMainThread();

	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();
	if (coordinator == null) {
		throw new IllegalStateException(
			String.format("Job %s is not a streaming job.", jobGraph.getJobID()));
	}

	if (targetDirectory == null && !coordinator.getCheckpointStorage().hasDefaultSavepointLocation()) {
		log.info("Trying to cancel job {} with savepoint, but no savepoint directory configured.", jobGraph.getJobID());

		throw new IllegalStateException(
			"No savepoint directory configured. You can either specify a directory " +
				"while cancelling via -s :targetDirectory or configure a cluster-wide " +
				"default via key '" + CheckpointingOptions.SAVEPOINT_DIRECTORY.key() + "'.");
	}

	log.info("Triggering {}savepoint for job {}.", cancelJob ? "cancel-with-" : "", jobGraph.getJobID());

	if (cancelJob) {
		coordinator.stopCheckpointScheduler();
	}

	return coordinator
		.triggerSavepoint(targetDirectory)
		.thenApply(CompletedCheckpoint::getExternalPointer)
		.handleAsync((path, throwable) -> {
			if (throwable == null) {
				if (cancelJob) {
					log.info("Savepoint stored in {}. Now cancelling {}.", path, jobGraph.getJobID());
					cancel();
				}
				return path;
			}

			// the savepoint failed: start the scheduler that was stopped above, then propagate
			if (cancelJob) {
				startCheckpointScheduler(coordinator);
			}
			throw new CompletionException(throwable);
		}, mainThreadExecutor);
}
 
Example #18
Source File: DefaultSchedulerTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Completes one checkpoint with a registered master hook, fails the only task, and checks
 * that the hook's restore count is one after the scheduled restart tasks have run.
 */
@Test
public void restoreStateWhenRestartingTasks() throws Exception {
	final JobGraph jobGraph = singleNonParallelJobVertexJobGraph();
	enableCheckpointing(jobGraph);

	final CountDownLatch triggeredLatch = getCheckpointTriggeredLatch();

	final DefaultScheduler scheduler = createSchedulerAndStartScheduling(jobGraph);

	// report the single execution attempt as RUNNING
	final ArchivedExecutionVertex vertex = Iterables.getOnlyElement(scheduler.requestJob().getAllExecutionVertices());
	final ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
	final TaskExecutionState runningState = new TaskExecutionState(jobGraph.getJobID(), attemptId, ExecutionState.RUNNING);
	scheduler.updateTaskExecutionState(runningState);

	final CheckpointCoordinator coordinator = getCheckpointCoordinator(scheduler);

	// register a stateful master hook to help verify the state restore
	final TestMasterHook hook = TestMasterHook.fromId("testHook");
	coordinator.addMasterHook(hook);

	// complete one checkpoint that can be restored from
	coordinator.triggerCheckpoint(false);
	triggeredLatch.await();
	final long checkpointId = coordinator.getPendingCheckpoints().keySet().iterator().next();
	acknowledgePendingCheckpoint(scheduler, checkpointId);

	// fail the task and run the scheduled restart
	final TaskExecutionState failedState = new TaskExecutionState(jobGraph.getJobID(), attemptId, ExecutionState.FAILED);
	scheduler.updateTaskExecutionState(failedState);
	taskRestartExecutor.triggerScheduledTasks();
	assertThat(hook.getRestoreCount(), is(equalTo(1)));
}
 
Example #19
Source File: SchedulerBase.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Restores the given {@link ExecutionGraph} from a savepoint if the provided
 * {@link SavepointRestoreSettings} request a restore and the graph has a checkpoint coordinator.
 *
 * @param executionGraphToRestore {@link ExecutionGraph} which is supposed to be restored
 * @param savepointRestoreSettings {@link SavepointRestoreSettings} containing information about the savepoint to restore from
 * @throws Exception if the {@link ExecutionGraph} could not be restored
 */
private void tryRestoreExecutionGraphFromSavepoint(ExecutionGraph executionGraphToRestore, SavepointRestoreSettings savepointRestoreSettings) throws Exception {
	if (!savepointRestoreSettings.restoreSavepoint()) {
		return;
	}

	final CheckpointCoordinator coordinator = executionGraphToRestore.getCheckpointCoordinator();
	if (coordinator == null) {
		return;
	}

	coordinator.restoreSavepoint(
		savepointRestoreSettings.getRestorePath(),
		savepointRestoreSettings.allowNonRestoredState(),
		executionGraphToRestore.getAllVertices(),
		userCodeLoader);
}
 
Example #20
Source File: JobMaster.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Wraps the acknowledgement data in an {@link AcknowledgeCheckpoint} message and passes it
 * to the checkpoint coordinator via the RPC service's executor. If the execution graph has
 * no coordinator, this is only logged: at error level while the job is RUNNING, at debug
 * level otherwise.
 */
@Override
public void acknowledgeCheckpoint(
		final JobID jobID,
		final ExecutionAttemptID executionAttemptID,
		final long checkpointId,
		final CheckpointMetrics checkpointMetrics,
		final TaskStateSnapshot checkpointState) {

	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();
	final AcknowledgeCheckpoint acknowledgement = new AcknowledgeCheckpoint(
		jobID,
		executionAttemptID,
		checkpointId,
		checkpointMetrics,
		checkpointState);

	if (coordinator == null) {
		final String errorMessage = "Received AcknowledgeCheckpoint message for job {} with no CheckpointCoordinator";
		if (executionGraph.getState() != JobStatus.RUNNING) {
			log.debug(errorMessage, jobGraph.getJobID());
		} else {
			log.error(errorMessage, jobGraph.getJobID());
		}
		return;
	}

	getRpcService().execute(() -> {
		try {
			coordinator.receiveAcknowledgeMessage(acknowledgement);
		} catch (Throwable t) {
			// failures are logged as warnings, not propagated
			log.warn("Error while processing checkpoint acknowledgement message", t);
		}
	});
}
 
Example #21
Source File: AdaptedRestartPipelinedRegionStrategyNGAbortPendingCheckpointsTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Triggers a checkpoint, lets the first vertex acknowledge it, fails that vertex, and checks
 * that no checkpoint is pending once the scheduled main-thread tasks have run.
 */
@Test
public void abortPendingCheckpointsWhenRestartingTasks() throws Exception {
	final JobGraph jobGraph = createStreamingJobGraph();
	final ExecutionGraph graph = createExecutionGraph(jobGraph);

	final Iterator<ExecutionVertex> vertices = graph.getAllExecutionVertices().iterator();
	final ExecutionVertex firstVertex = vertices.next();

	setTasksRunning(graph, firstVertex, vertices.next());

	final CheckpointCoordinator coordinator = graph.getCheckpointCoordinator();
	checkState(coordinator != null);

	coordinator.triggerCheckpoint(System.currentTimeMillis(), false);
	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
	final long checkpointId = coordinator.getPendingCheckpoints().keySet().iterator().next();

	final AcknowledgeCheckpoint firstVertexAck = new AcknowledgeCheckpoint(
		jobGraph.getJobID(),
		firstVertex.getCurrentExecutionAttempt().getAttemptId(),
		checkpointId);

	// Let the first vertex acknowledge the checkpoint and fail it afterwards;
	// the failover strategy should then cancel all pending checkpoints on restart.
	coordinator.receiveAcknowledgeMessage(firstVertexAck, "Unknown location");
	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

	failVertex(firstVertex);
	// still pending until the scheduled tasks have been triggered
	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
	manualMainThreadExecutor.triggerScheduledTasks();

	assertNoPendingCheckpoints(coordinator);
}
 
Example #22
Source File: JobMaster.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Triggers a savepoint and returns a future with its external pointer. With
 * {@code cancelJob} set, the checkpoint scheduler is stopped first, the job is cancelled
 * after a successful savepoint, and the checkpoint scheduler is started again after a
 * failed one.
 *
 * <p>If there is no checkpoint coordinator, or neither a target directory nor a default
 * savepoint location is available, an exceptionally completed future is returned.
 *
 * @param targetDirectory the savepoint target directory, may be {@code null}
 * @param cancelJob whether to cancel the job once the savepoint has been stored
 * @param timeout the timeout passed on to the cancel call
 * @return a future completed with the savepoint path, or exceptionally on failure
 */
@Override
public CompletableFuture<String> triggerSavepoint(
		@Nullable final String targetDirectory,
		final boolean cancelJob,
		final Time timeout) {

	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();
	if (coordinator == null) {
		return FutureUtils.completedExceptionally(new IllegalStateException(
			String.format("Job %s is not a streaming job.", jobGraph.getJobID())));
	}

	if (targetDirectory == null && !coordinator.getCheckpointStorage().hasDefaultSavepointLocation()) {
		log.info("Trying to cancel job {} with savepoint, but no savepoint directory configured.", jobGraph.getJobID());

		return FutureUtils.completedExceptionally(new IllegalStateException(
			"No savepoint directory configured. You can either specify a directory " +
				"while cancelling via -s :targetDirectory or configure a cluster-wide " +
				"default via key '" + CheckpointingOptions.SAVEPOINT_DIRECTORY.key() + "'."));
	}

	if (cancelJob) {
		coordinator.stopCheckpointScheduler();
	}

	return coordinator
		.triggerSavepoint(System.currentTimeMillis(), targetDirectory)
		.thenApply(CompletedCheckpoint::getExternalPointer)
		.handleAsync((path, throwable) -> {
			if (throwable == null) {
				if (cancelJob) {
					log.info("Savepoint stored in {}. Now cancelling {}.", path, jobGraph.getJobID());
					cancel(timeout);
				}
				return path;
			}

			// the savepoint failed: start the scheduler that was stopped above, then propagate
			if (cancelJob) {
				startCheckpointScheduler(coordinator);
			}
			throw new CompletionException(throwable);
		}, getMainThreadExecutor());
}
 
Example #23
Source File: JobMaster.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Starts the coordinator's checkpoint scheduler if periodic checkpointing is configured.
 * An {@link IllegalStateException} from the start call is ignored.
 *
 * @param checkpointCoordinator the coordinator whose scheduler is started
 */
private void startCheckpointScheduler(final CheckpointCoordinator checkpointCoordinator) {
	if (!checkpointCoordinator.isPeriodicCheckpointingConfigured()) {
		return;
	}

	try {
		checkpointCoordinator.startCheckpointScheduler();
	} catch (IllegalStateException ignored) {
		// the coordinator was shut down concurrently
	}
}
 
Example #24
Source File: JobMaster.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Restores the given {@link ExecutionGraph} from a savepoint, provided that the
 * {@link SavepointRestoreSettings} ask for a restore and the graph has a checkpoint coordinator.
 *
 * @param executionGraphToRestore {@link ExecutionGraph} which is supposed to be restored
 * @param savepointRestoreSettings {@link SavepointRestoreSettings} containing information about the savepoint to restore from
 * @throws Exception if the {@link ExecutionGraph} could not be restored
 */
private void tryRestoreExecutionGraphFromSavepoint(ExecutionGraph executionGraphToRestore, SavepointRestoreSettings savepointRestoreSettings) throws Exception {
	if (!savepointRestoreSettings.restoreSavepoint()) {
		// no savepoint restore requested
		return;
	}

	final CheckpointCoordinator coordinator = executionGraphToRestore.getCheckpointCoordinator();
	if (coordinator != null) {
		coordinator.restoreSavepoint(
			savepointRestoreSettings.getRestorePath(),
			savepointRestoreSettings.allowNonRestoredState(),
			executionGraphToRestore.getAllVertices(),
			userCodeLoader);
	}
}
 
Example #25
Source File: ExecutionGraph.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Shuts down the checkpoint coordinator (if any) once the job has reached a terminal state
 * and completes the termination future with that state. Exceptions during the cleanup are
 * logged and do not prevent the future from being completed.
 *
 * @param status the terminal job status
 */
private void onTerminalState(JobStatus status) {
	try {
		// clear the field first, then shut down the detached instance
		final CheckpointCoordinator detachedCoordinator = this.checkpointCoordinator;
		this.checkpointCoordinator = null;
		if (detachedCoordinator != null) {
			detachedCoordinator.shutdown(status);
		}
	} catch (Exception e) {
		LOG.error("Error while cleaning up after execution", e);
	} finally {
		terminationFuture.complete(status);
	}
}
 
Example #26
Source File: LegacyScheduler.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Restores the given {@link ExecutionGraph} from a savepoint when the provided
 * {@link SavepointRestoreSettings} request it and the graph has a checkpoint coordinator;
 * otherwise does nothing.
 *
 * @param executionGraphToRestore {@link ExecutionGraph} which is supposed to be restored
 * @param savepointRestoreSettings {@link SavepointRestoreSettings} containing information about the savepoint to restore from
 * @throws Exception if the {@link ExecutionGraph} could not be restored
 */
private void tryRestoreExecutionGraphFromSavepoint(ExecutionGraph executionGraphToRestore, SavepointRestoreSettings savepointRestoreSettings) throws Exception {
	final boolean restoreRequested = savepointRestoreSettings.restoreSavepoint();
	if (!restoreRequested) {
		return;
	}

	final CheckpointCoordinator coord = executionGraphToRestore.getCheckpointCoordinator();
	if (coord == null) {
		return;
	}

	coord.restoreSavepoint(
		savepointRestoreSettings.getRestorePath(),
		savepointRestoreSettings.allowNonRestoredState(),
		executionGraphToRestore.getAllVertices(),
		userCodeLoader);
}
 
Example #27
Source File: LegacyScheduler.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Triggers a savepoint and returns a future with its external pointer. With
 * {@code cancelJob} set, the checkpoint scheduler is stopped first, the job is cancelled
 * after a successful savepoint, and the checkpoint scheduler is started again after a
 * failed one.
 *
 * @param targetDirectory the savepoint target directory; may be {@code null} if the
 *                        checkpoint storage has a default savepoint location
 * @param cancelJob whether to cancel the job once the savepoint has been stored
 * @return a future completed with the savepoint path, or exceptionally if the savepoint failed
 * @throws IllegalStateException if there is no checkpoint coordinator, or if neither a target
 *                               directory nor a default savepoint location is available
 */
@Override
public CompletableFuture<String> triggerSavepoint(final String targetDirectory, final boolean cancelJob) {
	mainThreadExecutor.assertRunningInMainThread();

	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();
	if (coordinator == null) {
		throw new IllegalStateException(
			String.format("Job %s is not a streaming job.", jobGraph.getJobID()));
	}

	if (targetDirectory == null && !coordinator.getCheckpointStorage().hasDefaultSavepointLocation()) {
		log.info("Trying to cancel job {} with savepoint, but no savepoint directory configured.", jobGraph.getJobID());

		throw new IllegalStateException(
			"No savepoint directory configured. You can either specify a directory " +
				"while cancelling via -s :targetDirectory or configure a cluster-wide " +
				"default via key '" + CheckpointingOptions.SAVEPOINT_DIRECTORY.key() + "'.");
	}

	if (cancelJob) {
		coordinator.stopCheckpointScheduler();
	}

	return coordinator
		.triggerSavepoint(System.currentTimeMillis(), targetDirectory)
		.thenApply(CompletedCheckpoint::getExternalPointer)
		.handleAsync((path, throwable) -> {
			if (throwable == null) {
				if (cancelJob) {
					log.info("Savepoint stored in {}. Now cancelling {}.", path, jobGraph.getJobID());
					cancel();
				}
				return path;
			}

			// the savepoint failed: start the scheduler that was stopped above, then propagate
			if (cancelJob) {
				startCheckpointScheduler(coordinator);
			}
			throw new CompletionException(throwable);
		}, mainThreadExecutor);
}
 
Example #28
Source File: LegacyScheduler.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Starts the checkpoint scheduler of the given coordinator, but only when periodic
 * checkpointing is configured. An {@link IllegalStateException} thrown by the start call
 * is ignored.
 *
 * @param checkpointCoordinator the coordinator whose scheduler is started
 */
private void startCheckpointScheduler(final CheckpointCoordinator checkpointCoordinator) {
	mainThreadExecutor.assertRunningInMainThread();

	final boolean periodicCheckpointing = checkpointCoordinator.isPeriodicCheckpointingConfigured();
	if (!periodicCheckpointing) {
		return;
	}

	try {
		checkpointCoordinator.startCheckpointScheduler();
	} catch (IllegalStateException ignored) {
		// the coordinator was shut down concurrently
	}
}
 
Example #29
Source File: LegacyScheduler.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Builds an {@link AcknowledgeCheckpoint} message from the given data and passes it to the
 * checkpoint coordinator on the I/O executor. Without a coordinator the message is only
 * logged: at error level while the job is RUNNING, at debug level otherwise.
 */
@Override
public void acknowledgeCheckpoint(final JobID jobID, final ExecutionAttemptID executionAttemptID, final long checkpointId, final CheckpointMetrics checkpointMetrics, final TaskStateSnapshot checkpointState) {
	mainThreadExecutor.assertRunningInMainThread();

	final CheckpointCoordinator coord = executionGraph.getCheckpointCoordinator();
	final AcknowledgeCheckpoint ack = new AcknowledgeCheckpoint(
		jobID,
		executionAttemptID,
		checkpointId,
		checkpointMetrics,
		checkpointState);

	final String tmLocation = retrieveTaskManagerLocation(executionAttemptID);

	if (coord == null) {
		final String errorMessage = "Received AcknowledgeCheckpoint message for job {} with no CheckpointCoordinator";
		final boolean jobIsRunning = executionGraph.getState() == JobStatus.RUNNING;
		if (jobIsRunning) {
			log.error(errorMessage, jobGraph.getJobID());
		} else {
			log.debug(errorMessage, jobGraph.getJobID());
		}
		return;
	}

	ioExecutor.execute(() -> {
		try {
			coord.receiveAcknowledgeMessage(ack, tmLocation);
		} catch (Throwable t) {
			// failures are logged as warnings, not propagated
			log.warn("Error while processing checkpoint acknowledgement message", t);
		}
	});
}
 
Example #30
Source File: ExecutionGraph.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Cleans up after the job has reached a terminal state: the checkpoint coordinator field is
 * cleared and the coordinator, if there was one, is shut down with the given status. The
 * termination future is completed with the status even if the cleanup throws.
 *
 * @param status the terminal job status
 */
private void onTerminalState(JobStatus status) {
	try {
		final CheckpointCoordinator coordinatorToStop = this.checkpointCoordinator;
		this.checkpointCoordinator = null;
		if (coordinatorToStop != null) {
			coordinatorToStop.shutdown(status);
		}
	} catch (Exception e) {
		// cleanup errors are logged and otherwise ignored
		LOG.error("Error while cleaning up after execution", e);
	} finally {
		terminationFuture.complete(status);
	}
}