org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint Java Examples

The following examples show how to use org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.

Example #1

Source File: FailoverRegionTest.java From flink with Apache License 2.0

6 votes

/**
 * Sends an acknowledge message to the checkpoint coordinator for every subtask of each
 * given execution vertex, so that the expected checkpoint can be completed.
 *
 * <p>NOTE(review): {@code checkpointId} is not declared in this method; presumably it is a
 * field of the enclosing test class.
 *
 * @param checkpointCoordinator coordinator that receives the acknowledge messages
 * @param executionVertexes vertices whose subtasks acknowledge the checkpoint
 */
private void acknowledgeAllCheckpoints(CheckpointCoordinator checkpointCoordinator, Iterator<ExecutionVertex> executionVertexes) throws IOException, CheckpointException {
	while (executionVertexes.hasNext()) {
		final ExecutionVertex vertex = executionVertexes.next();

		int subtask = 0;
		while (subtask < vertex.getJobVertex().getParallelism()) {
			final JobVertexID vertexId = vertex.getJobvertexId();

			// one partitionable operator state handle per subtask, wrapped into a task snapshot
			final OperatorStateHandle managedOperatorState =
				CheckpointCoordinatorTest.generatePartitionableStateHandle(vertexId, subtask, 2, 8, false);
			final OperatorSubtaskState subtaskState =
				new OperatorSubtaskState(managedOperatorState, null, null, null);
			final TaskStateSnapshot snapshot = new TaskStateSnapshot();
			snapshot.putSubtaskStateByOperatorID(OperatorID.fromJobVertexID(vertexId), subtaskState);

			checkpointCoordinator.receiveAcknowledgeMessage(
				new AcknowledgeCheckpoint(
					vertex.getJobId(),
					vertex.getJobVertex().getTaskVertices()[subtask].getCurrentExecutionAttempt().getAttemptId(),
					checkpointId,
					new CheckpointMetrics(),
					snapshot),
				"Unknown location");

			subtask++;
		}
	}
}

Example #2

Source File: ActorGatewayCheckpointResponder.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Packs the given arguments into an {@link AcknowledgeCheckpoint} message and hands it to
 * the actor gateway.
 */
@Override
public void acknowledgeCheckpoint(
		JobID jobID,
		ExecutionAttemptID executionAttemptID,
		long checkpointId,
		CheckpointMetrics checkpointMetrics,
		TaskStateSnapshot checkpointStateHandles) {

	actorGateway.tell(
		new AcknowledgeCheckpoint(
			jobID,
			executionAttemptID,
			checkpointId,
			checkpointMetrics,
			checkpointStateHandles));
}

Example #3

Source File: SchedulerTestingUtils.java From flink with Apache License 2.0

5 votes

/**
 * Delivers an acknowledge message for the given checkpoint to the scheduler's checkpoint
 * coordinator, once for every current execution attempt.
 *
 * @param scheduler scheduler whose checkpoint coordinator and execution attempts are used
 * @param checkpointId id of the checkpoint to acknowledge
 * @throws CheckpointException propagated from the checkpoint coordinator
 */
public static void acknowledgePendingCheckpoint(final DefaultScheduler scheduler, final long checkpointId) throws CheckpointException {
	final CheckpointCoordinator coordinator = getCheckpointCoordinator(scheduler);
	final JobID jobId = scheduler.getJobId();

	for (ExecutionAttemptID attempt : getAllCurrentExecutionAttempts(scheduler)) {
		coordinator.receiveAcknowledgeMessage(
			new AcknowledgeCheckpoint(jobId, attempt, checkpointId),
			"Unknown location");
	}
}

Example #4

Source File: CheckpointMessagesTest.java From flink with Apache License 2.0

5 votes

/**
 * Builds one {@link AcknowledgeCheckpoint} without state and one carrying a task state
 * snapshot, and passes both to {@code testSerializabilityEqualsHashCode}.
 */
@Test
public void testConfirmTaskCheckpointed() {
	final Random random = new Random();
	try {
		final AcknowledgeCheckpoint ackWithoutState =
			new AcknowledgeCheckpoint(new JobID(), new ExecutionAttemptID(), 569345L);

		final KeyGroupRange singleKeyGroup = KeyGroupRange.of(42, 42);

		// operator state, keyed state and both kinds of channel state in one subtask state
		final OperatorSubtaskState subtaskState = new OperatorSubtaskState(
			CheckpointCoordinatorTestingUtils.generatePartitionableStateHandle(new JobVertexID(), 0, 2, 8, false),
			null,
			CheckpointCoordinatorTestingUtils.generateKeyGroupState(singleKeyGroup, Collections.singletonList(new MyHandle())),
			null,
			singleton(createNewInputChannelStateHandle(10, random)),
			singleton(createNewResultSubpartitionStateHandle(10, random)));

		final TaskStateSnapshot snapshot = new TaskStateSnapshot();
		snapshot.putSubtaskStateByOperatorID(new OperatorID(), subtaskState);

		final AcknowledgeCheckpoint ackWithState = new AcknowledgeCheckpoint(
			new JobID(),
			new ExecutionAttemptID(),
			87658976143L,
			new CheckpointMetrics(),
			snapshot);

		testSerializabilityEqualsHashCode(ackWithoutState);
		testSerializabilityEqualsHashCode(ackWithState);
	} catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Example #5

Source File: SchedulerBase.java From flink with Apache License 2.0

5 votes

/**
 * Forwards a task's checkpoint acknowledgement to the checkpoint coordinator on the I/O
 * executor. If the execution graph has no checkpoint coordinator, the message is only
 * logged: at error level while the job is RUNNING, at debug level otherwise.
 */
@Override
public void acknowledgeCheckpoint(final JobID jobID, final ExecutionAttemptID executionAttemptID, final long checkpointId, final CheckpointMetrics checkpointMetrics, final TaskStateSnapshot checkpointState) {
	mainThreadExecutor.assertRunningInMainThread();

	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();
	final AcknowledgeCheckpoint acknowledgement =
		new AcknowledgeCheckpoint(jobID, executionAttemptID, checkpointId, checkpointMetrics, checkpointState);
	final String locationInfo = retrieveTaskManagerLocation(executionAttemptID);

	if (coordinator == null) {
		final String noCoordinatorMessage = "Received AcknowledgeCheckpoint message for job {} with no CheckpointCoordinator";
		if (executionGraph.getState() == JobStatus.RUNNING) {
			log.error(noCoordinatorMessage, jobGraph.getJobID());
		} else {
			log.debug(noCoordinatorMessage, jobGraph.getJobID());
		}
		return;
	}

	// processing happens off the main thread; failures are logged, not rethrown
	ioExecutor.execute(() -> {
		try {
			coordinator.receiveAcknowledgeMessage(acknowledgement, locationInfo);
		} catch (Throwable t) {
			log.warn("Error while processing checkpoint acknowledgement message", t);
		}
	});
}

Example #6

Source File: AdaptedRestartPipelinedRegionStrategyNGAbortPendingCheckpointsTest.java From flink with Apache License 2.0

5 votes

/**
 * Triggers one checkpoint, lets the first vertex acknowledge it, fails that vertex and
 * asserts that no pending checkpoint is left after the scheduled restart tasks have run.
 */
@Test
public void abortPendingCheckpointsWhenRestartingTasks() throws Exception {
	final JobGraph jobGraph = createStreamingJobGraph();
	final ExecutionGraph graph = createExecutionGraph(jobGraph);

	final Iterator<ExecutionVertex> vertices = graph.getAllExecutionVertices().iterator();
	final ExecutionVertex failingVertex = vertices.next();
	final ExecutionVertex otherVertex = vertices.next();

	setTasksRunning(graph, failingVertex, otherVertex);

	final CheckpointCoordinator coordinator = graph.getCheckpointCoordinator();
	checkState(coordinator != null);

	coordinator.triggerCheckpoint(System.currentTimeMillis(), false);
	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
	final long pendingId = coordinator.getPendingCheckpoints().keySet().iterator().next();

	// only the first vertex acknowledges, so the checkpoint stays pending
	coordinator.receiveAcknowledgeMessage(
		new AcknowledgeCheckpoint(
			jobGraph.getJobID(),
			failingVertex.getCurrentExecutionAttempt().getAttemptId(),
			pendingId),
		"Unknown location");
	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

	// failing the vertex alone does not abort the checkpoint; the scheduled
	// restart tasks have to run before the pending checkpoint disappears
	failVertex(failingVertex);
	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
	manualMainThreadExecutor.triggerScheduledTasks();

	assertNoPendingCheckpoints(coordinator);
}

Example #7

Source File: CheckpointMessagesTest.java From flink with Apache License 2.0

5 votes

/**
 * Builds one {@link AcknowledgeCheckpoint} without state and one carrying a task state
 * snapshot, and passes both to {@code testSerializabilityEqualsHashCode}.
 */
@Test
public void testConfirmTaskCheckpointed() {
	try {
		final AcknowledgeCheckpoint ackWithoutState =
			new AcknowledgeCheckpoint(new JobID(), new ExecutionAttemptID(), 569345L);

		final KeyGroupRange singleKeyGroup = KeyGroupRange.of(42, 42);

		// managed operator state plus managed keyed state; raw states stay null
		final OperatorSubtaskState subtaskState = new OperatorSubtaskState(
			CheckpointCoordinatorTest.generatePartitionableStateHandle(new JobVertexID(), 0, 2, 8, false),
			null,
			CheckpointCoordinatorTest.generateKeyGroupState(singleKeyGroup, Collections.singletonList(new MyHandle())),
			null);

		final TaskStateSnapshot snapshot = new TaskStateSnapshot();
		snapshot.putSubtaskStateByOperatorID(new OperatorID(), subtaskState);

		final AcknowledgeCheckpoint ackWithState = new AcknowledgeCheckpoint(
			new JobID(),
			new ExecutionAttemptID(),
			87658976143L,
			new CheckpointMetrics(),
			snapshot);

		testSerializabilityEqualsHashCode(ackWithoutState);
		testSerializabilityEqualsHashCode(ackWithState);
	} catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Example #8

Source File: LegacyScheduler.java From flink with Apache License 2.0

5 votes

/**
 * Hands a task's checkpoint acknowledgement to the checkpoint coordinator via the I/O
 * executor. When the execution graph has no checkpoint coordinator, the message is only
 * logged: at error level while the job is RUNNING, at debug level otherwise.
 */
@Override
public void acknowledgeCheckpoint(final JobID jobID, final ExecutionAttemptID executionAttemptID, final long checkpointId, final CheckpointMetrics checkpointMetrics, final TaskStateSnapshot checkpointState) {
	mainThreadExecutor.assertRunningInMainThread();

	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();
	final AcknowledgeCheckpoint acknowledgement =
		new AcknowledgeCheckpoint(jobID, executionAttemptID, checkpointId, checkpointMetrics, checkpointState);
	final String locationInfo = retrieveTaskManagerLocation(executionAttemptID);

	if (coordinator == null) {
		final String noCoordinatorMessage = "Received AcknowledgeCheckpoint message for job {} with no CheckpointCoordinator";
		if (executionGraph.getState() == JobStatus.RUNNING) {
			log.error(noCoordinatorMessage, jobGraph.getJobID());
		} else {
			log.debug(noCoordinatorMessage, jobGraph.getJobID());
		}
		return;
	}

	// processing happens off the main thread; failures are logged, not rethrown
	ioExecutor.execute(() -> {
		try {
			coordinator.receiveAcknowledgeMessage(acknowledgement, locationInfo);
		} catch (Throwable t) {
			log.warn("Error while processing checkpoint acknowledgement message", t);
		}
	});
}

Example #9

Source File: CheckpointMessagesTest.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Creates an {@link AcknowledgeCheckpoint} with no state and another with a populated
 * task state snapshot, then runs {@code testSerializabilityEqualsHashCode} on each.
 */
@Test
public void testConfirmTaskCheckpointed() {
	try {
		final AcknowledgeCheckpoint statelessAck =
			new AcknowledgeCheckpoint(new JobID(), new ExecutionAttemptID(), 569345L);

		final KeyGroupRange range = KeyGroupRange.of(42, 42);

		// managed operator state plus managed keyed state; raw states stay null
		final OperatorSubtaskState operatorState = new OperatorSubtaskState(
			CheckpointCoordinatorTest.generatePartitionableStateHandle(new JobVertexID(), 0, 2, 8, false),
			null,
			CheckpointCoordinatorTest.generateKeyGroupState(range, Collections.singletonList(new MyHandle())),
			null);

		final TaskStateSnapshot stateHandles = new TaskStateSnapshot();
		stateHandles.putSubtaskStateByOperatorID(new OperatorID(), operatorState);

		final AcknowledgeCheckpoint statefulAck = new AcknowledgeCheckpoint(
			new JobID(),
			new ExecutionAttemptID(),
			87658976143L,
			new CheckpointMetrics(),
			stateHandles);

		testSerializabilityEqualsHashCode(statelessAck);
		testSerializabilityEqualsHashCode(statefulAck);
	} catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Example #10

Source File: JobMaster.java From Flink-CEPplus with Apache License 2.0

5 votes

/**
 * Passes a task's checkpoint acknowledgement to the checkpoint coordinator through the RPC
 * service's executor. When the execution graph has no checkpoint coordinator, the message
 * is only logged: at error level while the job is RUNNING, at debug level otherwise.
 */
@Override
public void acknowledgeCheckpoint(
		final JobID jobID,
		final ExecutionAttemptID executionAttemptID,
		final long checkpointId,
		final CheckpointMetrics checkpointMetrics,
		final TaskStateSnapshot checkpointState) {

	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();
	final AcknowledgeCheckpoint acknowledgement =
		new AcknowledgeCheckpoint(jobID, executionAttemptID, checkpointId, checkpointMetrics, checkpointState);

	if (coordinator == null) {
		final String noCoordinatorMessage = "Received AcknowledgeCheckpoint message for job {} with no CheckpointCoordinator";
		if (executionGraph.getState() == JobStatus.RUNNING) {
			log.error(noCoordinatorMessage, jobGraph.getJobID());
		} else {
			log.debug(noCoordinatorMessage, jobGraph.getJobID());
		}
		return;
	}

	// failures while processing the acknowledgement are logged, not rethrown
	getRpcService().execute(() -> {
		try {
			coordinator.receiveAcknowledgeMessage(acknowledgement);
		} catch (Throwable t) {
			log.warn("Error while processing checkpoint acknowledgement message", t);
		}
	});
}

Example #11

Source File: CheckpointCoordinatorTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Triggers a savepoint and two checkpoints. The second checkpoint completes
 * and subsumes the first checkpoint, but not the first savepoint. Then we
 * trigger another checkpoint and savepoint. The 2nd savepoint completes and
 * subsumes the last checkpoint, but not the first savepoint. Finally the
 * first savepoint is acknowledged and completes as well.
 *
 * <p>The ids used for acknowledging are read from the checkpoint id counter
 * right after the corresponding trigger call, so statement order matters.
 */
@Test
public void testSavepointsAreNotSubsumed() throws Exception {
	final JobID jid = new JobID();
	final long timestamp = System.currentTimeMillis();

	// create some mock Execution vertices that receive the checkpoint trigger messages
	final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
	final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
	ExecutionVertex vertex1 = mockExecutionVertex(attemptID1);
	ExecutionVertex vertex2 = mockExecutionVertex(attemptID2);

	StandaloneCheckpointIDCounter counter = new StandaloneCheckpointIDCounter();

	// set up the coordinator and validate the initial state
	CheckpointCoordinator coord = new CheckpointCoordinator(
		jid,
		600000,
		600000,
		0,
		Integer.MAX_VALUE,
		CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
		new ExecutionVertex[] { vertex1, vertex2 },
		new ExecutionVertex[] { vertex1, vertex2 },
		new ExecutionVertex[] { vertex1, vertex2 },
		counter,
		new StandaloneCompletedCheckpointStore(10),
		new MemoryStateBackend(),
		Executors.directExecutor(),
		SharedStateRegistry.DEFAULT_FACTORY);

	String savepointDir = tmpFolder.newFolder().getAbsolutePath();

	// Trigger savepoint and checkpoint
	CompletableFuture<CompletedCheckpoint> savepointFuture1 = coord.triggerSavepoint(timestamp, savepointDir);
	// remember the id handed out for the first savepoint
	long savepointId1 = counter.getLast();
	assertEquals(1, coord.getNumberOfPendingCheckpoints());

	assertTrue(coord.triggerCheckpoint(timestamp + 1, false));
	assertEquals(2, coord.getNumberOfPendingCheckpoints());

	assertTrue(coord.triggerCheckpoint(timestamp + 2, false));
	long checkpointId2 = counter.getLast();
	assertEquals(3, coord.getNumberOfPendingCheckpoints());

	// 2nd checkpoint should subsume the 1st checkpoint, but not the savepoint
	coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, checkpointId2));
	coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId2));

	// only the first savepoint is still pending; one checkpoint has been retained
	assertEquals(1, coord.getNumberOfPendingCheckpoints());
	assertEquals(1, coord.getNumberOfRetainedSuccessfulCheckpoints());

	assertFalse(coord.getPendingCheckpoints().get(savepointId1).isDiscarded());
	assertFalse(savepointFuture1.isDone());

	// second round: one more checkpoint followed by a second savepoint
	assertTrue(coord.triggerCheckpoint(timestamp + 3, false));
	assertEquals(2, coord.getNumberOfPendingCheckpoints());

	CompletableFuture<CompletedCheckpoint> savepointFuture2 = coord.triggerSavepoint(timestamp + 4, savepointDir);
	long savepointId2 = counter.getLast();
	assertEquals(3, coord.getNumberOfPendingCheckpoints());

	// 2nd savepoint should subsume the last checkpoint, but not the 1st savepoint
	coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, savepointId2));
	coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, savepointId2));

	assertEquals(1, coord.getNumberOfPendingCheckpoints());
	assertEquals(2, coord.getNumberOfRetainedSuccessfulCheckpoints());
	assertFalse(coord.getPendingCheckpoints().get(savepointId1).isDiscarded());

	assertFalse(savepointFuture1.isDone());
	assertTrue(savepointFuture2.isDone());

	// Ack first savepoint
	coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID1, savepointId1));
	coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, savepointId1));

	// nothing is pending any more and the first savepoint's future has completed
	assertEquals(0, coord.getNumberOfPendingCheckpoints());
	assertEquals(3, coord.getNumberOfRetainedSuccessfulCheckpoints());
	assertTrue(savepointFuture1.isDone());
}

Example #12

Source File: CheckpointCoordinator.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Receives an AcknowledgeCheckpoint message and returns whether the
 * message was associated with a pending checkpoint.
 *
 * <p>Messages are ignored (returning {@code false}) when the coordinator is shut down,
 * when the message is {@code null}, or when it belongs to a different job. For a known,
 * non-discarded pending checkpoint the task's acknowledgement is recorded and the
 * checkpoint is completed once it is fully acknowledged. In every case where the
 * reported state cannot be used (unknown task, discarded checkpoint, unknown or expired
 * checkpoint id) the subtask state is discarded so that no state lingers.
 *
 * @param message Checkpoint ack from the task manager
 *
 * @return Flag indicating whether the ack'd checkpoint was associated
 * with a pending checkpoint.
 *
 * @throws CheckpointException If the checkpoint cannot be added to the completed checkpoint store.
 * @throws IllegalStateException If the checkpoint is still registered as pending although it
 * has already been discarded.
 */
public boolean receiveAcknowledgeMessage(AcknowledgeCheckpoint message) throws CheckpointException {
	if (shutdown || message == null) {
		return false;
	}

	if (!job.equals(message.getJob())) {
		LOG.error("Received wrong AcknowledgeCheckpoint message for job {}: {}", job, message);
		return false;
	}

	final long checkpointId = message.getCheckpointId();

	synchronized (lock) {
		// we need to check inside the lock for being shutdown as well, otherwise we
		// get races and invalid error log messages
		if (shutdown) {
			return false;
		}

		final PendingCheckpoint checkpoint = pendingCheckpoints.get(checkpointId);

		if (checkpoint != null && !checkpoint.isDiscarded()) {

			switch (checkpoint.acknowledgeTask(message.getTaskExecutionId(), message.getSubtaskState(), message.getCheckpointMetrics())) {
				case SUCCESS:
					LOG.debug("Received acknowledge message for checkpoint {} from task {} of job {}.",
						checkpointId, message.getTaskExecutionId(), message.getJob());

					if (checkpoint.isFullyAcknowledged()) {
						completePendingCheckpoint(checkpoint);
					}
					break;
				case DUPLICATE:
					LOG.debug("Received a duplicate acknowledge message for checkpoint {}, task {}, job {}.",
						checkpointId, message.getTaskExecutionId(), message.getJob());
					break;
				case UNKNOWN:
					LOG.warn("Could not acknowledge the checkpoint {} for task {} of job {}, " +
							"because the task's execution attempt id was unknown. Discarding " +
							"the state handle to avoid lingering state.", checkpointId,
						message.getTaskExecutionId(), message.getJob());

					discardSubtaskState(message.getJob(), message.getTaskExecutionId(), checkpointId, message.getSubtaskState());

					break;
				case DISCARDED:
					LOG.warn("Could not acknowledge the checkpoint {} for task {} of job {}, " +
						"because the pending checkpoint had been discarded. Discarding the " +
							"state handle to avoid lingering state.",
						checkpointId, message.getTaskExecutionId(), message.getJob());

					discardSubtaskState(message.getJob(), message.getTaskExecutionId(), checkpointId, message.getSubtaskState());

					// explicit break so that a case added later cannot be reached by fall-through
					break;
			}

			return true;
		}
		else if (checkpoint != null) {
			// this should not happen
			throw new IllegalStateException(
					"Received message for discarded but non-removed checkpoint " + checkpointId);
		}
		else {
			boolean wasPendingCheckpoint;

			// message is for an unknown checkpoint, or comes too late (checkpoint disposed)
			if (recentPendingCheckpoints.contains(checkpointId)) {
				wasPendingCheckpoint = true;
				LOG.warn("Received late message for now expired checkpoint attempt {} from " +
					"{} of job {}.", checkpointId, message.getTaskExecutionId(), message.getJob());
			}
			else {
				LOG.debug("Received message for an unknown checkpoint {} from {} of job {}.",
					checkpointId, message.getTaskExecutionId(), message.getJob());
				wasPendingCheckpoint = false;
			}

			// try to discard the state so that we don't have lingering state lying around
			discardSubtaskState(message.getJob(), message.getTaskExecutionId(), checkpointId, message.getSubtaskState());

			return wasPendingCheckpoint;
		}
	}
}

Example #13

Source File: CheckpointCoordinatorMasterHooksTest.java From flink with Apache License 2.0

4 votes

/**
 * Registers two stateful master hooks and one stateless hook, triggers a checkpoint and
 * checks that every hook was triggered exactly once and that the completed checkpoint
 * carries the serialized state of the two stateful hooks only.
 */
@Test
public void testHooksAreCalledOnTrigger() throws Exception {
	final String id1 = "id1";
	final String id2 = "id2";

	final String state1 = "the-test-string-state";
	final byte[] expectedBytes1 = new StringSerializer().serialize(state1);

	final long state2 = 987654321L;
	final byte[] expectedBytes2 = new LongSerializer().serialize(state2);

	// hook producing a String state
	final MasterTriggerRestoreHook<String> stringHook = mockGeneric(MasterTriggerRestoreHook.class);
	when(stringHook.getIdentifier()).thenReturn(id1);
	when(stringHook.createCheckpointDataSerializer()).thenReturn(new StringSerializer());
	when(stringHook.triggerCheckpoint(anyLong(), anyLong(), any(Executor.class)))
			.thenReturn(CompletableFuture.completedFuture(state1));

	// hook producing a Long state
	final MasterTriggerRestoreHook<Long> longHook = mockGeneric(MasterTriggerRestoreHook.class);
	when(longHook.getIdentifier()).thenReturn(id2);
	when(longHook.createCheckpointDataSerializer()).thenReturn(new LongSerializer());
	when(longHook.triggerCheckpoint(anyLong(), anyLong(), any(Executor.class)))
			.thenReturn(CompletableFuture.completedFuture(state2));

	// hook for which only the identifier is stubbed
	final MasterTriggerRestoreHook<Void> noStateHook = mockGeneric(MasterTriggerRestoreHook.class);
	when(noStateHook.getIdentifier()).thenReturn("some-id");

	// create the checkpoint coordinator
	final JobID jobId = new JobID();
	final ExecutionAttemptID attemptId = new ExecutionAttemptID();
	final ExecutionVertex ackVertex = mockExecutionVertex(attemptId);
	final ManuallyTriggeredScheduledExecutor timer = new ManuallyTriggeredScheduledExecutor();
	final CheckpointCoordinator coordinator = instantiateCheckpointCoordinator(jobId, timer, ackVertex);

	coordinator.addMasterHook(stringHook);
	coordinator.addMasterHook(noStateHook);
	coordinator.addMasterHook(longHook);

	// trigger a checkpoint
	final CompletableFuture<CompletedCheckpoint> triggerResult = coordinator.triggerCheckpoint(false);
	timer.triggerAll();
	assertFalse(triggerResult.isCompletedExceptionally());
	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

	verify(stringHook, times(1)).triggerCheckpoint(anyLong(), anyLong(), any(Executor.class));
	verify(longHook, times(1)).triggerCheckpoint(anyLong(), anyLong(), any(Executor.class));
	verify(noStateHook, times(1)).triggerCheckpoint(anyLong(), anyLong(), any(Executor.class));

	// acknowledge from the single vertex, which completes the checkpoint
	final long pendingId = coordinator.getPendingCheckpoints().values().iterator().next().getCheckpointId();
	coordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jobId, attemptId, pendingId), "Unknown location");
	assertEquals(0, coordinator.getNumberOfPendingCheckpoints());

	assertEquals(1, coordinator.getNumberOfRetainedSuccessfulCheckpoints());
	final CompletedCheckpoint completed = coordinator.getCheckpointStore().getLatestCheckpoint(false);

	final Collection<MasterState> hookStates = completed.getMasterHookStates();
	assertEquals(2, hookStates.size());

	for (MasterState hookState : hookStates) {
		final String hookName = hookState.name();

		if (hookName.equals(id1)) {
			assertArrayEquals(expectedBytes1, hookState.bytes());
			assertEquals(StringSerializer.VERSION, hookState.version());
			continue;
		}

		if (hookName.equals(id2)) {
			assertArrayEquals(expectedBytes2, hookState.bytes());
			assertEquals(LongSerializer.VERSION, hookState.version());
			continue;
		}

		fail("unrecognized state name: " + hookState.name());
	}
}

Example #14

Source File: CheckpointCoordinatorFailureTest.java From flink with Apache License 2.0

4 votes

/**
 * Tests that a failure while storing a completed checkpoint in the completed checkpoint store
 * will properly fail the originating pending checkpoint and clean up the completed checkpoint.
 *
 * <p>The coordinator is built with a {@code FailingCompletedCheckpointStore}; the single
 * acknowledgement is expected to raise a {@link CheckpointException}, after which the
 * pending checkpoint and every reported state handle must have been discarded.
 */
@Test
public void testFailingCompletedCheckpointStoreAdd() throws Exception {
	JobID jid = new JobID();

	final ManuallyTriggeredScheduledExecutor manuallyTriggeredScheduledExecutor =
		new ManuallyTriggeredScheduledExecutor();

	final ExecutionAttemptID executionAttemptId = new ExecutionAttemptID();
	final ExecutionVertex vertex = CheckpointCoordinatorTestingUtils.mockExecutionVertex(executionAttemptId);

	// set up the coordinator and validate the initial state
	CheckpointCoordinator coord =
		new CheckpointCoordinatorBuilder()
			.setJobId(jid)
			.setTasks(new ExecutionVertex[] { vertex })
			.setCompletedCheckpointStore(new FailingCompletedCheckpointStore())
			.setTimer(manuallyTriggeredScheduledExecutor)
			.build();

	coord.triggerCheckpoint(false);

	// run the work queued on the manually triggered executor so the checkpoint becomes pending
	manuallyTriggeredScheduledExecutor.triggerAll();

	assertEquals(1, coord.getNumberOfPendingCheckpoints());

	PendingCheckpoint pendingCheckpoint = coord.getPendingCheckpoints().values().iterator().next();

	assertFalse(pendingCheckpoint.isDiscarded());

	final long checkpointId = coord.getPendingCheckpoints().keySet().iterator().next();

	// mocked state handles, so that discardState() calls can be verified below
	KeyedStateHandle managedKeyedHandle = mock(KeyedStateHandle.class);
	KeyedStateHandle rawKeyedHandle = mock(KeyedStateHandle.class);
	OperatorStateHandle managedOpHandle = mock(OperatorStreamStateHandle.class);
	OperatorStateHandle rawOpHandle = mock(OperatorStreamStateHandle.class);
	InputChannelStateHandle inputChannelStateHandle = new InputChannelStateHandle(new InputChannelInfo(0, 1), mock(StreamStateHandle.class), Collections.singletonList(1L));
	ResultSubpartitionStateHandle resultSubpartitionStateHandle = new ResultSubpartitionStateHandle(new ResultSubpartitionInfo(0, 1), mock(StreamStateHandle.class), Collections.singletonList(1L));

	final OperatorSubtaskState operatorSubtaskState = spy(new OperatorSubtaskState(
		managedOpHandle,
		rawOpHandle,
		managedKeyedHandle,
		rawKeyedHandle,
		StateObjectCollection.singleton(inputChannelStateHandle),
		StateObjectCollection.singleton(resultSubpartitionStateHandle)));

	TaskStateSnapshot subtaskState = spy(new TaskStateSnapshot());
	subtaskState.putSubtaskStateByOperatorID(new OperatorID(), operatorSubtaskState);

	// the state was stored under a fresh OperatorID above; stub the lookup by the
	// vertex-derived operator id so that it returns the same subtask state
	when(subtaskState.getSubtaskStateByOperatorID(OperatorID.fromJobVertexID(vertex.getJobvertexId()))).thenReturn(operatorSubtaskState);

	AcknowledgeCheckpoint acknowledgeMessage = new AcknowledgeCheckpoint(jid, executionAttemptId, checkpointId, new CheckpointMetrics(), subtaskState);

	try {
		coord.receiveAcknowledgeMessage(acknowledgeMessage, "Unknown location");
		fail("Expected a checkpoint exception because the completed checkpoint store could not " +
			"store the completed checkpoint.");
	} catch (CheckpointException e) {
		// ignore because we expected this exception
	}

	// make sure that the pending checkpoint has been discarded after we could not complete it
	assertTrue(pendingCheckpoint.isDiscarded());

	// make sure that the subtask state has been discarded after we could not complete it.
	verify(operatorSubtaskState).discardState();
	verify(operatorSubtaskState.getManagedOperatorState().iterator().next()).discardState();
	verify(operatorSubtaskState.getRawOperatorState().iterator().next()).discardState();
	verify(operatorSubtaskState.getManagedKeyedState().iterator().next()).discardState();
	verify(operatorSubtaskState.getRawKeyedState().iterator().next()).discardState();
	// channel state handles wrap a delegate handle; the delegate is the mock being verified
	verify(operatorSubtaskState.getInputChannelState().iterator().next().getDelegate()).discardState();
	verify(operatorSubtaskState.getResultSubpartitionState().iterator().next().getDelegate()).discardState();
}

Example #15

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Triggers one checkpoint and acknowledges it for every subtask of {@code jobVertex1},
 * reporting an incremental keyed state handle per subtask.
 *
 * <p>Each handle carries one private state entry and a shared state entry named after
 * {@code cpSequenceNumber}; for sequence numbers above zero it additionally carries a
 * placeholder entry named after the previous sequence number.
 *
 * @param jid job id used in the acknowledge messages
 * @param coord coordinator that triggers the checkpoint and receives the acknowledgements
 * @param jobVertex1 job vertex whose subtasks acknowledge
 * @param keyGroupPartitions1 key group range per subtask index
 * @param cpSequenceNumber sequence number used to name the shared state entries
 */
private void performIncrementalCheckpoint(
	JobID jid,
	CheckpointCoordinator coord,
	ExecutionJobVertex jobVertex1,
	List<KeyGroupRange> keyGroupPartitions1,
	int cpSequenceNumber) throws Exception {

	// trigger the checkpoint
	coord.triggerCheckpoint(false);
	manuallyTriggeredScheduledExecutor.triggerAll();

	assertEquals(1, coord.getPendingCheckpoints().size());
	long pendingId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());

	for (int subtask = 0; subtask < jobVertex1.getParallelism(); subtask++) {

		KeyGroupRange range = keyGroupPartitions1.get(subtask);

		Map<StateHandleID, StreamStateHandle> privateHandles = new HashMap<>();
		StreamStateHandle privateHandle = spy(new ByteStreamStateHandle("private-1", new byte[]{'p'}));
		privateHandles.put(new StateHandleID("private-1"), privateHandle);

		Map<StateHandleID, StreamStateHandle> sharedHandles = new HashMap<>();

		// let all but the first CP overlap by one shared state.
		if (cpSequenceNumber > 0) {
			StreamStateHandle previousShared = spy(new PlaceholderStreamStateHandle());
			sharedHandles.put(new StateHandleID("shared-" + (cpSequenceNumber - 1)), previousShared);
		}

		StreamStateHandle currentShared =
			spy(new ByteStreamStateHandle("shared-" + cpSequenceNumber + "-" + range, new byte[]{'s'}));
		sharedHandles.put(new StateHandleID("shared-" + cpSequenceNumber), currentShared);

		IncrementalRemoteKeyedStateHandle keyedHandle =
			spy(new IncrementalRemoteKeyedStateHandle(
				new UUID(42L, 42L),
				range,
				pendingId,
				sharedHandles,
				privateHandles,
				spy(new ByteStreamStateHandle("meta", new byte[]{'m'}))));

		// only managed keyed state is reported; the other three collections are empty
		OperatorSubtaskState subtaskState =
			spy(new OperatorSubtaskState(
				StateObjectCollection.empty(),
				StateObjectCollection.empty(),
				StateObjectCollection.singleton(keyedHandle),
				StateObjectCollection.empty()));

		Map<OperatorID, OperatorSubtaskState> statesByOperator = new HashMap<>();
		OperatorID firstOperator = jobVertex1.getOperatorIDs().get(0).getGeneratedOperatorID();
		statesByOperator.put(firstOperator, subtaskState);

		coord.receiveAcknowledgeMessage(
			new AcknowledgeCheckpoint(
				jid,
				jobVertex1.getTaskVertices()[subtask].getCurrentExecutionAttempt().getAttemptId(),
				pendingId,
				new CheckpointMetrics(),
				new TaskStateSnapshot(statesByOperator)),
			TASK_MANAGER_LOCATION_INFO);
	}
}

Example #16

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Starts the periodic checkpoint scheduler with a limit of two concurrent checkpoints,
 * waits until checkpoints 1 and 2 are pending, acknowledges checkpoint 2 and then expects
 * checkpoints 3 and 4 to become pending.
 */
@Test
public void testMaxConcurrentAttempsWithSubsumption() {
	try {
		final int maxConcurrentAttempts = 2;
		final JobID jid = new JobID();

		// create some mock execution vertices and trigger some checkpoint
		final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
		final ExecutionAttemptID ackAttemptID = new ExecutionAttemptID();
		final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();

		ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
		ExecutionVertex ackVertex = mockExecutionVertex(ackAttemptID);
		ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

		CheckpointCoordinatorConfiguration chkConfig =
			new CheckpointCoordinatorConfigurationBuilder()
				.setCheckpointInterval(10) // periodic interval is 10 ms
				.setCheckpointTimeout(200000) // timeout is very long (200 s)
				.setMinPauseBetweenCheckpoints(0L) // no extra delay
				.setMaxConcurrentCheckpoints(maxConcurrentAttempts)
				.build();
		CheckpointCoordinator coord =
			new CheckpointCoordinatorBuilder()
				.setJobId(jid)
				.setCheckpointCoordinatorConfiguration(chkConfig)
				.setTasksToTrigger(new ExecutionVertex[] { triggerVertex })
				.setTasksToWaitFor(new ExecutionVertex[] { ackVertex })
				.setTasksToCommitTo(new ExecutionVertex[] { commitVertex })
				.setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
				.setTimer(manuallyTriggeredScheduledExecutor)
				.build();

		coord.startCheckpointScheduler();

		// drive the manually triggered executor until the concurrency limit is reached
		do {
			manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
			manuallyTriggeredScheduledExecutor.triggerAll();
		}
		while (coord.getNumberOfPendingCheckpoints() < maxConcurrentAttempts);

		// validate that the pending checkpoints are there
		assertEquals(maxConcurrentAttempts, coord.getNumberOfPendingCheckpoints());
		assertNotNull(coord.getPendingCheckpoints().get(1L));
		assertNotNull(coord.getPendingCheckpoints().get(2L));

		// acknowledge the second checkpoint: this should subsume the first checkpoint as
		// well, freeing both slots so that two more checkpoints can be triggered
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID, 2L), TASK_MANAGER_LOCATION_INFO);

		// after a while, there should be the new checkpoints
		do {
			manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
			manuallyTriggeredScheduledExecutor.triggerAll();
		}
		while (coord.getNumberOfPendingCheckpoints() < maxConcurrentAttempts);

		// do the final check
		assertEquals(maxConcurrentAttempts, coord.getNumberOfPendingCheckpoints());
		assertNotNull(coord.getPendingCheckpoints().get(3L));
		assertNotNull(coord.getPendingCheckpoints().get(4L));

		coord.shutdown(JobStatus.FINISHED);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Example #17

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Checks that the periodic scheduler does not start more than {@code maxConcurrentAttempts}
 * checkpoints while none has been acknowledged, and that acknowledging one checkpoint lets
 * exactly one further checkpoint start.
 *
 * <p>Calls to {@code triggerCheckpoint} and {@code notifyCheckpointComplete} on the trigger
 * vertex's current execution are counted in a shared counter that the assertions inspect.
 *
 * @param maxConcurrentAttempts the maximum number of concurrent checkpoints to configure
 */
private void testMaxConcurrentAttempts(int maxConcurrentAttempts) {
	try {
		final JobID jid = new JobID();

		// create some mock execution vertices and trigger some checkpoint
		final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
		final ExecutionAttemptID ackAttemptID = new ExecutionAttemptID();
		final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();

		ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
		ExecutionVertex ackVertex = mockExecutionVertex(ackAttemptID);
		ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

		// counts every trigger / notify-complete call that reaches the trigger vertex's execution
		final AtomicInteger numCalls = new AtomicInteger();

		final Execution execution = triggerVertex.getCurrentExecutionAttempt();

		doAnswer(invocation -> {
			numCalls.incrementAndGet();
			return null;
		}).when(execution).triggerCheckpoint(anyLong(), anyLong(), any(CheckpointOptions.class));

		doAnswer(invocation -> {
			numCalls.incrementAndGet();
			return null;
		}).when(execution).notifyCheckpointComplete(anyLong(), anyLong());

		CheckpointCoordinatorConfiguration chkConfig =
			new CheckpointCoordinatorConfigurationBuilder()
				.setCheckpointInterval(10) // periodic interval is 10 ms
				.setCheckpointTimeout(200000) // timeout is very long (200 s)
				.setMinPauseBetweenCheckpoints(0L) // no extra delay
				.setMaxConcurrentCheckpoints(maxConcurrentAttempts)
				.build();
		CheckpointCoordinator coord =
			new CheckpointCoordinatorBuilder()
				.setJobId(jid)
				.setCheckpointCoordinatorConfiguration(chkConfig)
				.setTasksToTrigger(new ExecutionVertex[] { triggerVertex })
				.setTasksToWaitFor(new ExecutionVertex[] { ackVertex })
				.setTasksToCommitTo(new ExecutionVertex[] { commitVertex })
				.setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
				.setTimer(manuallyTriggeredScheduledExecutor)
				.build();

		coord.startCheckpointScheduler();

		// fire the periodic trigger once per permitted concurrent checkpoint
		for (int i = 0; i < maxConcurrentAttempts; i++) {
			manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
			manuallyTriggeredScheduledExecutor.triggerAll();
		}

		assertEquals(maxConcurrentAttempts, numCalls.get());

		verify(triggerVertex.getCurrentExecutionAttempt(), times(maxConcurrentAttempts))
				.triggerCheckpoint(anyLong(), anyLong(), any(CheckpointOptions.class));

		// now, once we acknowledge one checkpoint, it should trigger the next one
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID, 1L), TASK_MANAGER_LOCATION_INFO);

		// exactly one periodic task (the checkpoint trigger) must be registered with the timer
		final Collection<ScheduledFuture<?>> periodicScheduledTasks =
			manuallyTriggeredScheduledExecutor.getPeriodicScheduledTask();
		assertEquals(1, periodicScheduledTasks.size());

		manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
		manuallyTriggeredScheduledExecutor.triggerAll();

		assertEquals(maxConcurrentAttempts + 1, numCalls.get());

		// no further checkpoints should happen
		manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
		manuallyTriggeredScheduledExecutor.triggerAll();
		assertEquals(maxConcurrentAttempts + 1, numCalls.get());

		coord.shutdown(JobStatus.FINISHED);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Example #18

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Verifies that a pending savepoint is never subsumed by a later checkpoint or savepoint.
 *
 * <p>Sequence: a savepoint and two checkpoints are triggered. Completing the second
 * checkpoint subsumes the first checkpoint but leaves the savepoint pending. Then another
 * checkpoint and a second savepoint are triggered. Completing the second savepoint subsumes
 * that checkpoint, again without touching the first savepoint, which is acknowledged last.
 */
@Test
public void testSavepointsAreNotSubsumed() throws Exception {
	final JobID jobId = new JobID();

	// two mocked execution vertices; both have to acknowledge every checkpoint
	final ExecutionAttemptID firstAttempt = new ExecutionAttemptID();
	final ExecutionAttemptID secondAttempt = new ExecutionAttemptID();
	final ExecutionAttemptID[] attempts = new ExecutionAttemptID[] { firstAttempt, secondAttempt };
	final ExecutionVertex firstTask = mockExecutionVertex(firstAttempt);
	final ExecutionVertex secondTask = mockExecutionVertex(secondAttempt);

	final StandaloneCheckpointIDCounter idCounter = new StandaloneCheckpointIDCounter();

	// coordinator with no practical limit on the number of concurrent checkpoints
	final CheckpointCoordinator coordinator =
		new CheckpointCoordinatorBuilder()
			.setJobId(jobId)
			.setCheckpointCoordinatorConfiguration(
				CheckpointCoordinatorConfiguration.builder()
					.setMaxConcurrentCheckpoints(Integer.MAX_VALUE)
					.build())
			.setTasks(new ExecutionVertex[]{ firstTask, secondTask })
			.setCheckpointIDCounter(idCounter)
			.setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(10))
			.setTimer(manuallyTriggeredScheduledExecutor)
			.build();

	final String savepointPath = tmpFolder.newFolder().getAbsolutePath();

	// first savepoint
	final CompletableFuture<CompletedCheckpoint> firstSavepoint = coordinator.triggerSavepoint(savepointPath);

	manuallyTriggeredScheduledExecutor.triggerAll();
	final long firstSavepointId = idCounter.getLast();
	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

	// first checkpoint
	final CompletableFuture<CompletedCheckpoint> firstCheckpoint = coordinator.triggerCheckpoint(false);
	manuallyTriggeredScheduledExecutor.triggerAll();
	assertEquals(2, coordinator.getNumberOfPendingCheckpoints());
	assertFalse(firstCheckpoint.isCompletedExceptionally());

	// second checkpoint
	final CompletableFuture<CompletedCheckpoint> secondCheckpoint = coordinator.triggerCheckpoint(false);
	manuallyTriggeredScheduledExecutor.triggerAll();
	assertFalse(secondCheckpoint.isCompletedExceptionally());
	final long secondCheckpointId = idCounter.getLast();
	assertEquals(3, coordinator.getNumberOfPendingCheckpoints());

	// completing the 2nd checkpoint must subsume the 1st checkpoint, but not the savepoint
	for (ExecutionAttemptID attempt : attempts) {
		coordinator.receiveAcknowledgeMessage(
			new AcknowledgeCheckpoint(jobId, attempt, secondCheckpointId), TASK_MANAGER_LOCATION_INFO);
	}

	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
	assertEquals(1, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

	assertFalse(coordinator.getPendingCheckpoints().get(firstSavepointId).isDiscarded());
	assertFalse(firstSavepoint.isDone());

	// third checkpoint
	final CompletableFuture<CompletedCheckpoint> thirdCheckpoint = coordinator.triggerCheckpoint(false);
	manuallyTriggeredScheduledExecutor.triggerAll();
	assertFalse(thirdCheckpoint.isCompletedExceptionally());
	assertEquals(2, coordinator.getNumberOfPendingCheckpoints());

	// second savepoint
	final CompletableFuture<CompletedCheckpoint> secondSavepoint = coordinator.triggerSavepoint(savepointPath);
	manuallyTriggeredScheduledExecutor.triggerAll();
	final long secondSavepointId = idCounter.getLast();
	assertFalse(secondSavepoint.isCompletedExceptionally());
	assertEquals(3, coordinator.getNumberOfPendingCheckpoints());

	// completing the 2nd savepoint must subsume the last checkpoint, but not the 1st savepoint
	for (ExecutionAttemptID attempt : attempts) {
		coordinator.receiveAcknowledgeMessage(
			new AcknowledgeCheckpoint(jobId, attempt, secondSavepointId), TASK_MANAGER_LOCATION_INFO);
	}

	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
	assertEquals(2, coordinator.getNumberOfRetainedSuccessfulCheckpoints());
	assertFalse(coordinator.getPendingCheckpoints().get(firstSavepointId).isDiscarded());

	assertFalse(firstSavepoint.isDone());
	assertNotNull(secondSavepoint.get());

	// finally acknowledge the first savepoint
	for (ExecutionAttemptID attempt : attempts) {
		coordinator.receiveAcknowledgeMessage(
			new AcknowledgeCheckpoint(jobId, attempt, firstSavepointId), TASK_MANAGER_LOCATION_INFO);
	}

	assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
	assertEquals(3, coordinator.getNumberOfRetainedSuccessfulCheckpoints());
	assertNotNull(firstSavepoint.get());
}

Example #19

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Sends acknowledge messages that match neither the job, nor a known checkpoint, nor a
 * known execution attempt, and expects the coordinator to handle each without throwing.
 */
@Test
public void testHandleMessagesForNonExistingCheckpoints() {
	try {
		final JobID jobId = new JobID();

		// mocked vertices: one to trigger, two to acknowledge, one to commit to
		final ExecutionAttemptID triggerAttempt = new ExecutionAttemptID();
		final ExecutionAttemptID firstAckAttempt = new ExecutionAttemptID();
		final ExecutionAttemptID secondAckAttempt = new ExecutionAttemptID();
		final ExecutionAttemptID commitAttempt = new ExecutionAttemptID();

		final ExecutionVertex triggerTask = mockExecutionVertex(triggerAttempt);
		final ExecutionVertex firstAckTask = mockExecutionVertex(firstAckAttempt);
		final ExecutionVertex secondAckTask = mockExecutionVertex(secondAckAttempt);
		final ExecutionVertex commitTask = mockExecutionVertex(commitAttempt);

		final CheckpointCoordinator coordinator =
			new CheckpointCoordinatorBuilder()
				.setJobId(jobId)
				.setTasksToTrigger(new ExecutionVertex[] { triggerTask })
				.setTasksToWaitFor(new ExecutionVertex[] { firstAckTask, secondAckTask })
				.setTasksToCommitTo(new ExecutionVertex[] { commitTask })
				.setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
				.setTimer(manuallyTriggeredScheduledExecutor)
				.build();

		// start a single checkpoint so that there is a pending one to refer to
		final CompletableFuture<CompletedCheckpoint> pendingResult = coordinator.triggerCheckpoint(false);
		manuallyTriggeredScheduledExecutor.triggerAll();
		assertFalse(pendingResult.isCompletedExceptionally());

		final long pendingId = coordinator.getPendingCheckpoints().keySet().iterator().next();

		// Each of the following messages fails to match the job, the checkpoint, or one of
		// the vertices that need to acknowledge. None of them should throw an exception.

		// wrong job id
		final AcknowledgeCheckpoint foreignJobAck =
			new AcknowledgeCheckpoint(new JobID(), firstAckAttempt, pendingId);
		coordinator.receiveAcknowledgeMessage(foreignJobAck, TASK_MANAGER_LOCATION_INFO);

		// unknown checkpoint
		final AcknowledgeCheckpoint unknownCheckpointAck =
			new AcknowledgeCheckpoint(jobId, firstAckAttempt, 1L);
		coordinator.receiveAcknowledgeMessage(unknownCheckpointAck, TASK_MANAGER_LOCATION_INFO);

		// unknown ack vertex
		final AcknowledgeCheckpoint unknownAttemptAck =
			new AcknowledgeCheckpoint(jobId, new ExecutionAttemptID(), pendingId);
		coordinator.receiveAcknowledgeMessage(unknownAttemptAck, TASK_MANAGER_LOCATION_INFO);

		coordinator.shutdown(JobStatus.FINISHED);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Example #20

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Triggers a checkpoint, acknowledges it from only one of the two required tasks, then runs
 * the scheduled tasks of the manual executor and expects the checkpoint to be discarded,
 * its received state to be discarded, and no completion notification to be sent.
 */
@Test
public void testCheckpointTimeoutIsolated() {
	try {
		final JobID jobId = new JobID();

		// attempt ids for the trigger vertex, the two acknowledging vertices and the commit vertex
		final ExecutionAttemptID triggerAttempt = new ExecutionAttemptID();
		final ExecutionAttemptID firstAckAttempt = new ExecutionAttemptID();
		final ExecutionAttemptID secondAckAttempt = new ExecutionAttemptID();
		final ExecutionAttemptID commitAttempt = new ExecutionAttemptID();

		final ExecutionVertex triggerTask = mockExecutionVertex(triggerAttempt);
		final ExecutionVertex firstAckTask = mockExecutionVertex(firstAckAttempt);
		final ExecutionVertex secondAckTask = mockExecutionVertex(secondAckAttempt);
		final ExecutionVertex commitTask = mockExecutionVertex(commitAttempt);

		// coordinator under test
		final CheckpointCoordinator coordinator =
			new CheckpointCoordinatorBuilder()
				.setJobId(jobId)
				.setTasksToTrigger(new ExecutionVertex[] { triggerTask })
				.setTasksToWaitFor(new ExecutionVertex[] { firstAckTask, secondAckTask })
				.setTasksToCommitTo(new ExecutionVertex[] { commitTask })
				.setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
				.setTimer(manuallyTriggeredScheduledExecutor)
				.build();

		// start a checkpoint that will only be partially acknowledged
		final CompletableFuture<CompletedCheckpoint> pendingResult = coordinator.triggerCheckpoint(false);
		manuallyTriggeredScheduledExecutor.triggerAll();
		assertFalse(pendingResult.isCompletedExceptionally());
		assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

		final PendingCheckpoint pending = coordinator.getPendingCheckpoints().values().iterator().next();
		assertFalse(pending.isDiscarded());

		final OperatorID firstOperatorId = OperatorID.fromJobVertexID(firstAckTask.getJobvertexId());

		// state reported by the first acknowledging task; mocked so that its disposal can be verified
		final TaskStateSnapshot reportedSnapshot = spy(new TaskStateSnapshot());
		final OperatorSubtaskState reportedState = mock(OperatorSubtaskState.class);
		reportedSnapshot.putSubtaskStateByOperatorID(firstOperatorId, reportedState);

		final AcknowledgeCheckpoint partialAck = new AcknowledgeCheckpoint(
			jobId,
			firstAckAttempt,
			pending.getCheckpointId(),
			new CheckpointMetrics(),
			reportedSnapshot);
		coordinator.receiveAcknowledgeMessage(partialAck, TASK_MANAGER_LOCATION_INFO);

		// running the scheduled tasks fires the canceller
		manuallyTriggeredScheduledExecutor.triggerScheduledTasks();
		assertTrue("Checkpoint was not canceled by the timeout", pending.isDiscarded());
		assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
		assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

		// the state that had already been received must have been discarded
		verify(reportedState, times(1)).discardState();

		// and no completion notification may have gone out
		verify(commitTask.getCurrentExecutionAttempt(), times(0)).notifyCheckpointComplete(anyLong(), anyLong());

		coordinator.shutdown(JobStatus.FINISHED);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Example #21

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * This test triggers a checkpoint, acknowledges it from one task and then sends a decline
 * checkpoint message from the other task. The assertions check that the checkpoint is
 * discarded, that its scheduled canceller is removed, that no checkpoint is pending
 * afterwards, and that further decline messages for the same checkpoint change nothing.
 */
@Test
public void testTriggerAndDeclineCheckpointSimple() {
	try {
		final JobID jid = new JobID();

		// create some mock Execution vertices that receive the checkpoint trigger messages
		final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
		final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
		ExecutionVertex vertex1 = mockExecutionVertex(attemptID1);
		ExecutionVertex vertex2 = mockExecutionVertex(attemptID2);

		// set up the coordinator and validate the initial state
		CheckpointCoordinator coord = getCheckpointCoordinator(jid, vertex1, vertex2);

		assertEquals(0, coord.getNumberOfPendingCheckpoints());
		assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

		// trigger the first checkpoint. this should succeed
		final CompletableFuture<CompletedCheckpoint> checkpointFuture = coord.triggerCheckpoint(false);
		manuallyTriggeredScheduledExecutor.triggerAll();
		assertFalse(checkpointFuture.isCompletedExceptionally());

		// validate that we have a pending checkpoint
		assertEquals(1, coord.getNumberOfPendingCheckpoints());
		assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

		// we have one task scheduled that will cancel after timeout
		assertEquals(1, manuallyTriggeredScheduledExecutor.getScheduledTasks().size());

		// look up the only pending checkpoint by its id
		long checkpointId = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
		PendingCheckpoint checkpoint = coord.getPendingCheckpoints().get(checkpointId);

		// a freshly triggered checkpoint: nothing acknowledged yet, no state collected
		assertNotNull(checkpoint);
		assertEquals(checkpointId, checkpoint.getCheckpointId());
		assertEquals(jid, checkpoint.getJobId());
		assertEquals(2, checkpoint.getNumberOfNonAcknowledgedTasks());
		assertEquals(0, checkpoint.getNumberOfAcknowledgedTasks());
		assertEquals(0, checkpoint.getOperatorStates().size());
		assertFalse(checkpoint.isDiscarded());
		assertFalse(checkpoint.areTasksFullyAcknowledged());

		// check that the vertices received the trigger checkpoint message
		verify(vertex1.getCurrentExecutionAttempt()).triggerCheckpoint(checkpointId, checkpoint.getCheckpointTimestamp(), CheckpointOptions.forCheckpointWithDefaultLocation());
		verify(vertex2.getCurrentExecutionAttempt()).triggerCheckpoint(checkpointId, checkpoint.getCheckpointTimestamp(), CheckpointOptions.forCheckpointWithDefaultLocation());

		// acknowledge from one of the tasks
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId), "Unknown location");
		assertEquals(1, checkpoint.getNumberOfAcknowledgedTasks());
		assertEquals(1, checkpoint.getNumberOfNonAcknowledgedTasks());
		assertFalse(checkpoint.isDiscarded());
		assertFalse(checkpoint.areTasksFullyAcknowledged());

		// acknowledge the same task again (should not matter)
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId), "Unknown location");
		assertFalse(checkpoint.isDiscarded());
		assertFalse(checkpoint.areTasksFullyAcknowledged());

		// decline checkpoint from the other task, this should cancel the checkpoint
		coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpointId), TASK_MANAGER_LOCATION_INFO);
		assertTrue(checkpoint.isDiscarded());

		// the canceler is also removed
		assertEquals(0, manuallyTriggeredScheduledExecutor.getScheduledTasks().size());

		// validate that we have no new pending checkpoint
		assertEquals(0, coord.getNumberOfPendingCheckpoints());
		assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

		// decline again from the same task, then from the other task;
		// in both cases nothing should happen
		coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpointId), TASK_MANAGER_LOCATION_INFO);
		coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID2, checkpointId), TASK_MANAGER_LOCATION_INFO);
		assertTrue(checkpoint.isDiscarded());

		coord.shutdown(JobStatus.FINISHED);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Example #22

Source File: CheckpointCoordinatorTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Triggers a checkpoint that is acknowledged by only one of the two required tasks, then
 * polls until the checkpoint has been discarded (or five seconds have passed) and checks
 * that the received state was discarded and that no completion notification was sent.
 */
@Test
public void testCheckpointTimeoutIsolated() {
	try {
		final JobID jid = new JobID();
		final long timestamp = System.currentTimeMillis();

		// create some mock execution vertices

		final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();

		final ExecutionAttemptID ackAttemptID1 = new ExecutionAttemptID();
		final ExecutionAttemptID ackAttemptID2 = new ExecutionAttemptID();

		final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();

		ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);

		ExecutionVertex ackVertex1 = mockExecutionVertex(ackAttemptID1);
		ExecutionVertex ackVertex2 = mockExecutionVertex(ackAttemptID2);

		ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

		// set up the coordinator
		// the timeout for the checkpoint is 200 milliseconds

		CheckpointCoordinator coord = new CheckpointCoordinator(
			jid,
			600000,
			200,
			0,
			Integer.MAX_VALUE,
			CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
			new ExecutionVertex[] { triggerVertex },
			new ExecutionVertex[] { ackVertex1, ackVertex2 },
			new ExecutionVertex[] { commitVertex },
			new StandaloneCheckpointIDCounter(),
			new StandaloneCompletedCheckpointStore(2),
			new MemoryStateBackend(),
			Executors.directExecutor(),
			SharedStateRegistry.DEFAULT_FACTORY);

		// trigger a checkpoint, partially acknowledged
		assertTrue(coord.triggerCheckpoint(timestamp, false));
		assertEquals(1, coord.getNumberOfPendingCheckpoints());

		PendingCheckpoint checkpoint = coord.getPendingCheckpoints().values().iterator().next();
		assertFalse(checkpoint.isDiscarded());

		OperatorID opID1 = OperatorID.fromJobVertexID(ackVertex1.getJobvertexId());

		// the reported subtask state is a mock so that its disposal can be verified below
		TaskStateSnapshot taskOperatorSubtaskStates1 = spy(new TaskStateSnapshot());
		OperatorSubtaskState subtaskState1 = mock(OperatorSubtaskState.class);
		taskOperatorSubtaskStates1.putSubtaskStateByOperatorID(opID1, subtaskState1);

		// only the first of the two required tasks acknowledges
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID1, checkpoint.getCheckpointId(), new CheckpointMetrics(), taskOperatorSubtaskStates1));

		// wait until the checkpoint must have expired.
		// we check every 250 msecs conservatively for 5 seconds
		// to give even slow build servers a very good chance of completing this
		long deadline = System.currentTimeMillis() + 5000;
		do {
			Thread.sleep(250);
		}
		while (!checkpoint.isDiscarded() &&
				coord.getNumberOfPendingCheckpoints() > 0 &&
				System.currentTimeMillis() < deadline);

		assertTrue("Checkpoint was not canceled by the timeout", checkpoint.isDiscarded());
		assertEquals(0, coord.getNumberOfPendingCheckpoints());
		assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

		// validate that the received states have been discarded
		verify(subtaskState1, times(1)).discardState();

		// no confirm message must have been sent
		verify(commitVertex.getCurrentExecutionAttempt(), times(0)).notifyCheckpointComplete(anyLong(), anyLong());

		coord.shutdown(JobStatus.FINISHED);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Example #23

Source File: CheckpointCoordinator.java From flink with Apache License 2.0

4 votes

/**
 * Receives an AcknowledgeCheckpoint message and returns whether the
 * message was associated with a pending checkpoint.
 *
 * <p>Messages that arrive after shutdown, are {@code null}, or belong to a different job are
 * ignored and yield {@code false}. If the checkpoint is still pending, the acknowledgement is
 * recorded and the checkpoint is completed once all tasks have acknowledged. If the
 * checkpoint is not pending (unknown or already disposed), the state carried by the message
 * is discarded so that it does not linger.
 *
 * @param message Checkpoint ack from the task manager
 *
 * @param taskManagerLocationInfo The location of the acknowledge checkpoint message's sender
 * @return Flag indicating whether the ack'd checkpoint was associated
 * with a pending checkpoint: {@code true} if it is currently pending or is listed among the
 * recent pending checkpoints, {@code false} otherwise.
 *
 * @throws CheckpointException If the checkpoint cannot be added to the completed checkpoint store.
 * @throws IllegalStateException If the checkpoint is already discarded but still registered as pending.
 */
public boolean receiveAcknowledgeMessage(AcknowledgeCheckpoint message, String taskManagerLocationInfo) throws CheckpointException {
	if (shutdown || message == null) {
		return false;
	}

	if (!job.equals(message.getJob())) {
		LOG.error("Received wrong AcknowledgeCheckpoint message for job {} from {} : {}", job, taskManagerLocationInfo, message);
		return false;
	}

	final long checkpointId = message.getCheckpointId();

	synchronized (lock) {
		// we need to check inside the lock for being shutdown as well, otherwise we
		// get races and invalid error log messages
		if (shutdown) {
			return false;
		}

		final PendingCheckpoint checkpoint = pendingCheckpoints.get(checkpointId);

		if (checkpoint != null && !checkpoint.isDiscarded()) {

			switch (checkpoint.acknowledgeTask(message.getTaskExecutionId(), message.getSubtaskState(), message.getCheckpointMetrics())) {
				case SUCCESS:
					LOG.debug("Received acknowledge message for checkpoint {} from task {} of job {} at {}.",
						checkpointId, message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo);

					// the last missing acknowledgement finalizes the checkpoint
					if (checkpoint.areTasksFullyAcknowledged()) {
						completePendingCheckpoint(checkpoint);
					}
					break;
				case DUPLICATE:
					LOG.debug("Received a duplicate acknowledge message for checkpoint {}, task {}, job {}, location {}.",
						message.getCheckpointId(), message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo);
					break;
				case UNKNOWN:
					LOG.warn("Could not acknowledge the checkpoint {} for task {} of job {} at {}, " +
							"because the task's execution attempt id was unknown. Discarding " +
							"the state handle to avoid lingering state.", message.getCheckpointId(),
						message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo);

					discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState());

					break;
				case DISCARDED:
					LOG.warn("Could not acknowledge the checkpoint {} for task {} of job {} at {}, " +
						"because the pending checkpoint had been discarded. Discarding the " +
							"state handle to avoid lingering state.",
						message.getCheckpointId(), message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo);

					discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState());

					break;
			}

			// the checkpoint was pending, whatever the outcome of the acknowledgement
			return true;
		}
		else if (checkpoint != null) {
			// this should not happen
			throw new IllegalStateException(
					"Received message for discarded but non-removed checkpoint " + checkpointId);
		}
		else {
			boolean wasPendingCheckpoint;

			// message is for an unknown checkpoint, or comes too late (checkpoint disposed)
			if (recentPendingCheckpoints.contains(checkpointId)) {
				wasPendingCheckpoint = true;
				LOG.warn("Received late message for now expired checkpoint attempt {} from task " +
					"{} of job {} at {}.", checkpointId, message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo);
			}
			else {
				LOG.debug("Received message for an unknown checkpoint {} from task {} of job {} at {}.",
					checkpointId, message.getTaskExecutionId(), message.getJob(), taskManagerLocationInfo);
				wasPendingCheckpoint = false;
			}

			// try to discard the state so that we don't have lingering state lying around
			discardSubtaskState(message.getJob(), message.getTaskExecutionId(), message.getCheckpointId(), message.getSubtaskState());

			return wasPendingCheckpoint;
		}
	}
}

Example #24

Source File: CheckpointCoordinatorTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Sends acknowledge messages that match neither the job, nor a known checkpoint, nor a
 * known execution attempt, and expects the coordinator to handle each without throwing.
 */
@Test
public void testHandleMessagesForNonExistingCheckpoints() {
	try {
		final JobID jobId = new JobID();
		final long triggerTime = System.currentTimeMillis();

		// mocked vertices: one to trigger, two to acknowledge, one to commit to
		final ExecutionAttemptID triggerAttempt = new ExecutionAttemptID();
		final ExecutionAttemptID firstAckAttempt = new ExecutionAttemptID();
		final ExecutionAttemptID secondAckAttempt = new ExecutionAttemptID();
		final ExecutionAttemptID commitAttempt = new ExecutionAttemptID();

		final ExecutionVertex triggerTask = mockExecutionVertex(triggerAttempt);
		final ExecutionVertex firstAckTask = mockExecutionVertex(firstAckAttempt);
		final ExecutionVertex secondAckTask = mockExecutionVertex(secondAckAttempt);
		final ExecutionVertex commitTask = mockExecutionVertex(commitAttempt);

		final CheckpointCoordinator coordinator = new CheckpointCoordinator(
			jobId,
			200000,
			200000,
			0,
			Integer.MAX_VALUE,
			CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
			new ExecutionVertex[] { triggerTask },
			new ExecutionVertex[] { firstAckTask, secondAckTask },
			new ExecutionVertex[] { commitTask },
			new StandaloneCheckpointIDCounter(),
			new StandaloneCompletedCheckpointStore(2),
			new MemoryStateBackend(),
			Executors.directExecutor(),
			SharedStateRegistry.DEFAULT_FACTORY);

		// start a single checkpoint so that there is a pending one to refer to
		final boolean triggered = coordinator.triggerCheckpoint(triggerTime, false);
		assertTrue(triggered);

		final long pendingId = coordinator.getPendingCheckpoints().keySet().iterator().next();

		// Each of the following messages fails to match the job, the checkpoint, or one of
		// the vertices that need to acknowledge. None of them should throw an exception.

		// wrong job id
		final AcknowledgeCheckpoint foreignJobAck =
			new AcknowledgeCheckpoint(new JobID(), firstAckAttempt, pendingId);
		coordinator.receiveAcknowledgeMessage(foreignJobAck);

		// unknown checkpoint
		final AcknowledgeCheckpoint unknownCheckpointAck =
			new AcknowledgeCheckpoint(jobId, firstAckAttempt, 1L);
		coordinator.receiveAcknowledgeMessage(unknownCheckpointAck);

		// unknown ack vertex
		final AcknowledgeCheckpoint unknownAttemptAck =
			new AcknowledgeCheckpoint(jobId, new ExecutionAttemptID(), pendingId);
		coordinator.receiveAcknowledgeMessage(unknownAttemptAck);

		coordinator.shutdown(JobStatus.FINISHED);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Example #25

Source File: CheckpointCoordinatorFailureTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Tests that a failure while storing a completed checkpoint in the completed checkpoint store
 * will properly fail the originating pending checkpoint and clean up the completed checkpoint.
 *
 * <p>The coordinator is built with a {@code FailingCompletedCheckpointStore}; the single
 * acknowledgement that would complete the checkpoint is therefore expected to raise a
 * {@code CheckpointException}, after which the pending checkpoint and all state handles
 * carried by the acknowledgement must have been discarded.
 */
@Test
public void testFailingCompletedCheckpointStoreAdd() throws Exception {
	JobID jid = new JobID();

	final ExecutionAttemptID executionAttemptId = new ExecutionAttemptID();
	final ExecutionVertex vertex = CheckpointCoordinatorTest.mockExecutionVertex(executionAttemptId);

	final long triggerTimestamp = 1L;

	// set up the coordinator and validate the initial state
	// (the same vertex is used as the task to trigger, to wait for, and to commit to)
	CheckpointCoordinator coord = new CheckpointCoordinator(
		jid,
		600000,
		600000,
		0,
		Integer.MAX_VALUE,
		CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
		new ExecutionVertex[]{vertex},
		new ExecutionVertex[]{vertex},
		new ExecutionVertex[]{vertex},
		new StandaloneCheckpointIDCounter(),
		new FailingCompletedCheckpointStore(),
		new MemoryStateBackend(),
		Executors.directExecutor(),
		SharedStateRegistry.DEFAULT_FACTORY);

	coord.triggerCheckpoint(triggerTimestamp, false);

	assertEquals(1, coord.getNumberOfPendingCheckpoints());

	PendingCheckpoint pendingCheckpoint = coord.getPendingCheckpoints().values().iterator().next();

	assertFalse(pendingCheckpoint.isDiscarded());

	final long checkpointId = coord.getPendingCheckpoints().keySet().iterator().next();

	// mocked state handles so that their disposal can be verified at the end
	KeyedStateHandle managedKeyedHandle = mock(KeyedStateHandle.class);
	KeyedStateHandle rawKeyedHandle = mock(KeyedStateHandle.class);
	OperatorStateHandle managedOpHandle = mock(OperatorStreamStateHandle.class);
	OperatorStateHandle rawOpHandle = mock(OperatorStreamStateHandle.class);

	// spy on a real subtask state that wraps the four mocked handles
	final OperatorSubtaskState operatorSubtaskState = spy(new OperatorSubtaskState(
		managedOpHandle,
		rawOpHandle,
		managedKeyedHandle,
		rawKeyedHandle));

	TaskStateSnapshot subtaskState = spy(new TaskStateSnapshot());
	subtaskState.putSubtaskStateByOperatorID(new OperatorID(), operatorSubtaskState);

	// the state was stored under a fresh operator id above, so the lookup by the
	// operator id derived from the vertex is stubbed to return the same subtask state
	when(subtaskState.getSubtaskStateByOperatorID(OperatorID.fromJobVertexID(vertex.getJobvertexId()))).thenReturn(operatorSubtaskState);

	AcknowledgeCheckpoint acknowledgeMessage = new AcknowledgeCheckpoint(jid, executionAttemptId, checkpointId, new CheckpointMetrics(), subtaskState);

	try {
		coord.receiveAcknowledgeMessage(acknowledgeMessage);
		fail("Expected a checkpoint exception because the completed checkpoint store could not " +
			"store the completed checkpoint.");
	} catch (CheckpointException e) {
		// ignore because we expected this exception
	}

	// make sure that the pending checkpoint has been discarded after we could not complete it
	assertTrue(pendingCheckpoint.isDiscarded());

	// make sure that the subtask state has been discarded after we could not complete it.
	verify(operatorSubtaskState).discardState();
	verify(operatorSubtaskState.getManagedOperatorState().iterator().next()).discardState();
	verify(operatorSubtaskState.getRawOperatorState().iterator().next()).discardState();
	verify(operatorSubtaskState.getManagedKeyedState().iterator().next()).discardState();
	verify(operatorSubtaskState.getRawKeyedState().iterator().next()).discardState();
}

Example #26

Source File: CheckpointCoordinatorTest.java From Flink-CEPplus with Apache License 2.0

4 votes

/**
 * Checks that the periodic scheduler does not start more than {@code maxConcurrentAttempts}
 * checkpoints while none has been acknowledged, and that acknowledging one checkpoint lets
 * exactly one further checkpoint start.
 *
 * <p>Calls to {@code triggerCheckpoint} and {@code notifyCheckpointComplete} on the trigger
 * vertex's current execution are counted in a shared counter. The test polls that counter
 * against wall-clock deadlines.
 *
 * @param maxConcurrentAttempts the maximum number of concurrent checkpoints to configure
 */
private void testMaxConcurrentAttempts(int maxConcurrentAttempts) {
	try {
		final JobID jid = new JobID();

		// create some mock execution vertices and trigger some checkpoint
		final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
		final ExecutionAttemptID ackAttemptID = new ExecutionAttemptID();
		final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();

		ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
		ExecutionVertex ackVertex = mockExecutionVertex(ackAttemptID);
		ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

		// counts every trigger / notify-complete call that reaches the trigger vertex's execution
		final AtomicInteger numCalls = new AtomicInteger();

		final Execution execution = triggerVertex.getCurrentExecutionAttempt();

		doAnswer(invocation -> {
			numCalls.incrementAndGet();
			return null;
		}).when(execution).triggerCheckpoint(anyLong(), anyLong(), any(CheckpointOptions.class));

		doAnswer(invocation -> {
			numCalls.incrementAndGet();
			return null;
		}).when(execution).notifyCheckpointComplete(anyLong(), anyLong());

		CheckpointCoordinator coord = new CheckpointCoordinator(
			jid,
			10,        // periodic interval is 10 ms
			200000,    // timeout is very long (200 s)
			0L,        // no extra delay
			maxConcurrentAttempts,
			CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
			new ExecutionVertex[] { triggerVertex },
			new ExecutionVertex[] { ackVertex },
			new ExecutionVertex[] { commitVertex },
			new StandaloneCheckpointIDCounter(),
			new StandaloneCompletedCheckpointStore(2),
			new MemoryStateBackend(),
			Executors.directExecutor(),
			SharedStateRegistry.DEFAULT_FACTORY);

		coord.startCheckpointScheduler();

		// after a while, there should be exactly as many checkpoints
		// as concurrently permitted
		long now = System.currentTimeMillis();
		long timeout = now + 60000;
		long minDuration = now + 100;
		do {
			Thread.sleep(20);
		}
		while ((now = System.currentTimeMillis()) < minDuration ||
				(numCalls.get() < maxConcurrentAttempts && now < timeout));

		assertEquals(maxConcurrentAttempts, numCalls.get());

		verify(triggerVertex.getCurrentExecutionAttempt(), times(maxConcurrentAttempts))
				.triggerCheckpoint(anyLong(), anyLong(), any(CheckpointOptions.class));

		// now, once we acknowledge one checkpoint, it should trigger the next one
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID, 1L));

		// this should have immediately triggered a new checkpoint
		now = System.currentTimeMillis();
		timeout = now + 60000;
		do {
			Thread.sleep(20);
		}
		// re-read the clock on every iteration so that the timeout can actually expire;
		// otherwise a missing trigger would make this loop spin forever
		while (numCalls.get() < maxConcurrentAttempts + 1 &&
				(now = System.currentTimeMillis()) < timeout);

		assertEquals(maxConcurrentAttempts + 1, numCalls.get());

		// no further checkpoints should happen
		Thread.sleep(200);
		assertEquals(maxConcurrentAttempts + 1, numCalls.get());

		coord.shutdown(JobStatus.FINISHED);
	}
	catch (Exception e) {
		e.printStackTrace();
		fail(e.getMessage());
	}
}

Example #27

Source File: CheckpointCoordinatorMasterHooksTest.java From flink with Apache License 2.0

4 votes

/**
 * Checks that triggering a checkpoint invokes every registered master hook exactly once and
 * that the completed checkpoint carries the serialized results of the two hooks that were
 * stubbed to return state.
 */
@Test
public void testHooksAreCalledOnTrigger() throws Exception {
	// identifiers and payloads of the two hooks that hand back state
	final String stringHookId = "id1";
	final String longHookId = "id2";
	final String stringPayload = "the-test-string-state";
	final long longPayload = 987654321L;

	// reference bytes that the master states of the completed checkpoint must match
	final byte[] expectedStringBytes = new StringSerializer().serialize(stringPayload);
	final byte[] expectedLongBytes = new LongSerializer().serialize(longPayload);

	final MasterTriggerRestoreHook<String> stringHook = mockGeneric(MasterTriggerRestoreHook.class);
	when(stringHook.getIdentifier()).thenReturn(stringHookId);
	when(stringHook.createCheckpointDataSerializer()).thenReturn(new StringSerializer());
	when(stringHook.triggerCheckpoint(anyLong(), anyLong(), any(Executor.class)))
			.thenReturn(CompletableFuture.completedFuture(stringPayload));

	final MasterTriggerRestoreHook<Long> longHook = mockGeneric(MasterTriggerRestoreHook.class);
	when(longHook.getIdentifier()).thenReturn(longHookId);
	when(longHook.createCheckpointDataSerializer()).thenReturn(new LongSerializer());
	when(longHook.triggerCheckpoint(anyLong(), anyLong(), any(Executor.class)))
			.thenReturn(CompletableFuture.completedFuture(longPayload));

	// only the identifier is stubbed here; triggerCheckpoint keeps the mock's default answer
	final MasterTriggerRestoreHook<Void> hookWithoutState = mockGeneric(MasterTriggerRestoreHook.class);
	when(hookWithoutState.getIdentifier()).thenReturn("some-id");

	// build a coordinator with a single vertex and register the three hooks
	final JobID jobId = new JobID();
	final ExecutionAttemptID attemptId = new ExecutionAttemptID();
	final ExecutionVertex vertex = mockExecutionVertex(attemptId);
	final CheckpointCoordinator coordinator = instantiateCheckpointCoordinator(jobId, vertex);

	coordinator.addMasterHook(stringHook);
	coordinator.addMasterHook(hookWithoutState);
	coordinator.addMasterHook(longHook);

	// triggering must succeed and leave exactly one checkpoint pending
	final boolean triggered = coordinator.triggerCheckpoint(System.currentTimeMillis(), false);
	assertTrue(triggered);
	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

	// each hook has been asked exactly once
	verify(stringHook, times(1)).triggerCheckpoint(anyLong(), anyLong(), any(Executor.class));
	verify(longHook, times(1)).triggerCheckpoint(anyLong(), anyLong(), any(Executor.class));
	verify(hookWithoutState, times(1)).triggerCheckpoint(anyLong(), anyLong(), any(Executor.class));

	// acknowledge the pending checkpoint from the only vertex
	final long pendingId = coordinator.getPendingCheckpoints().values().iterator().next().getCheckpointId();
	final AcknowledgeCheckpoint ack = new AcknowledgeCheckpoint(jobId, attemptId, pendingId);
	coordinator.receiveAcknowledgeMessage(ack, "Unknown location");
	assertEquals(0, coordinator.getNumberOfPendingCheckpoints());

	assertEquals(1, coordinator.getNumberOfRetainedSuccessfulCheckpoints());
	final CompletedCheckpoint completed = coordinator.getCheckpointStore().getLatestCheckpoint(false);

	// exactly two master states are expected, one per hook that returned state
	final Collection<MasterState> hookStates = completed.getMasterHookStates();
	assertEquals(2, hookStates.size());

	for (MasterState hookState : hookStates) {
		final String stateName = hookState.name();

		if (stateName.equals(stringHookId)) {
			assertArrayEquals(expectedStringBytes, hookState.bytes());
			assertEquals(StringSerializer.VERSION, hookState.version());
		}
		else if (stateName.equals(longHookId)) {
			assertArrayEquals(expectedLongBytes, hookState.bytes());
			assertEquals(LongSerializer.VERSION, hookState.version());
		}
		else {
			fail("unrecognized state name: " + stateName);
		}
	}
}

Example #28

Source File: CheckpointCoordinatorFailureTest.java From flink with Apache License 2.0

4 votes

/**
 * Tests that a failure while adding a completed checkpoint to the completed checkpoint store
 * fails the originating pending checkpoint and cleans up the state that was acknowledged
 * for it.
 */
@Test
public void testFailingCompletedCheckpointStoreAdd() throws Exception {
	final JobID jobId = new JobID();
	final ExecutionAttemptID attemptId = new ExecutionAttemptID();
	final ExecutionVertex onlyVertex = CheckpointCoordinatorTest.mockExecutionVertex(attemptId);
	final long triggerTime = 1L;

	final CheckpointFailureManager checkpointFailureManager =
		new CheckpointFailureManager(0, NoOpFailJobCall.INSTANCE);

	// coordinator over a single vertex, backed by a completed checkpoint store that fails
	final CheckpointCoordinatorConfiguration coordinatorConfig = new CheckpointCoordinatorConfiguration(
		600000,
		600000,
		0,
		Integer.MAX_VALUE,
		CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
		true,
		false,
		0);
	final CheckpointCoordinator coordinator = new CheckpointCoordinator(
		jobId,
		coordinatorConfig,
		new ExecutionVertex[]{onlyVertex},
		new ExecutionVertex[]{onlyVertex},
		new ExecutionVertex[]{onlyVertex},
		new StandaloneCheckpointIDCounter(),
		new FailingCompletedCheckpointStore(),
		new MemoryStateBackend(),
		Executors.directExecutor(),
		SharedStateRegistry.DEFAULT_FACTORY,
		checkpointFailureManager);

	// trigger and validate the initial state: one pending, not yet discarded checkpoint
	coordinator.triggerCheckpoint(triggerTime, false);
	assertEquals(1, coordinator.getNumberOfPendingCheckpoints());

	final PendingCheckpoint pending = coordinator.getPendingCheckpoints().values().iterator().next();
	assertFalse(pending.isDiscarded());

	final long pendingId = coordinator.getPendingCheckpoints().keySet().iterator().next();

	// mocked handles make it possible to verify the discard calls afterwards
	final KeyedStateHandle managedKeyed = mock(KeyedStateHandle.class);
	final KeyedStateHandle rawKeyed = mock(KeyedStateHandle.class);
	final OperatorStateHandle managedOperator = mock(OperatorStreamStateHandle.class);
	final OperatorStateHandle rawOperator = mock(OperatorStreamStateHandle.class);

	final OperatorSubtaskState spiedOperatorState = spy(
		new OperatorSubtaskState(managedOperator, rawOperator, managedKeyed, rawKeyed));

	final TaskStateSnapshot spiedSnapshot = spy(new TaskStateSnapshot());
	spiedSnapshot.putSubtaskStateByOperatorID(new OperatorID(), spiedOperatorState);

	// lookups by the operator ID derived from the vertex return the spied operator state
	when(spiedSnapshot.getSubtaskStateByOperatorID(OperatorID.fromJobVertexID(onlyVertex.getJobvertexId())))
		.thenReturn(spiedOperatorState);

	final AcknowledgeCheckpoint ack = new AcknowledgeCheckpoint(
		jobId,
		attemptId,
		pendingId,
		new CheckpointMetrics(),
		spiedSnapshot);

	try {
		coordinator.receiveAcknowledgeMessage(ack, "Unknown location");
		fail("Expected a checkpoint exception because the completed checkpoint store could not " +
			"store the completed checkpoint.");
	} catch (CheckpointException expected) {
		// this is the exception the test is waiting for
	}

	// the pending checkpoint must be discarded once completing it has failed
	assertTrue(pending.isDiscarded());

	// the acknowledged operator state and each handle inside it must be discarded too
	verify(spiedOperatorState).discardState();
	verify(spiedOperatorState.getManagedOperatorState().iterator().next()).discardState();
	verify(spiedOperatorState.getRawOperatorState().iterator().next()).discardState();
	verify(spiedOperatorState.getManagedKeyedState().iterator().next()).discardState();
	verify(spiedOperatorState.getRawKeyedState().iterator().next()).discardState();
}

Example #29

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Triggers one checkpoint on the given coordinator and acknowledges it once per subtask of
 * {@code jobVertex1}, each time with an incremental keyed state handle.
 *
 * <p>Every handle carries one private entry ("private-1") and a shared entry named after
 * {@code cpSequenceNumber}. For sequence numbers above zero it also carries a placeholder
 * entry named after the previous sequence number.
 *
 * @param jid job ID put into the acknowledge messages
 * @param coord coordinator that triggers the checkpoint and receives the acknowledgements
 * @param jobVertex1 job vertex whose subtasks acknowledge the checkpoint
 * @param keyGroupPartitions1 key-group range per subtask, looked up by subtask index
 * @param timestamp timestamp handed to the trigger call
 * @param cpSequenceNumber sequence number used to name the shared state entries
 */
private void performIncrementalCheckpoint(
	JobID jid,
	CheckpointCoordinator coord,
	ExecutionJobVertex jobVertex1,
	List<KeyGroupRange> keyGroupPartitions1,
	long timestamp,
	int cpSequenceNumber) throws Exception {

	coord.triggerCheckpoint(timestamp, false);

	// exactly one checkpoint may be pending; its ID is reused for all acknowledgements
	final boolean singlePending = coord.getPendingCheckpoints().keySet().size() == 1;
	assertTrue(singlePending);
	final long pendingId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());

	for (int subtask = 0; subtask < jobVertex1.getParallelism(); subtask++) {
		final KeyGroupRange range = keyGroupPartitions1.get(subtask);

		// state private to this checkpoint
		final Map<StateHandleID, StreamStateHandle> privateHandles = new HashMap<>();
		final StreamStateHandle privateHandle = spy(new ByteStreamStateHandle("private-1", new byte[]{'p'}));
		privateHandles.put(new StateHandleID("private-1"), privateHandle);

		// shared state: every checkpoint after the first overlaps its predecessor by one
		// entry, which is referenced through a placeholder handle
		final Map<StateHandleID, StreamStateHandle> sharedHandles = new HashMap<>();
		if (cpSequenceNumber > 0) {
			final StreamStateHandle placeholder = spy(new PlaceholderStreamStateHandle());
			sharedHandles.put(new StateHandleID("shared-" + (cpSequenceNumber - 1)), placeholder);
		}

		final StreamStateHandle newShared = spy(
			new ByteStreamStateHandle("shared-" + cpSequenceNumber + "-" + range, new byte[]{'s'}));
		sharedHandles.put(new StateHandleID("shared-" + cpSequenceNumber), newShared);

		final StreamStateHandle metaHandle = spy(new ByteStreamStateHandle("meta", new byte[]{'m'}));
		final IncrementalRemoteKeyedStateHandle incrementalHandle = spy(
			new IncrementalRemoteKeyedStateHandle(
				new UUID(42L, 42L),
				range,
				pendingId,
				sharedHandles,
				privateHandles,
				metaHandle));

		// only the managed keyed state slot is populated
		final OperatorSubtaskState subtaskState = spy(
			new OperatorSubtaskState(
				StateObjectCollection.empty(),
				StateObjectCollection.empty(),
				StateObjectCollection.singleton(incrementalHandle),
				StateObjectCollection.empty()));

		final Map<OperatorID, OperatorSubtaskState> statesByOperator = new HashMap<>();
		statesByOperator.put(jobVertex1.getOperatorIDs().get(0), subtaskState);

		final AcknowledgeCheckpoint ack = new AcknowledgeCheckpoint(
			jid,
			jobVertex1.getTaskVertices()[subtask].getCurrentExecutionAttempt().getAttemptId(),
			pendingId,
			new CheckpointMetrics(),
			new TaskStateSnapshot(statesByOperator));

		coord.receiveAcknowledgeMessage(ack, TASK_MANAGER_LOCATION_INFO);
	}
}

Example #30

Source File: CheckpointCoordinatorTest.java From flink with Apache License 2.0

4 votes

/**
 * Tests the limit on concurrent checkpoints together with subsumption: with two checkpoints
 * pending (IDs 1 and 2), acknowledging checkpoint 2 is expected to subsume checkpoint 1, after
 * which the periodic scheduler is expected to trigger checkpoints 3 and 4.
 *
 * <p>The test polls in 20 ms steps, with a 60 s upper bound per waiting phase, because the
 * checkpoints are triggered by the coordinator's periodic scheduler.
 *
 * @throws Exception any unexpected exception is propagated so that the test framework reports
 *                   it with its full stack trace
 */
@Test
public void testMaxConcurrentAttempsWithSubsumption() throws Exception {
	final int maxConcurrentAttempts = 2;
	final JobID jid = new JobID();

	// create some mock execution vertices and trigger some checkpoint
	final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
	final ExecutionAttemptID ackAttemptID = new ExecutionAttemptID();
	final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();

	ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
	ExecutionVertex ackVertex = mockExecutionVertex(ackAttemptID);
	ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

	CheckpointCoordinatorConfiguration chkConfig = new CheckpointCoordinatorConfiguration(
		10,        // periodic interval is 10 ms
		200000,    // timeout is very long (200 s)
		0L,        // no extra delay
		maxConcurrentAttempts, // max two concurrent checkpoints
		CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
		true,
		false,
		0);
	CheckpointCoordinator coord = new CheckpointCoordinator(
		jid,
		chkConfig,
		new ExecutionVertex[] { triggerVertex },
		new ExecutionVertex[] { ackVertex },
		new ExecutionVertex[] { commitVertex },
		new StandaloneCheckpointIDCounter(),
		new StandaloneCompletedCheckpointStore(2),
		new MemoryStateBackend(),
		Executors.directExecutor(),
		SharedStateRegistry.DEFAULT_FACTORY,
		failureManager);

	// Shut the coordinator down on every exit path. A failed assertion throws an
	// AssertionError, which a catch (Exception) block does not intercept, so without the
	// finally block the shutdown call would be skipped after a failed assertion.
	try {
		coord.startCheckpointScheduler();

		// after a while, there should be exactly as many checkpoints
		// as concurrently permitted
		long now = System.currentTimeMillis();
		long timeout = now + 60000;
		long minDuration = now + 100;
		do {
			Thread.sleep(20);
		}
		while ((now = System.currentTimeMillis()) < minDuration ||
				(coord.getNumberOfPendingCheckpoints() < maxConcurrentAttempts && now < timeout));

		// validate that the pending checkpoints are there
		assertEquals(maxConcurrentAttempts, coord.getNumberOfPendingCheckpoints());
		assertNotNull(coord.getPendingCheckpoints().get(1L));
		assertNotNull(coord.getPendingCheckpoints().get(2L));

		// now we acknowledge the second checkpoint, which should subsume the first checkpoint
		// and allow two more checkpoints to be triggered
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, ackAttemptID, 2L), TASK_MANAGER_LOCATION_INFO);

		// after a while, there should be the new checkpoints
		final long newTimeout = System.currentTimeMillis() + 60000;
		do {
			Thread.sleep(20);
		}
		while (coord.getPendingCheckpoints().get(4L) == null &&
				System.currentTimeMillis() < newTimeout);

		// do the final check
		assertEquals(maxConcurrentAttempts, coord.getNumberOfPendingCheckpoints());
		assertNotNull(coord.getPendingCheckpoints().get(3L));
		assertNotNull(coord.getPendingCheckpoints().get(4L));
	}
	finally {
		coord.shutdown(JobStatus.FINISHED);
	}
}