org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint Java Examples

The following examples show how to use org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: JobMaster.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Routes a decline message to the execution graph's checkpoint coordinator.
 *
 * <p>When a coordinator exists, the message is processed asynchronously via the RPC
 * service's executor and any exception raised there is logged. When there is no
 * coordinator, the situation is logged: at error level while the job is
 * {@code RUNNING}, at debug level otherwise.
 *
 * @param decline the decline message to process
 */
@Override
public void declineCheckpoint(DeclineCheckpoint decline) {
	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();

	if (coordinator == null) {
		final String noCoordinatorMessage = "Received DeclineCheckpoint message for job {} with no CheckpointCoordinator";
		final boolean jobIsRunning = executionGraph.getState() == JobStatus.RUNNING;
		if (jobIsRunning) {
			log.error(noCoordinatorMessage, jobGraph.getJobID());
		} else {
			log.debug(noCoordinatorMessage, jobGraph.getJobID());
		}
		return;
	}

	getRpcService().execute(() -> {
		try {
			coordinator.receiveDeclineMessage(decline);
		} catch (Exception e) {
			log.error("Error in CheckpointCoordinator while processing {}", decline, e);
		}
	});
}
 
Example #2
Source File: SchedulerBase.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Routes a decline message to the execution graph's checkpoint coordinator.
 *
 * <p>The call is asserted to run in the main thread. The sender's location is
 * resolved up front; the coordinator itself is invoked on {@code ioExecutor}, and
 * exceptions raised there are logged. Without a coordinator the message is only
 * logged: at error level while the job is {@code RUNNING}, at debug level otherwise.
 *
 * @param decline the decline message to process
 */
@Override
public void declineCheckpoint(final DeclineCheckpoint decline) {
	mainThreadExecutor.assertRunningInMainThread();

	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();
	final String senderLocation = retrieveTaskManagerLocation(decline.getTaskExecutionId());

	if (coordinator == null) {
		final String noCoordinatorMessage = "Received DeclineCheckpoint message for job {} with no CheckpointCoordinator";
		final boolean jobIsRunning = executionGraph.getState() == JobStatus.RUNNING;
		if (jobIsRunning) {
			log.error(noCoordinatorMessage, jobGraph.getJobID());
		} else {
			log.debug(noCoordinatorMessage, jobGraph.getJobID());
		}
		return;
	}

	ioExecutor.execute(() -> {
		try {
			coordinator.receiveDeclineMessage(decline, senderLocation);
		} catch (Exception e) {
			log.error("Error in CheckpointCoordinator while processing {}", decline, e);
		}
	});
}
 
Example #3
Source File: LegacyScheduler.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Passes a decline message on to the checkpoint coordinator of the execution graph.
 *
 * <p>Must be called from the main thread (asserted). If a coordinator is present,
 * the message and the sender's location are delivered to it on {@code ioExecutor};
 * failures there are logged. If no coordinator is present, that fact is logged at
 * error level for a {@code RUNNING} job and at debug level for any other state.
 *
 * @param decline the decline message to process
 */
@Override
public void declineCheckpoint(final DeclineCheckpoint decline) {
	mainThreadExecutor.assertRunningInMainThread();

	final CheckpointCoordinator coordinator = executionGraph.getCheckpointCoordinator();
	final String locationInfo = retrieveTaskManagerLocation(decline.getTaskExecutionId());

	if (coordinator == null) {
		final String message = "Received DeclineCheckpoint message for job {} with no CheckpointCoordinator";
		if (executionGraph.getState() != JobStatus.RUNNING) {
			log.debug(message, jobGraph.getJobID());
		} else {
			log.error(message, jobGraph.getJobID());
		}
		return;
	}

	final Runnable deliverDecline = () -> {
		try {
			coordinator.receiveDeclineMessage(decline, locationInfo);
		} catch (Exception e) {
			log.error("Error in CheckpointCoordinator while processing {}", decline, e);
		}
	};
	ioExecutor.execute(deliverDecline);
}
 
Example #4
Source File: ActorGatewayCheckpointResponder.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Wraps the given values in a {@link DeclineCheckpoint} message and sends it
 * through {@code actorGateway}.
 *
 * @param jobID job the checkpoint belongs to
 * @param executionAttemptID execution attempt declining the checkpoint
 * @param checkpointId id of the declined checkpoint
 * @param reason why the checkpoint was declined
 */
@Override
public void declineCheckpoint(
		JobID jobID,
		ExecutionAttemptID executionAttemptID,
		long checkpointId,
		Throwable reason) {

	actorGateway.tell(new DeclineCheckpoint(jobID, executionAttemptID, checkpointId, reason));
}
 
Example #5
Source File: CheckpointCoordinatorTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Declines the first pending checkpoint the coordinator reports and returns it.
 *
 * @param jobId job id put into the decline message
 * @param coordinator coordinator that holds the pending checkpoint and receives the decline
 * @param attemptID execution attempt put into the decline message
 * @param reason decline reason put into the decline message
 * @return the pending checkpoint that was looked up before the decline was delivered
 */
private PendingCheckpoint declineSynchronousSavepoint(
		final JobID jobId,
		final CheckpointCoordinator coordinator,
		final ExecutionAttemptID attemptID,
		final Throwable reason) {

	// take the id of whichever pending checkpoint is iterated first
	final long pendingId = coordinator.getPendingCheckpoints().entrySet().iterator().next().getKey();
	final PendingCheckpoint declined = coordinator.getPendingCheckpoints().get(pendingId);

	final DeclineCheckpoint declineMessage = new DeclineCheckpoint(jobId, attemptID, pendingId, reason);
	coordinator.receiveDeclineMessage(declineMessage, TASK_MANAGER_LOCATION_INFO);

	return declined;
}
 
Example #6
Source File: RpcCheckpointResponder.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a {@link DeclineCheckpoint} from the arguments and submits it to
 * {@code checkpointCoordinatorGateway}.
 *
 * @param jobID job the checkpoint belongs to
 * @param executionAttemptID execution attempt declining the checkpoint
 * @param checkpointId id of the declined checkpoint
 * @param cause why the checkpoint was declined
 */
@Override
public void declineCheckpoint(
		JobID jobID,
		ExecutionAttemptID executionAttemptID,
		long checkpointId,
		Throwable cause) {

	final DeclineCheckpoint declineMessage =
		new DeclineCheckpoint(jobID, executionAttemptID, checkpointId, cause);
	checkpointCoordinatorGateway.declineCheckpoint(declineMessage);
}
 
Example #7
Source File: CheckpointCoordinatorTest.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Sends a decline for the first pending checkpoint found in the coordinator and
 * hands that checkpoint back to the caller.
 *
 * @param jobId job id used for the decline message
 * @param coordinator coordinator queried for pending checkpoints and given the decline
 * @param attemptID execution attempt used for the decline message
 * @param reason decline reason used for the decline message
 * @return the pending checkpoint fetched before the decline was sent
 */
private PendingCheckpoint declineSynchronousSavepoint(
		final JobID jobId,
		final CheckpointCoordinator coordinator,
		final ExecutionAttemptID attemptID,
		final Throwable reason) {

	final long firstPendingId = coordinator.getPendingCheckpoints().entrySet().iterator().next().getKey();
	final PendingCheckpoint target = coordinator.getPendingCheckpoints().get(firstPendingId);

	final DeclineCheckpoint decline = new DeclineCheckpoint(jobId, attemptID, firstPendingId, reason);
	coordinator.receiveDeclineMessage(decline, TASK_MANAGER_LOCATION_INFO);

	return target;
}
 
Example #8
Source File: RpcCheckpointResponder.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Reports a declined checkpoint by sending a {@link DeclineCheckpoint} message to
 * {@code checkpointCoordinatorGateway}.
 *
 * @param jobID job the checkpoint belongs to
 * @param executionAttemptID execution attempt declining the checkpoint
 * @param checkpointId id of the declined checkpoint
 * @param cause why the checkpoint was declined
 */
@Override
public void declineCheckpoint(
		JobID jobID,
		ExecutionAttemptID executionAttemptID,
		long checkpointId,
		Throwable cause) {

	final DeclineCheckpoint decline =
		new DeclineCheckpoint(jobID, executionAttemptID, checkpointId, cause);
	checkpointCoordinatorGateway.declineCheckpoint(decline);
}
 
Example #9
Source File: RpcCheckpointResponder.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Packs the arguments into a {@link DeclineCheckpoint} and passes it to
 * {@code checkpointCoordinatorGateway}.
 *
 * @param jobID job the checkpoint belongs to
 * @param executionAttemptID execution attempt declining the checkpoint
 * @param checkpointId id of the declined checkpoint
 * @param cause why the checkpoint was declined
 */
@Override
public void declineCheckpoint(
		JobID jobID,
		ExecutionAttemptID executionAttemptID,
		long checkpointId,
		Throwable cause) {

	final DeclineCheckpoint message =
		new DeclineCheckpoint(jobID, executionAttemptID, checkpointId, cause);
	checkpointCoordinatorGateway.declineCheckpoint(message);
}
 
Example #10
Source File: JobMaster.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Passes the received decline message straight on to {@code schedulerNG}; this
 * method does no handling of its own.
 *
 * @param decline the decline message to forward
 */
@Override
public void declineCheckpoint(DeclineCheckpoint decline) {
	schedulerNG.declineCheckpoint(decline);
}
 
Example #11
Source File: TestingJobMasterGateway.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the testing gateway from a set of caller-supplied callbacks.
 *
 * <p>The constructor performs no logic of its own: every argument is stored
 * unchanged in the field of the same name. All parameters are annotated
 * {@code @Nonnull}; no runtime null check is performed here.
 *
 * <p>NOTE(review): presumably each function/consumer/supplier backs the gateway
 * method its name refers to (e.g. {@code declineCheckpointConsumer} for declined
 * checkpoints) — confirm against the methods of the enclosing class.
 */
public TestingJobMasterGateway(
		@Nonnull String address,
		@Nonnull String hostname,
		@Nonnull Supplier<CompletableFuture<Acknowledge>> cancelFunction,
		@Nonnull Function<TaskExecutionState, CompletableFuture<Acknowledge>> updateTaskExecutionStateFunction,
		@Nonnull BiFunction<JobVertexID, ExecutionAttemptID, CompletableFuture<SerializedInputSplit>> requestNextInputSplitFunction,
		@Nonnull BiFunction<IntermediateDataSetID, ResultPartitionID, CompletableFuture<ExecutionState>> requestPartitionStateFunction,
		@Nonnull Function<ResultPartitionID, CompletableFuture<Acknowledge>> scheduleOrUpdateConsumersFunction,
		@Nonnull Function<ResourceID, CompletableFuture<Acknowledge>> disconnectTaskManagerFunction,
		@Nonnull Consumer<ResourceManagerId> disconnectResourceManagerConsumer,
		@Nonnull BiFunction<ResourceID, Collection<SlotOffer>, CompletableFuture<Collection<SlotOffer>>> offerSlotsFunction,
		@Nonnull TriConsumer<ResourceID, AllocationID, Throwable> failSlotConsumer,
		@Nonnull BiFunction<String, UnresolvedTaskManagerLocation, CompletableFuture<RegistrationResponse>> registerTaskManagerFunction,
		@Nonnull BiConsumer<ResourceID, AccumulatorReport> taskManagerHeartbeatConsumer,
		@Nonnull Consumer<ResourceID> resourceManagerHeartbeatConsumer,
		@Nonnull Supplier<CompletableFuture<JobDetails>> requestJobDetailsSupplier,
		@Nonnull Supplier<CompletableFuture<ArchivedExecutionGraph>> requestJobSupplier,
		@Nonnull BiFunction<String, Boolean, CompletableFuture<String>> triggerSavepointFunction,
		@Nonnull BiFunction<String, Boolean, CompletableFuture<String>> stopWithSavepointFunction,
		@Nonnull Function<JobVertexID, CompletableFuture<OperatorBackPressureStatsResponse>> requestOperatorBackPressureStatsFunction,
		@Nonnull BiConsumer<AllocationID, Throwable> notifyAllocationFailureConsumer,
		@Nonnull Consumer<Tuple5<JobID, ExecutionAttemptID, Long, CheckpointMetrics, TaskStateSnapshot>> acknowledgeCheckpointConsumer,
		@Nonnull Consumer<DeclineCheckpoint> declineCheckpointConsumer,
		@Nonnull Supplier<JobMasterId> fencingTokenSupplier,
		@Nonnull BiFunction<JobID, String, CompletableFuture<KvStateLocation>> requestKvStateLocationFunction,
		@Nonnull Function<Tuple6<JobID, JobVertexID, KeyGroupRange, String, KvStateID, InetSocketAddress>, CompletableFuture<Acknowledge>> notifyKvStateRegisteredFunction,
		@Nonnull Function<Tuple4<JobID, JobVertexID, KeyGroupRange, String>, CompletableFuture<Acknowledge>> notifyKvStateUnregisteredFunction,
		@Nonnull TriFunction<String, Object, byte[], CompletableFuture<Object>> updateAggregateFunction,
		@Nonnull TriFunction<ExecutionAttemptID, OperatorID, SerializedValue<OperatorEvent>, CompletableFuture<Acknowledge>> operatorEventSender,
		@Nonnull BiFunction<OperatorID, SerializedValue<CoordinationRequest>, CompletableFuture<CoordinationResponse>> deliverCoordinationRequestFunction) {
	// plain one-to-one wiring of constructor arguments to fields
	this.address = address;
	this.hostname = hostname;
	this.cancelFunction = cancelFunction;
	this.updateTaskExecutionStateFunction = updateTaskExecutionStateFunction;
	this.requestNextInputSplitFunction = requestNextInputSplitFunction;
	this.requestPartitionStateFunction = requestPartitionStateFunction;
	this.scheduleOrUpdateConsumersFunction = scheduleOrUpdateConsumersFunction;
	this.disconnectTaskManagerFunction = disconnectTaskManagerFunction;
	this.disconnectResourceManagerConsumer = disconnectResourceManagerConsumer;
	this.offerSlotsFunction = offerSlotsFunction;
	this.failSlotConsumer = failSlotConsumer;
	this.registerTaskManagerFunction = registerTaskManagerFunction;
	this.taskManagerHeartbeatConsumer = taskManagerHeartbeatConsumer;
	this.resourceManagerHeartbeatConsumer = resourceManagerHeartbeatConsumer;
	this.requestJobDetailsSupplier = requestJobDetailsSupplier;
	this.requestJobSupplier = requestJobSupplier;
	this.triggerSavepointFunction = triggerSavepointFunction;
	this.stopWithSavepointFunction = stopWithSavepointFunction;
	this.requestOperatorBackPressureStatsFunction = requestOperatorBackPressureStatsFunction;
	this.notifyAllocationFailureConsumer = notifyAllocationFailureConsumer;
	this.acknowledgeCheckpointConsumer = acknowledgeCheckpointConsumer;
	this.declineCheckpointConsumer = declineCheckpointConsumer;
	this.fencingTokenSupplier = fencingTokenSupplier;
	this.requestKvStateLocationFunction = requestKvStateLocationFunction;
	this.notifyKvStateRegisteredFunction = notifyKvStateRegisteredFunction;
	this.notifyKvStateUnregisteredFunction = notifyKvStateUnregisteredFunction;
	this.updateAggregateFunction = updateAggregateFunction;
	this.operatorEventSender = operatorEventSender;
	this.deliverCoordinationRequestFunction = deliverCoordinationRequestFunction;
}
 
Example #12
Source File: TestingJobMasterGatewayBuilder.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Stores the consumer in this builder's {@code declineCheckpointConsumer} field.
 *
 * @param declineCheckpointConsumer consumer of {@link DeclineCheckpoint} messages to store
 * @return this builder, so calls can be chained
 */
public TestingJobMasterGatewayBuilder setDeclineCheckpointConsumer(Consumer<DeclineCheckpoint> declineCheckpointConsumer) {
	this.declineCheckpointConsumer = declineCheckpointConsumer;
	return this;
}
 
Example #13
Source File: CheckpointCoordinator.java    From Flink-CEPplus with Apache License 2.0 4 votes vote down vote up
/**
 * Handles a {@link DeclineCheckpoint} message sent for one of this coordinator's checkpoints.
 *
 * <p>The message is ignored when the coordinator is shut down or the message is
 * {@code null}. Otherwise the referenced checkpoint is removed from the pending
 * checkpoints and discarded; a message for a checkpoint that is no longer pending
 * is only reported at debug level.
 *
 * @param message Checkpoint decline from the task manager
 * @throws IllegalArgumentException if the message was sent for a different job
 * @throws IllegalStateException if the removed pending checkpoint was already discarded
 */
public void receiveDeclineMessage(DeclineCheckpoint message) {
	if (shutdown || message == null) {
		return;
	}

	final boolean forThisJob = job.equals(message.getJob());
	if (!forThisJob) {
		throw new IllegalArgumentException("Received DeclineCheckpoint message for job " +
			message.getJob() + " while this coordinator handles job " + job);
	}

	final long checkpointId = message.getCheckpointId();
	final String reason = (message.getReason() != null ? message.getReason().getMessage() : "");

	synchronized (lock) {
		// the shutdown flag has to be re-checked while holding the lock, otherwise
		// we get races and invalid error log messages
		if (shutdown) {
			return;
		}

		final PendingCheckpoint pending = pendingCheckpoints.remove(checkpointId);

		if (pending != null) {
			if (pending.isDiscarded()) {
				// this should not happen
				throw new IllegalStateException(
						"Received message for discarded but non-removed checkpoint " + checkpointId);
			}

			LOG.info("Decline checkpoint {} by task {} of job {}.", checkpointId, message.getTaskExecutionId(), job);
			discardCheckpoint(pending, message.getReason());
		}
		else if (LOG.isDebugEnabled()) {
			// recently seen id: the message comes too late (checkpoint disposed);
			// otherwise it might be so old that we don't even remember it any more
			final String template = recentPendingCheckpoints.contains(checkpointId)
					? "Received another decline message for now expired checkpoint attempt {} of job {} : {}"
					: "Received decline message for unknown (too old?) checkpoint attempt {} of job {} : {}";
			LOG.debug(template, checkpointId, job, reason);
		}
	}
}
 
Example #14
Source File: CheckpointCoordinatorTest.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Checks the coordinator's request queue when a savepoint is requested while
 * checkpoint requests are already queued, with unaligned checkpoints enabled and
 * at most one concurrent checkpoint.
 *
 * <p>After checkpoint 1 is declined, the test expects exactly one checkpoint future
 * to be done, the savepoint future to be still open, and the single pending
 * checkpoint to be a savepoint that is not forced.
 */
@Test
public void testSavepointScheduledInUnalignedMode() throws Exception {
	int maxConcurrentCheckpoints = 1;
	int checkpointRequestsToSend = 10;
	int activeRequests = 0;
	JobID jobId = new JobID();
	CheckpointCoordinator coordinator = new CheckpointCoordinatorBuilder()
		.setCheckpointCoordinatorConfiguration(CheckpointCoordinatorConfiguration
			.builder()
			.setUnalignedCheckpointsEnabled(true)
			.setMaxConcurrentCheckpoints(maxConcurrentCheckpoints)
			.build())
		.setJobId(jobId)
		.setTimer(manuallyTriggeredScheduledExecutor)
		.build();
	try {
		List<Future<?>> checkpointFutures = new ArrayList<>(checkpointRequestsToSend);
		coordinator.startCheckpointScheduler();
		// issue all checkpoint requests up front, counting each one
		while (activeRequests < checkpointRequestsToSend) {
			checkpointFutures.add(coordinator.triggerCheckpoint(true));
			activeRequests++;
		}
		// everything beyond the concurrency limit is expected to sit in the queue
		assertEquals(activeRequests - maxConcurrentCheckpoints, coordinator.getNumQueuedRequests());

		Future<?> savepointFuture = coordinator.triggerSavepoint("/tmp");
		manuallyTriggeredScheduledExecutor.triggerAll();
		// the pre-increment counts the savepoint request as one more queued request
		assertEquals(++activeRequests - maxConcurrentCheckpoints, coordinator.getNumQueuedRequests());

		// decline checkpoint 1 on behalf of a fresh execution attempt
		coordinator.receiveDeclineMessage(new DeclineCheckpoint(jobId, new ExecutionAttemptID(), 1L), "none");
		manuallyTriggeredScheduledExecutor.triggerAll();

		activeRequests--; // savepoint triggered
		assertEquals(activeRequests - maxConcurrentCheckpoints , coordinator.getNumQueuedRequests());
		assertEquals(1, checkpointFutures.stream().filter(Future::isDone).count());

		assertFalse(savepointFuture.isDone());
		assertEquals(maxConcurrentCheckpoints, coordinator.getNumberOfPendingCheckpoints());
		// the one pending checkpoint must be the savepoint, and it must not be forced
		CheckpointProperties props = coordinator.getPendingCheckpoints().values().iterator().next().getProps();
		assertTrue(props.isSavepoint());
		assertFalse(props.forceCheckpoint());
	} finally {
		// shut the coordinator down even when an assertion above fails
		coordinator.shutdown(JobStatus.FINISHED);
	}
}
 
Example #15
Source File: CheckpointCoordinatorTest.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * This test triggers a checkpoint and then sends a decline checkpoint message from
 * one of the tasks. The expected behaviour is that said checkpoint is discarded, its
 * scheduled canceller is removed, no new checkpoint becomes pending, and further
 * decline messages for the same checkpoint have no effect.
 *
 * @throws Exception if any step fails unexpectedly; the exception is propagated so the
 *     test framework reports it with its full stack trace
 */
@Test
public void testTriggerAndDeclineCheckpointSimple() throws Exception {
	final JobID jid = new JobID();

	// create some mock Execution vertices that receive the checkpoint trigger messages
	final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
	final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
	ExecutionVertex vertex1 = mockExecutionVertex(attemptID1);
	ExecutionVertex vertex2 = mockExecutionVertex(attemptID2);

	// set up the coordinator and validate the initial state
	CheckpointCoordinator coord = getCheckpointCoordinator(jid, vertex1, vertex2);

	try {
		assertEquals(0, coord.getNumberOfPendingCheckpoints());
		assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

		// trigger the first checkpoint. this should succeed
		final CompletableFuture<CompletedCheckpoint> checkpointFuture = coord.triggerCheckpoint(false);
		manuallyTriggeredScheduledExecutor.triggerAll();
		assertFalse(checkpointFuture.isCompletedExceptionally());

		// validate that we have a pending checkpoint
		assertEquals(1, coord.getNumberOfPendingCheckpoints());
		assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

		// we have one task scheduled that will cancel after timeout
		assertEquals(1, manuallyTriggeredScheduledExecutor.getScheduledTasks().size());

		long checkpointId = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
		PendingCheckpoint checkpoint = coord.getPendingCheckpoints().get(checkpointId);

		assertNotNull(checkpoint);
		assertEquals(checkpointId, checkpoint.getCheckpointId());
		assertEquals(jid, checkpoint.getJobId());
		assertEquals(2, checkpoint.getNumberOfNonAcknowledgedTasks());
		assertEquals(0, checkpoint.getNumberOfAcknowledgedTasks());
		assertEquals(0, checkpoint.getOperatorStates().size());
		assertFalse(checkpoint.isDiscarded());
		assertFalse(checkpoint.areTasksFullyAcknowledged());

		// check that the vertices received the trigger checkpoint message
		verify(vertex1.getCurrentExecutionAttempt()).triggerCheckpoint(checkpointId, checkpoint.getCheckpointTimestamp(), CheckpointOptions.forCheckpointWithDefaultLocation());
		verify(vertex2.getCurrentExecutionAttempt()).triggerCheckpoint(checkpointId, checkpoint.getCheckpointTimestamp(), CheckpointOptions.forCheckpointWithDefaultLocation());

		// acknowledge from one of the tasks
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId), "Unknown location");
		assertEquals(1, checkpoint.getNumberOfAcknowledgedTasks());
		assertEquals(1, checkpoint.getNumberOfNonAcknowledgedTasks());
		assertFalse(checkpoint.isDiscarded());
		assertFalse(checkpoint.areTasksFullyAcknowledged());

		// acknowledge the same task again (should not matter)
		coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId), "Unknown location");
		assertFalse(checkpoint.isDiscarded());
		assertFalse(checkpoint.areTasksFullyAcknowledged());

		// decline checkpoint from the other task, this should cancel the checkpoint
		coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpointId), TASK_MANAGER_LOCATION_INFO);
		assertTrue(checkpoint.isDiscarded());

		// the canceler is also removed
		assertEquals(0, manuallyTriggeredScheduledExecutor.getScheduledTasks().size());

		// validate that we have no new pending checkpoint
		assertEquals(0, coord.getNumberOfPendingCheckpoints());
		assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

		// decline again, nothing should happen
		// decline from the other task, nothing should happen
		coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpointId), TASK_MANAGER_LOCATION_INFO);
		coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID2, checkpointId), TASK_MANAGER_LOCATION_INFO);
		assertTrue(checkpoint.isDiscarded());
	} finally {
		// always release the coordinator, even when an assertion above fails
		coord.shutdown(JobStatus.FINISHED);
	}
}
 
Example #16
Source File: TestingJobMasterGatewayBuilder.java    From Flink-CEPplus with Apache License 2.0 4 votes vote down vote up
/**
 * Records the given consumer in the builder's {@code declineCheckpointConsumer} field.
 *
 * @param declineCheckpointConsumer consumer of {@link DeclineCheckpoint} messages to record
 * @return this builder, to allow call chaining
 */
public TestingJobMasterGatewayBuilder setDeclineCheckpointConsumer(Consumer<DeclineCheckpoint> declineCheckpointConsumer) {
	this.declineCheckpointConsumer = declineCheckpointConsumer;
	return this;
}
 
Example #17
Source File: JobMaster.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Delegates the decline message to {@code schedulerNG} without any further
 * processing in this method.
 *
 * @param decline the decline message to delegate
 */
@Override
public void declineCheckpoint(DeclineCheckpoint decline) {
	schedulerNG.declineCheckpoint(decline);
}
 
Example #18
Source File: CheckpointCoordinator.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Processes a {@link DeclineCheckpoint} message that a task sent for a pending checkpoint.
 *
 * <p>Nothing happens when the coordinator is shut down or the message is {@code null}.
 * A checkpoint that is still pending is aborted; a message for any other checkpoint
 * is only reported at debug level.
 *
 * @param message Checkpoint decline from the task manager
 * @param taskManagerLocationInfo The location info of the decline checkpoint message's sender
 * @throws IllegalArgumentException if the message was sent for a different job
 */
public void receiveDeclineMessage(DeclineCheckpoint message, String taskManagerLocationInfo) {
	if (shutdown || message == null) {
		return;
	}

	final boolean forThisJob = job.equals(message.getJob());
	if (!forThisJob) {
		throw new IllegalArgumentException("Received DeclineCheckpoint message for job " +
			message.getJob() + " from " + taskManagerLocationInfo + " while this coordinator handles job " + job);
	}

	final long checkpointId = message.getCheckpointId();
	final String reason = (message.getReason() != null ? message.getReason().getMessage() : "");

	synchronized (lock) {
		// the shutdown flag has to be re-checked while holding the lock, otherwise
		// we get races and invalid error log messages
		if (shutdown) {
			return;
		}

		final PendingCheckpoint pending = pendingCheckpoints.get(checkpointId);

		if (pending == null) {
			if (LOG.isDebugEnabled()) {
				// recently seen id: the message comes too late (checkpoint disposed);
				// otherwise it might be so old that we don't even remember it any more
				final String template = recentPendingCheckpoints.contains(checkpointId)
						? "Received another decline message for now expired checkpoint attempt {} from task {} of job {} at {} : {}"
						: "Received decline message for unknown (too old?) checkpoint attempt {} from task {} of job {} at {} : {}";
				LOG.debug(template,
						checkpointId, message.getTaskExecutionId(), job, taskManagerLocationInfo, reason);
			}
			return;
		}

		Preconditions.checkState(
			!pending.isDiscarded(),
			"Received message for discarded but non-removed checkpoint " + checkpointId);
		LOG.info("Decline checkpoint {} by task {} of job {} at {}.",
			checkpointId,
			message.getTaskExecutionId(),
			job,
			taskManagerLocationInfo);

		// a decline without a reason is reported as CHECKPOINT_DECLINED, one with a
		// reason is wrapped as JOB_FAILURE
		final CheckpointException declineCause = message.getReason() == null
			? new CheckpointException(CheckpointFailureReason.CHECKPOINT_DECLINED)
			: getCheckpointException(CheckpointFailureReason.JOB_FAILURE, message.getReason());

		abortPendingCheckpoint(
			pending,
			declineCause,
			message.getTaskExecutionId());
	}
}
 
Example #19
Source File: TestingJobMasterGateway.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the testing gateway from a set of caller-supplied callbacks.
 *
 * <p>The constructor contains no logic beyond wiring: each argument is assigned
 * unchanged to the field of the same name. All parameters are annotated
 * {@code @Nonnull}; no runtime null check is performed here.
 *
 * <p>NOTE(review): presumably each function/consumer/supplier backs the gateway
 * method its name refers to — confirm against the methods of the enclosing class.
 */
public TestingJobMasterGateway(
		@Nonnull String address,
		@Nonnull String hostname,
		@Nonnull Supplier<CompletableFuture<Acknowledge>> cancelFunction,
		@Nonnull Function<TaskExecutionState, CompletableFuture<Acknowledge>> updateTaskExecutionStateFunction,
		@Nonnull BiFunction<JobVertexID, ExecutionAttemptID, CompletableFuture<SerializedInputSplit>> requestNextInputSplitFunction,
		@Nonnull BiFunction<IntermediateDataSetID, ResultPartitionID, CompletableFuture<ExecutionState>> requestPartitionStateFunction,
		@Nonnull Function<ResultPartitionID, CompletableFuture<Acknowledge>> scheduleOrUpdateConsumersFunction,
		@Nonnull Function<ResourceID, CompletableFuture<Acknowledge>> disconnectTaskManagerFunction,
		@Nonnull Consumer<ResourceManagerId> disconnectResourceManagerConsumer,
		@Nonnull Supplier<CompletableFuture<ClassloadingProps>> classloadingPropsSupplier,
		@Nonnull BiFunction<ResourceID, Collection<SlotOffer>, CompletableFuture<Collection<SlotOffer>>> offerSlotsFunction,
		@Nonnull TriConsumer<ResourceID, AllocationID, Throwable> failSlotConsumer,
		@Nonnull BiFunction<String, TaskManagerLocation, CompletableFuture<RegistrationResponse>> registerTaskManagerFunction,
		@Nonnull BiConsumer<ResourceID, AccumulatorReport> taskManagerHeartbeatConsumer,
		@Nonnull Consumer<ResourceID> resourceManagerHeartbeatConsumer,
		@Nonnull Supplier<CompletableFuture<JobDetails>> requestJobDetailsSupplier,
		@Nonnull Supplier<CompletableFuture<ArchivedExecutionGraph>> requestJobSupplier,
		@Nonnull BiFunction<String, Boolean, CompletableFuture<String>> triggerSavepointFunction,
		@Nonnull BiFunction<String, Boolean, CompletableFuture<String>> stopWithSavepointFunction,
		@Nonnull Function<JobVertexID, CompletableFuture<OperatorBackPressureStatsResponse>> requestOperatorBackPressureStatsFunction,
		@Nonnull BiConsumer<AllocationID, Throwable> notifyAllocationFailureConsumer,
		@Nonnull Consumer<Tuple5<JobID, ExecutionAttemptID, Long, CheckpointMetrics, TaskStateSnapshot>> acknowledgeCheckpointConsumer,
		@Nonnull Consumer<DeclineCheckpoint> declineCheckpointConsumer,
		@Nonnull Supplier<JobMasterId> fencingTokenSupplier,
		@Nonnull BiFunction<JobID, String, CompletableFuture<KvStateLocation>> requestKvStateLocationFunction,
		@Nonnull Function<Tuple6<JobID, JobVertexID, KeyGroupRange, String, KvStateID, InetSocketAddress>, CompletableFuture<Acknowledge>> notifyKvStateRegisteredFunction,
		@Nonnull Function<Tuple4<JobID, JobVertexID, KeyGroupRange, String>, CompletableFuture<Acknowledge>> notifyKvStateUnregisteredFunction,
		@Nonnull TriFunction<String, Object, byte[], CompletableFuture<Object>> updateAggregateFunction) {
	// plain one-to-one wiring of constructor arguments to fields
	this.address = address;
	this.hostname = hostname;
	this.cancelFunction = cancelFunction;
	this.updateTaskExecutionStateFunction = updateTaskExecutionStateFunction;
	this.requestNextInputSplitFunction = requestNextInputSplitFunction;
	this.requestPartitionStateFunction = requestPartitionStateFunction;
	this.scheduleOrUpdateConsumersFunction = scheduleOrUpdateConsumersFunction;
	this.disconnectTaskManagerFunction = disconnectTaskManagerFunction;
	this.disconnectResourceManagerConsumer = disconnectResourceManagerConsumer;
	this.classloadingPropsSupplier = classloadingPropsSupplier;
	this.offerSlotsFunction = offerSlotsFunction;
	this.failSlotConsumer = failSlotConsumer;
	this.registerTaskManagerFunction = registerTaskManagerFunction;
	this.taskManagerHeartbeatConsumer = taskManagerHeartbeatConsumer;
	this.resourceManagerHeartbeatConsumer = resourceManagerHeartbeatConsumer;
	this.requestJobDetailsSupplier = requestJobDetailsSupplier;
	this.requestJobSupplier = requestJobSupplier;
	this.triggerSavepointFunction = triggerSavepointFunction;
	this.stopWithSavepointFunction = stopWithSavepointFunction;
	this.requestOperatorBackPressureStatsFunction = requestOperatorBackPressureStatsFunction;
	this.notifyAllocationFailureConsumer = notifyAllocationFailureConsumer;
	this.acknowledgeCheckpointConsumer = acknowledgeCheckpointConsumer;
	this.declineCheckpointConsumer = declineCheckpointConsumer;
	this.fencingTokenSupplier = fencingTokenSupplier;
	this.requestKvStateLocationFunction = requestKvStateLocationFunction;
	this.notifyKvStateRegisteredFunction = notifyKvStateRegisteredFunction;
	this.notifyKvStateUnregisteredFunction = notifyKvStateUnregisteredFunction;
	this.updateAggregateFunction = updateAggregateFunction;
}
 
Example #20
Source File: TestingJobMasterGatewayBuilder.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Assigns the given consumer to the builder's {@code declineCheckpointConsumer} field.
 *
 * @param declineCheckpointConsumer consumer of {@link DeclineCheckpoint} messages to assign
 * @return this builder, enabling fluent chaining
 */
public TestingJobMasterGatewayBuilder setDeclineCheckpointConsumer(Consumer<DeclineCheckpoint> declineCheckpointConsumer) {
	this.declineCheckpointConsumer = declineCheckpointConsumer;
	return this;
}
 
Example #21
Source File: TestingJobMasterGateway.java    From Flink-CEPplus with Apache License 2.0 4 votes vote down vote up
/**
 * Creates the testing gateway from a set of caller-supplied callbacks.
 *
 * <p>The constructor only wires its inputs: each argument is assigned unchanged to
 * the field of the same name. All parameters are annotated {@code @Nonnull}; no
 * runtime null check is performed here.
 *
 * <p>NOTE(review): presumably each function/consumer/supplier backs the gateway
 * method its name refers to — confirm against the methods of the enclosing class.
 */
public TestingJobMasterGateway(
		@Nonnull String address,
		@Nonnull String hostname,
		@Nonnull Supplier<CompletableFuture<Acknowledge>> cancelFunction,
		@Nonnull Supplier<CompletableFuture<Acknowledge>> stopFunction,
		@Nonnull BiFunction<Integer, RescalingBehaviour, CompletableFuture<Acknowledge>> rescalingJobFunction,
		@Nonnull TriFunction<Collection<JobVertexID>, Integer, RescalingBehaviour, CompletableFuture<Acknowledge>> rescalingOperatorsFunction,
		@Nonnull Function<TaskExecutionState, CompletableFuture<Acknowledge>> updateTaskExecutionStateFunction,
		@Nonnull BiFunction<JobVertexID, ExecutionAttemptID, CompletableFuture<SerializedInputSplit>> requestNextInputSplitFunction,
		@Nonnull BiFunction<IntermediateDataSetID, ResultPartitionID, CompletableFuture<ExecutionState>> requestPartitionStateFunction,
		@Nonnull Function<ResultPartitionID, CompletableFuture<Acknowledge>> scheduleOrUpdateConsumersFunction,
		@Nonnull Function<ResourceID, CompletableFuture<Acknowledge>> disconnectTaskManagerFunction,
		@Nonnull Consumer<ResourceManagerId> disconnectResourceManagerConsumer,
		@Nonnull Supplier<CompletableFuture<ClassloadingProps>> classloadingPropsSupplier,
		@Nonnull BiFunction<ResourceID, Collection<SlotOffer>, CompletableFuture<Collection<SlotOffer>>> offerSlotsFunction,
		@Nonnull TriConsumer<ResourceID, AllocationID, Throwable> failSlotConsumer,
		@Nonnull BiFunction<String, TaskManagerLocation, CompletableFuture<RegistrationResponse>> registerTaskManagerFunction,
		@Nonnull BiConsumer<ResourceID, AccumulatorReport> taskManagerHeartbeatConsumer,
		@Nonnull Consumer<ResourceID> resourceManagerHeartbeatConsumer,
		@Nonnull Supplier<CompletableFuture<JobDetails>> requestJobDetailsSupplier,
		@Nonnull Supplier<CompletableFuture<ArchivedExecutionGraph>> requestJobSupplier,
		@Nonnull BiFunction<String, Boolean, CompletableFuture<String>> triggerSavepointFunction,
		@Nonnull Function<JobVertexID, CompletableFuture<OperatorBackPressureStatsResponse>> requestOperatorBackPressureStatsFunction,
		@Nonnull BiConsumer<AllocationID, Throwable> notifyAllocationFailureConsumer,
		@Nonnull Consumer<Tuple5<JobID, ExecutionAttemptID, Long, CheckpointMetrics, TaskStateSnapshot>> acknowledgeCheckpointConsumer,
		@Nonnull Consumer<DeclineCheckpoint> declineCheckpointConsumer,
		@Nonnull Supplier<JobMasterId> fencingTokenSupplier,
		@Nonnull BiFunction<JobID, String, CompletableFuture<KvStateLocation>> requestKvStateLocationFunction,
		@Nonnull Function<Tuple6<JobID, JobVertexID, KeyGroupRange, String, KvStateID, InetSocketAddress>, CompletableFuture<Acknowledge>> notifyKvStateRegisteredFunction,
		@Nonnull Function<Tuple4<JobID, JobVertexID, KeyGroupRange, String>, CompletableFuture<Acknowledge>> notifyKvStateUnregisteredFunction,
		@Nonnull TriFunction<String, Object, byte[], CompletableFuture<Object>> updateAggregateFunction) {
	// plain one-to-one wiring of constructor arguments to fields
	this.address = address;
	this.hostname = hostname;
	this.cancelFunction = cancelFunction;
	this.stopFunction = stopFunction;
	this.rescalingJobFunction = rescalingJobFunction;
	this.rescalingOperatorsFunction = rescalingOperatorsFunction;
	this.updateTaskExecutionStateFunction = updateTaskExecutionStateFunction;
	this.requestNextInputSplitFunction = requestNextInputSplitFunction;
	this.requestPartitionStateFunction = requestPartitionStateFunction;
	this.scheduleOrUpdateConsumersFunction = scheduleOrUpdateConsumersFunction;
	this.disconnectTaskManagerFunction = disconnectTaskManagerFunction;
	this.disconnectResourceManagerConsumer = disconnectResourceManagerConsumer;
	this.classloadingPropsSupplier = classloadingPropsSupplier;
	this.offerSlotsFunction = offerSlotsFunction;
	this.failSlotConsumer = failSlotConsumer;
	this.registerTaskManagerFunction = registerTaskManagerFunction;
	this.taskManagerHeartbeatConsumer = taskManagerHeartbeatConsumer;
	this.resourceManagerHeartbeatConsumer = resourceManagerHeartbeatConsumer;
	this.requestJobDetailsSupplier = requestJobDetailsSupplier;
	this.requestJobSupplier = requestJobSupplier;
	this.triggerSavepointFunction = triggerSavepointFunction;
	this.requestOperatorBackPressureStatsFunction = requestOperatorBackPressureStatsFunction;
	this.notifyAllocationFailureConsumer = notifyAllocationFailureConsumer;
	this.acknowledgeCheckpointConsumer = acknowledgeCheckpointConsumer;
	this.declineCheckpointConsumer = declineCheckpointConsumer;
	this.fencingTokenSupplier = fencingTokenSupplier;
	this.requestKvStateLocationFunction = requestKvStateLocationFunction;
	this.notifyKvStateRegisteredFunction = notifyKvStateRegisteredFunction;
	this.notifyKvStateUnregisteredFunction = notifyKvStateUnregisteredFunction;
	this.updateAggregateFunction = updateAggregateFunction;
}
 
Example #22
Source File: CheckpointCoordinatorTest.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Triggers a checkpoint, acknowledges it from one task and then declines it from the
 * other task.
 *
 * <p>The test verifies that the declined checkpoint is discarded, that the number of
 * scheduled tasks drops back to zero, that no pending checkpoint is left afterwards, and
 * that further decline messages for the same checkpoint do not change that outcome.
 *
 * @throws Exception if the coordinator fails unexpectedly; the exception is propagated so
 *                   that the test runner reports it together with its full stack trace
 */
@Test
public void testTriggerAndDeclineCheckpointSimple() throws Exception {
	final JobID jid = new JobID();
	final long timestamp = System.currentTimeMillis();

	// create some mock Execution vertices that receive the checkpoint trigger messages
	final ExecutionAttemptID attemptID1 = new ExecutionAttemptID();
	final ExecutionAttemptID attemptID2 = new ExecutionAttemptID();
	ExecutionVertex vertex1 = mockExecutionVertex(attemptID1);
	ExecutionVertex vertex2 = mockExecutionVertex(attemptID2);

	// set up the coordinator and validate the initial state
	CheckpointCoordinator coord = getCheckpointCoordinator(jid, vertex1, vertex2, failureManager);

	assertEquals(0, coord.getNumberOfPendingCheckpoints());
	assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

	// trigger the first checkpoint. this should succeed
	assertTrue(coord.triggerCheckpoint(timestamp, false));

	// validate that we have a pending checkpoint
	assertEquals(1, coord.getNumberOfPendingCheckpoints());
	assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

	// we have one task scheduled that will cancel after timeout
	assertEquals(1, coord.getNumScheduledTasks());

	long checkpointId = coord.getPendingCheckpoints().entrySet().iterator().next().getKey();
	PendingCheckpoint checkpoint = coord.getPendingCheckpoints().get(checkpointId);

	assertNotNull(checkpoint);
	assertEquals(checkpointId, checkpoint.getCheckpointId());
	assertEquals(timestamp, checkpoint.getCheckpointTimestamp());
	assertEquals(jid, checkpoint.getJobId());
	assertEquals(2, checkpoint.getNumberOfNonAcknowledgedTasks());
	assertEquals(0, checkpoint.getNumberOfAcknowledgedTasks());
	assertEquals(0, checkpoint.getOperatorStates().size());
	assertFalse(checkpoint.isDiscarded());
	assertFalse(checkpoint.isFullyAcknowledged());

	// check that the vertices received the trigger checkpoint message
	verify(vertex1.getCurrentExecutionAttempt()).triggerCheckpoint(checkpointId, timestamp, CheckpointOptions.forCheckpointWithDefaultLocation());
	verify(vertex2.getCurrentExecutionAttempt()).triggerCheckpoint(checkpointId, timestamp, CheckpointOptions.forCheckpointWithDefaultLocation());

	// acknowledge from one of the tasks
	coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId), "Unknown location");
	assertEquals(1, checkpoint.getNumberOfAcknowledgedTasks());
	assertEquals(1, checkpoint.getNumberOfNonAcknowledgedTasks());
	assertFalse(checkpoint.isDiscarded());
	assertFalse(checkpoint.isFullyAcknowledged());

	// acknowledge the same task again (should not matter)
	coord.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(jid, attemptID2, checkpointId), "Unknown location");
	assertFalse(checkpoint.isDiscarded());
	assertFalse(checkpoint.isFullyAcknowledged());

	// decline checkpoint from the other task, this should discard the checkpoint
	coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpointId), TASK_MANAGER_LOCATION_INFO);
	assertTrue(checkpoint.isDiscarded());

	// the canceler is also removed
	assertEquals(0, coord.getNumScheduledTasks());

	// validate that we have no new pending checkpoint
	assertEquals(0, coord.getNumberOfPendingCheckpoints());
	assertEquals(0, coord.getNumberOfRetainedSuccessfulCheckpoints());

	// decline again from the same task and then from the other task;
	// the checkpoint is already gone, so nothing should happen
	coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID1, checkpointId), TASK_MANAGER_LOCATION_INFO);
	coord.receiveDeclineMessage(new DeclineCheckpoint(jid, attemptID2, checkpointId), TASK_MANAGER_LOCATION_INFO);
	assertTrue(checkpoint.isDiscarded());

	coord.shutdown(JobStatus.FINISHED);
}
 
Example #23
Source File: CheckpointCoordinator.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Handles a {@link DeclineCheckpoint} message sent for one of the pending checkpoints.
 *
 * <p>The call is a no-op when the coordinator is shut down or the message is {@code null}.
 * Otherwise the pending checkpoint with the declined ID is removed; if it was still known
 * to this coordinator it is discarded with the reason carried by the message, and if it
 * was not known the message is only reported on debug level.
 *
 * @param message Checkpoint decline from the task manager
 * @param taskManagerLocationInfo The location info of the decline checkpoint message's sender
 * @throws IllegalArgumentException if the message was sent for a different job than the one
 *                                  this coordinator handles
 * @throws IllegalStateException if the removed pending checkpoint had already been discarded
 */
public void receiveDeclineMessage(DeclineCheckpoint message, String taskManagerLocationInfo) {
	if (shutdown) {
		return;
	}
	if (message == null) {
		return;
	}

	if (!job.equals(message.getJob())) {
		final String wrongJobMessage = "Received DeclineCheckpoint message for job " +
			message.getJob() + " from " + taskManagerLocationInfo + " while this coordinator handles job " + job;
		throw new IllegalArgumentException(wrongJobMessage);
	}

	final long declinedId = message.getCheckpointId();

	final String declineReason;
	if (message.getReason() == null) {
		declineReason = "";
	} else {
		declineReason = message.getReason().getMessage();
	}

	synchronized (lock) {
		// the shutdown flag has to be re-checked while holding the lock, otherwise we
		// get races and invalid error log messages
		if (shutdown) {
			return;
		}

		final PendingCheckpoint declined = pendingCheckpoints.remove(declinedId);

		if (declined == null) {
			// nothing pending under that ID: the message is late or refers to a checkpoint
			// that is not remembered any more, which is only worth a debug message
			if (LOG.isDebugEnabled()) {
				final String template = recentPendingCheckpoints.contains(declinedId)
					? "Received another decline message for now expired checkpoint attempt {} from task {} of job {} at {} : {}"
					: "Received decline message for unknown (too old?) checkpoint attempt {} from task {} of job {} at {} : {}";
				LOG.debug(template,
						declinedId, message.getTaskExecutionId(), job, taskManagerLocationInfo, declineReason);
			}
			return;
		}

		if (declined.isDiscarded()) {
			// this should not happen
			throw new IllegalStateException(
					"Received message for discarded but non-removed checkpoint " + declinedId);
		}

		LOG.info("Decline checkpoint {} by task {} of job {} at {}.",
			declinedId,
			message.getTaskExecutionId(),
			job,
			taskManagerLocationInfo);
		discardCheckpoint(declined, message.getReason(), message.getTaskExecutionId());
	}
}
 
Example #24
Source File: SchedulerNG.java    From flink with Apache License 2.0 votes vote down vote up
/**
 * Passes a {@link DeclineCheckpoint} message to this scheduler.
 *
 * <p>NOTE(review): the handling contract is not visible from this declaration; presumably
 * implementations forward the message to the job's checkpoint coordinator. Confirm against
 * the implementing classes.
 *
 * @param decline the decline-checkpoint message to process
 */
void declineCheckpoint(DeclineCheckpoint decline); 
Example #25
Source File: SchedulerNG.java    From flink with Apache License 2.0 votes vote down vote up
/**
 * Passes a {@link DeclineCheckpoint} message to this scheduler.
 *
 * <p>NOTE(review): the handling contract is not visible from this declaration; presumably
 * implementations forward the message to the job's checkpoint coordinator. Confirm against
 * the implementing classes.
 *
 * @param decline the decline-checkpoint message to process
 */
void declineCheckpoint(DeclineCheckpoint decline);