Java Code Examples for org.apache.flink.runtime.jobgraph.JobGraph#getJobID()

The following examples show how to use org.apache.flink.runtime.jobgraph.JobGraph#getJobID(). Each example is taken from an open-source project; the source file, originating project, and license are noted above each snippet.
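As a minimal orientation before the project examples, the sketch below (illustrative class and job name, not from any of the projects) shows that a JobGraph constructed without an explicit JobID is assigned a randomly generated one, which getJobID() then returns:

import org.apache.flink.api.common.JobID;
import org.apache.flink.runtime.jobgraph.JobGraph;

public class GetJobIdSketch {
	public static void main(String[] args) {
		// A JobGraph created without an explicit JobID gets a randomly generated one.
		JobGraph jobGraph = new JobGraph("illustrative-job");

		// getJobID() returns the id under which the cluster tracks this job.
		JobID jobId = jobGraph.getJobID();
		System.out.println("JobID: " + jobId);
	}
}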
Example 1
Source File: SavepointITCase.java    From flink with Apache License 2.0
private String submitJobAndTakeSavepoint(MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
	final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
	final JobID jobId = jobGraph.getJobID();
	StatefulCounter.resetForTest(parallelism);

	MiniClusterWithClientResource cluster = clusterFactory.get();
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {
		client.setDetached(true);
		client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

		StatefulCounter.getProgressLatch().await();

		return client.cancelWithSavepoint(jobId, null);
	} finally {
		cluster.after();
		StatefulCounter.resetForTest(parallelism);
	}
}
 
Example 2
Source File: Dispatcher.java    From flink with Apache License 2.0
private CompletableFuture<Boolean> tryRunRecoveredJobGraph(JobGraph jobGraph, DispatcherId dispatcherId) throws Exception {
	if (leaderElectionService.hasLeadership(dispatcherId.toUUID())) {
		final JobID jobId = jobGraph.getJobID();
		if (jobManagerRunnerFutures.containsKey(jobId)) {
			// we must not release the job graph lock since it can only be locked once and
			// is currently being executed. Once we support multiple locks, we must release
			// the JobGraph here
			log.debug("Ignore added JobGraph because the job {} is already running.", jobId);
			return CompletableFuture.completedFuture(true);
		} else if (runningJobsRegistry.getJobSchedulingStatus(jobId) != RunningJobsRegistry.JobSchedulingStatus.DONE) {
			return waitForTerminatingJobManager(jobId, jobGraph, this::runJob).thenApply(ignored -> true);
		} else {
			log.debug("Ignore added JobGraph because the job {} has already been completed.", jobId);
		}
	}

	return CompletableFuture.completedFuture(false);
}
 
Example 3
Source File: DefaultSchedulerTest.java    From flink with Apache License 2.0
@Test
public void cancelWhileRestartingShouldWaitForRunningTasks() {
	final JobGraph jobGraph = singleJobVertexJobGraph(2);
	final JobID jobid = jobGraph.getJobID();
	final DefaultScheduler scheduler = createSchedulerAndStartScheduling(jobGraph);
	final SchedulingTopology topology = scheduler.getSchedulingTopology();

	final Iterator<ArchivedExecutionVertex> vertexIterator = scheduler.requestJob().getAllExecutionVertices().iterator();
	final ExecutionAttemptID attemptId1 = vertexIterator.next().getCurrentExecutionAttempt().getAttemptId();
	final ExecutionAttemptID attemptId2 = vertexIterator.next().getCurrentExecutionAttempt().getAttemptId();
	final ExecutionVertexID executionVertex2 = scheduler.getExecutionVertexIdOrThrow(attemptId2);

	scheduler.updateTaskExecutionState(new TaskExecutionState(jobid, attemptId1, ExecutionState.FAILED, new RuntimeException("expected")));
	scheduler.cancel();
	final ExecutionState vertex2StateAfterCancel = topology.getVertex(executionVertex2).getState();
	final JobStatus statusAfterCancelWhileRestarting = scheduler.requestJobStatus();
	scheduler.updateTaskExecutionState(new TaskExecutionState(jobid, attemptId2, ExecutionState.CANCELED, new RuntimeException("expected")));

	assertThat(vertex2StateAfterCancel, is(equalTo(ExecutionState.CANCELING)));
	assertThat(statusAfterCancelWhileRestarting, is(equalTo(JobStatus.CANCELLING)));
	assertThat(scheduler.requestJobStatus(), is(equalTo(JobStatus.CANCELED)));
}
 
Example 4
Source File: DefaultSchedulerTest.java    From flink with Apache License 2.0
@Test
public void jobStatusIsRestartingIfOneVertexIsWaitingForRestart() {
	final JobGraph jobGraph = singleJobVertexJobGraph(2);
	final JobID jobId = jobGraph.getJobID();
	final DefaultScheduler scheduler = createSchedulerAndStartScheduling(jobGraph);

	final Iterator<ArchivedExecutionVertex> vertexIterator = scheduler.requestJob().getAllExecutionVertices().iterator();
	final ExecutionAttemptID attemptId1 = vertexIterator.next().getCurrentExecutionAttempt().getAttemptId();
	final ExecutionAttemptID attemptId2 = vertexIterator.next().getCurrentExecutionAttempt().getAttemptId();

	scheduler.updateTaskExecutionState(new TaskExecutionState(jobId, attemptId1, ExecutionState.FAILED, new RuntimeException("expected")));
	final JobStatus jobStatusAfterFirstFailure = scheduler.requestJobStatus();
	scheduler.updateTaskExecutionState(new TaskExecutionState(jobId, attemptId2, ExecutionState.FAILED, new RuntimeException("expected")));

	taskRestartExecutor.triggerNonPeriodicScheduledTask();
	final JobStatus jobStatusWithPendingRestarts = scheduler.requestJobStatus();
	taskRestartExecutor.triggerNonPeriodicScheduledTask();
	final JobStatus jobStatusAfterRestarts = scheduler.requestJobStatus();

	assertThat(jobStatusAfterFirstFailure, equalTo(JobStatus.RESTARTING));
	assertThat(jobStatusWithPendingRestarts, equalTo(JobStatus.RESTARTING));
	assertThat(jobStatusAfterRestarts, equalTo(JobStatus.RUNNING));
}
 
Example 5
Source File: AdaptedRestartPipelinedRegionStrategyNGFailoverTest.java    From flink with Apache License 2.0
private ExecutionGraph createExecutionGraph(
		final JobGraph jobGraph,
		final RestartStrategy restartStrategy) throws Exception {

	final PartitionTracker partitionTracker = new PartitionTrackerImpl(
		jobGraph.getJobID(),
		NettyShuffleMaster.INSTANCE,
		ignored -> Optional.empty());

	final ExecutionGraph eg = new ExecutionGraphTestUtils.TestingExecutionGraphBuilder(jobGraph)
		.setRestartStrategy(restartStrategy)
		.setFailoverStrategyFactory(TestAdaptedRestartPipelinedRegionStrategyNG::new)
		.setPartitionTracker(partitionTracker)
		.build();

	eg.start(componentMainThreadExecutor);
	eg.scheduleForExecution();
	manualMainThreadExecutor.triggerAll();

	return eg;
}
 
Example 6
Source File: SavepointReaderKeyedStateITCase.java    From flink with Apache License 2.0
private String takeSavepoint(JobGraph jobGraph) throws Exception {
	SavepointSource.initializeForTest();

	ClusterClient<?> client = miniClusterResource.getClusterClient();

	JobID jobId = jobGraph.getJobID();

	Deadline deadline = Deadline.fromNow(Duration.ofMinutes(5));

	String dirPath = getTempDirPath(new AbstractID().toHexString());

	try {
		client.setDetached(true);
		JobSubmissionResult result = client.submitJob(jobGraph, getClass().getClassLoader());

		boolean finished = false;
		while (deadline.hasTimeLeft()) {
			if (SavepointSource.isFinished()) {
				finished = true;

				break;
			}

			// sleep briefly so the polling loop does not busy-spin on the CPU
			try {
				Thread.sleep(2L);
			} catch (InterruptedException ignored) {
				Thread.currentThread().interrupt();
			}
		}

		if (!finished) {
			Assert.fail("Failed to initialize state within deadline");
		}

		CompletableFuture<String> path = client.triggerSavepoint(result.getJobID(), dirPath);
		return path.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
	} finally {
		client.cancel(jobId);
	}
}
 
Example 7
Source File: SavepointITCase.java    From flink with Apache License 2.0
private void restoreJobAndVerifyState(String savepointPath, MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
	final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
	jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
	final JobID jobId = jobGraph.getJobID();
	StatefulCounter.resetForTest(parallelism);

	MiniClusterWithClientResource cluster = clusterFactory.get();
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {
		client.setDetached(true);
		client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

		// Await state is restored
		StatefulCounter.getRestoreLatch().await();

		// Await some progress after restore
		StatefulCounter.getProgressLatch().await();

		client.cancel(jobId);

		FutureUtils.retrySuccessfulWithDelay(
			() -> client.getJobStatus(jobId),
			Time.milliseconds(50),
			Deadline.now().plus(Duration.ofSeconds(30)),
			status -> status == JobStatus.CANCELED,
			TestingUtils.defaultScheduledExecutor()
		);

		client.disposeSavepoint(savepointPath)
			.get();

		assertFalse("Savepoint not properly cleaned up.", new File(savepointPath).exists());
	} finally {
		cluster.after();
		StatefulCounter.resetForTest(parallelism);
	}
}
 
Example 8
Source File: SavepointReaderITTestBase.java    From flink with Apache License 2.0
private String takeSavepoint(JobGraph jobGraph) throws Exception {
	SavepointSource.initializeForTest();

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	JobID jobId = jobGraph.getJobID();

	Deadline deadline = Deadline.fromNow(Duration.ofMinutes(5));

	String dirPath = getTempDirPath(new AbstractID().toHexString());

	try {
		JobSubmissionResult result = ClientUtils.submitJob(client, jobGraph);

		boolean finished = false;
		while (deadline.hasTimeLeft()) {
			if (SavepointSource.isFinished()) {
				finished = true;

				break;
			}

			try {
				Thread.sleep(2L);
			} catch (InterruptedException ignored) {
				Thread.currentThread().interrupt();
			}
		}

		if (!finished) {
			Assert.fail("Failed to initialize state within deadline");
		}

		CompletableFuture<String> path = client.triggerSavepoint(result.getJobID(), dirPath);
		return path.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
	} finally {
		client.cancel(jobId).get();
	}
}
 
Example 9
Source File: MiniCluster.java    From Flink-CEPplus with Apache License 2.0
/**
 * This method executes a job in detached mode. The method returns immediately after the job
 * has been added to the cluster; it does not wait for the job to finish.
 *
 * @param job  The Flink job to execute
 *
 * @throws JobExecutionException Thrown if anything went amiss during initial job launch,
 *         or if the job terminally failed.
 */
public void runDetached(JobGraph job) throws JobExecutionException, InterruptedException {
	checkNotNull(job, "job is null");

	final CompletableFuture<JobSubmissionResult> submissionFuture = submitJob(job);

	try {
		submissionFuture.get();
	} catch (ExecutionException e) {
		throw new JobExecutionException(job.getJobID(), ExceptionUtils.stripExecutionException(e));
	}
}
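The snippet below is a hedged usage sketch for runDetached, not taken from the project sources; the MiniClusterConfiguration and the helper that populates the JobGraph are assumed to exist elsewhere:

// A minimal sketch, assuming miniClusterConfiguration is prepared elsewhere.
MiniCluster miniCluster = new MiniCluster(miniClusterConfiguration);
miniCluster.start();

JobGraph job = buildJobGraph();  // hypothetical helper that adds the job's vertices
miniCluster.runDetached(job);

// runDetached() returns without waiting for completion, so keep the JobID
// to query the job's status or cancel it later.
JobID jobId = job.getJobID();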
 
Example 10
Source File: TestingJobManagerRunnerFactory.java    From flink with Apache License 2.0
@Nonnull
private TestingJobManagerRunner createTestingJobManagerRunner(JobGraph jobGraph) {
	final boolean blockingTermination;

	if (numBlockingJobManagerRunners > 0) {
		numBlockingJobManagerRunners--;
		blockingTermination = true;
	} else {
		blockingTermination = false;
	}

	return new TestingJobManagerRunner(jobGraph.getJobID(), blockingTermination);
}
 
Example 11
Source File: JobManagerRunner.java    From Flink-CEPplus with Apache License 2.0
/**
 * Exceptions that occur while creating the JobManager or JobManagerRunner are directly
 * thrown and not reported to the given {@code FatalErrorHandler}.
 *
 * @throws Exception Thrown if the runner cannot be set up, because either one of the
 *                   required services could not be started, or the Job could not be initialized.
 */
public JobManagerRunner(
		final JobGraph jobGraph,
		final JobMasterServiceFactory jobMasterFactory,
		final HighAvailabilityServices haServices,
		final LibraryCacheManager libraryCacheManager,
		final Executor executor,
		final FatalErrorHandler fatalErrorHandler) throws Exception {

	this.resultFuture = new CompletableFuture<>();
	this.terminationFuture = new CompletableFuture<>();
	this.leadershipOperation = CompletableFuture.completedFuture(null);

	// make sure we cleanly shut down our JobManager services if initialization fails
	try {
		this.jobGraph = checkNotNull(jobGraph);
		this.libraryCacheManager = checkNotNull(libraryCacheManager);
		this.executor = checkNotNull(executor);
		this.fatalErrorHandler = checkNotNull(fatalErrorHandler);

		checkArgument(jobGraph.getNumberOfVertices() > 0, "The given job is empty");

		// libraries and class loader first
		try {
			libraryCacheManager.registerJob(
					jobGraph.getJobID(), jobGraph.getUserJarBlobKeys(), jobGraph.getClasspaths());
		} catch (IOException e) {
			throw new Exception("Cannot set up the user code libraries: " + e.getMessage(), e);
		}

		final ClassLoader userCodeLoader = libraryCacheManager.getClassLoader(jobGraph.getJobID());
		if (userCodeLoader == null) {
			throw new Exception("The user code class loader could not be initialized.");
		}

		// high availability services next
		this.runningJobsRegistry = haServices.getRunningJobsRegistry();
		this.leaderElectionService = haServices.getJobManagerLeaderElectionService(jobGraph.getJobID());

		this.leaderGatewayFuture = new CompletableFuture<>();

		// now start the JobManager
		this.jobMasterService = jobMasterFactory.createJobMasterService(jobGraph, this, userCodeLoader);
	}
	catch (Throwable t) {
		terminationFuture.completeExceptionally(t);
		resultFuture.completeExceptionally(t);

		throw new JobExecutionException(jobGraph.getJobID(), "Could not set up JobManager", t);
	}
}
 
Example 12
Source File: ZooKeeperHADispatcherTest.java    From Flink-CEPplus with Apache License 2.0
/**
 * Tests that the {@link Dispatcher} releases a locked {@link SubmittedJobGraph} if it
 * lost the leadership.
 */
@Test
public void testSubmittedJobGraphRelease() throws Exception {
	final CuratorFramework client = ZooKeeperUtils.startCuratorFramework(configuration);
	final CuratorFramework otherClient = ZooKeeperUtils.startCuratorFramework(configuration);

	try (final TestingHighAvailabilityServices testingHighAvailabilityServices = new TestingHighAvailabilityServices()) {
		testingHighAvailabilityServices.setSubmittedJobGraphStore(ZooKeeperUtils.createSubmittedJobGraphs(client, configuration));

		final ZooKeeperSubmittedJobGraphStore otherSubmittedJobGraphStore = ZooKeeperUtils.createSubmittedJobGraphs(
			otherClient,
			configuration);

		otherSubmittedJobGraphStore.start(NoOpSubmittedJobGraphListener.INSTANCE);

		final TestingLeaderElectionService leaderElectionService = new TestingLeaderElectionService();
		testingHighAvailabilityServices.setDispatcherLeaderElectionService(leaderElectionService);

		final TestingDispatcher dispatcher = createDispatcher(
			testingHighAvailabilityServices,
			new TestingJobManagerRunnerFactory(new CompletableFuture<>(), new CompletableFuture<>(), CompletableFuture.completedFuture(null)));

		dispatcher.start();

		try {
			final DispatcherId expectedLeaderId = DispatcherId.generate();
			leaderElectionService.isLeader(expectedLeaderId.toUUID()).get();

			final DispatcherGateway dispatcherGateway = dispatcher.getSelfGateway(DispatcherGateway.class);

			final JobGraph nonEmptyJobGraph = DispatcherHATest.createNonEmptyJobGraph();
			final CompletableFuture<Acknowledge> submissionFuture = dispatcherGateway.submitJob(nonEmptyJobGraph, TIMEOUT);
			submissionFuture.get();

			Collection<JobID> jobIds = otherSubmittedJobGraphStore.getJobIds();

			final JobID jobId = nonEmptyJobGraph.getJobID();
			assertThat(jobIds, Matchers.contains(jobId));

			leaderElectionService.notLeader();

			// wait for the job to properly terminate
			final CompletableFuture<Void> jobTerminationFuture = dispatcher.getJobTerminationFuture(jobId, TIMEOUT);
			jobTerminationFuture.get();

			// recover the job
			final SubmittedJobGraph submittedJobGraph = otherSubmittedJobGraphStore.recoverJobGraph(jobId);

			assertThat(submittedJobGraph, is(notNullValue()));

			// check that the other submitted job graph store can remove the job graph after the original leader
			// has lost its leadership
			otherSubmittedJobGraphStore.removeJobGraph(jobId);

			jobIds = otherSubmittedJobGraphStore.getJobIds();

			assertThat(jobIds, Matchers.not(Matchers.contains(jobId)));
		} finally {
			RpcUtils.terminateRpcEndpoint(dispatcher, TIMEOUT);
			client.close();
			otherClient.close();
		}
	}
}
 
Example 13
Source File: JobMasterTest.java    From Flink-CEPplus with Apache License 2.0
/**
 * Tests that an existing checkpoint takes precedence over a savepoint.
 */
@Test
public void testCheckpointPrecedesSavepointRecovery() throws Exception {

	// create savepoint data
	final long savepointId = 42L;
	final File savepointFile = createSavepoint(savepointId);

	// set savepoint settings
	final SavepointRestoreSettings savepointRestoreSettings = SavepointRestoreSettings.forPath(
		savepointFile.getAbsolutePath(),
		true);
	final JobGraph jobGraph = createJobGraphWithCheckpointing(savepointRestoreSettings);

	final long checkpointId = 1L;

	final CompletedCheckpoint completedCheckpoint = new CompletedCheckpoint(
		jobGraph.getJobID(),
		checkpointId,
		1L,
		1L,
		Collections.emptyMap(),
		null,
		CheckpointProperties.forCheckpoint(CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION),
		new DummyCheckpointStorageLocation());

	final StandaloneCompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
	completedCheckpointStore.addCheckpoint(completedCheckpoint);
	final TestingCheckpointRecoveryFactory testingCheckpointRecoveryFactory = new TestingCheckpointRecoveryFactory(completedCheckpointStore, new StandaloneCheckpointIDCounter());
	haServices.setCheckpointRecoveryFactory(testingCheckpointRecoveryFactory);

	final JobMaster jobMaster = createJobMaster(
		configuration,
		jobGraph,
		haServices,
		new TestingJobManagerSharedServicesBuilder().build());

	try {
		// starting the JobMaster should have read the savepoint
		final CompletedCheckpoint savepointCheckpoint = completedCheckpointStore.getLatestCheckpoint();

		assertThat(savepointCheckpoint, Matchers.notNullValue());

		assertThat(savepointCheckpoint.getCheckpointID(), is(checkpointId));
	} finally {
		RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
	}
}
 
Example 14
Source File: DispatcherHATest.java    From flink with Apache License 2.0
/**
 * Tests that a Dispatcher does not remove the JobGraph from the submitted job graph store
 * when losing leadership and recovers it when regaining leadership.
 */
@Test
public void testJobRecoveryWhenChangingLeadership() throws Exception {
	final InMemorySubmittedJobGraphStore submittedJobGraphStore = new InMemorySubmittedJobGraphStore();

	final CompletableFuture<JobID> recoveredJobFuture = new CompletableFuture<>();
	submittedJobGraphStore.setRecoverJobGraphFunction((jobID, jobIDSubmittedJobGraphMap) -> {
		recoveredJobFuture.complete(jobID);
		return jobIDSubmittedJobGraphMap.get(jobID);
	});

	final TestingLeaderElectionService leaderElectionService = new TestingLeaderElectionService();

	final TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServicesBuilder()
		.setSubmittedJobGraphStore(submittedJobGraphStore)
		.setDispatcherLeaderElectionService(leaderElectionService)
		.build();

	final ArrayBlockingQueue<DispatcherId> fencingTokens = new ArrayBlockingQueue<>(2);
	final HATestingDispatcher dispatcher = createDispatcherWithObservableFencingTokens(
		highAvailabilityServices,
		fencingTokens);

	dispatcher.start();

	try {
		// grant leadership and submit a single job
		final DispatcherId expectedDispatcherId = DispatcherId.generate();
		leaderElectionService.isLeader(expectedDispatcherId.toUUID()).get();

		assertThat(fencingTokens.take(), is(equalTo(expectedDispatcherId)));

		final DispatcherGateway dispatcherGateway = dispatcher.getSelfGateway(DispatcherGateway.class);

		final JobGraph jobGraph = createNonEmptyJobGraph();
		final CompletableFuture<Acknowledge> submissionFuture = dispatcherGateway.submitJob(jobGraph, timeout);

		submissionFuture.get();

		final JobID jobId = jobGraph.getJobID();
		assertThat(submittedJobGraphStore.contains(jobId), is(true));

		// revoke the leadership --> this should stop all running JobManagerRunners
		leaderElectionService.notLeader();

		assertThat(fencingTokens.take(), is(equalTo(NULL_FENCING_TOKEN)));

		assertThat(submittedJobGraphStore.contains(jobId), is(true));

		assertThat(recoveredJobFuture.isDone(), is(false));

		// re-grant leadership
		leaderElectionService.isLeader(DispatcherId.generate().toUUID());

		assertThat(recoveredJobFuture.get(), is(equalTo(jobId)));
	} finally {
		RpcUtils.terminateRpcEndpoint(dispatcher, timeout);
	}
}
 
Example 15
Source File: PackagedProgramUtils.java    From flink with Apache License 2.0
/**
 * Creates a {@link JobGraph} with a specified {@link JobID}
 * from the given {@link PackagedProgram}.
 *
 * @param packagedProgram to extract the JobGraph from
 * @param configuration to use for the optimizer and job graph generator
 * @param defaultParallelism for the JobGraph
 * @param jobID the pre-generated job id
 * @return JobGraph extracted from the PackagedProgram
 * @throws ProgramInvocationException if the JobGraph generation failed
 */
public static JobGraph createJobGraph(
		PackagedProgram packagedProgram,
		Configuration configuration,
		int defaultParallelism,
		@Nullable JobID jobID) throws ProgramInvocationException {
	Thread.currentThread().setContextClassLoader(packagedProgram.getUserCodeClassLoader());
	final Optimizer optimizer = new Optimizer(new DataStatistics(), new DefaultCostEstimator(), configuration);
	final FlinkPlan flinkPlan;

	if (packagedProgram.isUsingProgramEntryPoint()) {

		final JobWithJars jobWithJars = packagedProgram.getPlanWithJars();

		final Plan plan = jobWithJars.getPlan();

		if (plan.getDefaultParallelism() <= 0) {
			plan.setDefaultParallelism(defaultParallelism);
		}

		flinkPlan = optimizer.compile(jobWithJars.getPlan());
	} else if (packagedProgram.isUsingInteractiveMode()) {
		final OptimizerPlanEnvironment optimizerPlanEnvironment = new OptimizerPlanEnvironment(optimizer);

		optimizerPlanEnvironment.setParallelism(defaultParallelism);

		flinkPlan = optimizerPlanEnvironment.getOptimizedPlan(packagedProgram);
	} else {
		throw new ProgramInvocationException("PackagedProgram does not have a valid invocation mode.");
	}

	final JobGraph jobGraph;

	if (flinkPlan instanceof StreamingPlan) {
		jobGraph = ((StreamingPlan) flinkPlan).getJobGraph(jobID);
		jobGraph.setSavepointRestoreSettings(packagedProgram.getSavepointSettings());
	} else {
		final JobGraphGenerator jobGraphGenerator = new JobGraphGenerator(configuration);
		jobGraph = jobGraphGenerator.compileJobGraph((OptimizedPlan) flinkPlan, jobID);
	}

	for (URL url : packagedProgram.getAllLibraries()) {
		try {
			jobGraph.addJar(new Path(url.toURI()));
		} catch (URISyntaxException e) {
			throw new ProgramInvocationException("Invalid URL for jar file: " + url + '.', jobGraph.getJobID(), e);
		}
	}

	jobGraph.setClasspaths(packagedProgram.getClasspaths());

	return jobGraph;
}
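A hedged sketch of calling createJobGraph; the jar path and parallelism are illustrative, and the File-based PackagedProgram constructor is assumed to match this Flink version:

// Hypothetical user jar; PackagedProgram resolves its entry point and libraries.
PackagedProgram program = new PackagedProgram(new File("/path/to/user-job.jar"));
Configuration configuration = new Configuration();

// Pre-generate an id so the caller can track the job before submission.
JobID preGeneratedJobId = JobID.generate();
JobGraph jobGraph = PackagedProgramUtils.createJobGraph(
	program, configuration, 4, preGeneratedJobId);

// The resulting graph carries the id that was passed in:
// jobGraph.getJobID().equals(preGeneratedJobId)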
 
Example 16
Source File: RescalingITCase.java    From flink with Apache License 2.0
/**
 * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
 * rescaled operator has non-partitioned state.
 */
@Test
public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
	final int parallelism = numSlots / 2;
	final int parallelism2 = numSlots;
	final int maxParallelism = 13;

	Duration timeout = Duration.ofMinutes(3);
	Deadline deadline = Deadline.now().plus(timeout);

	ClusterClient<?> client = cluster.getClusterClient();

	try {
		JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

		final JobID jobID = jobGraph.getJobID();

		client.setDetached(true);
		client.submitJob(jobGraph, RescalingITCase.class.getClassLoader());

		// wait until the operator is started
		StateSourceBase.workStartedLatch.await();

		CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null);

		final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

		client.cancel(jobID);

		while (!getRunningJobs(client).isEmpty()) {
			Thread.sleep(50);
		}

		// job successfully removed
		JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

		scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));

		client.setDetached(false);
		client.submitJob(scaledJobGraph, RescalingITCase.class.getClassLoader());
	} catch (JobExecutionException exception) {
		if (exception.getCause() instanceof IllegalStateException) {
			// expected: an IllegalStateException wrapped in a JobExecutionException,
			// because the job containing non-partitioned state is being rescaled
		} else {
			throw exception;
		}
	}
}