Java Code Examples for org.apache.flink.runtime.jobgraph.JobGraph#setSavepointRestoreSettings()

The following examples show how to use org.apache.flink.runtime.jobgraph.JobGraph#setSavepointRestoreSettings(). Each snippet is taken verbatim from an open source project; the source file and license are noted above each example.
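As a quick orientation before the project snippets, here is a minimal sketch of the typical call pattern, under stated assumptions: translate a streaming program into a JobGraph, attach SavepointRestoreSettings pointing at an existing savepoint, and hand the graph to a client for submission. The savepoint path and the trivial topology are placeholders, not taken from any of the projects below.

import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.SavepointRestoreSettings;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class SavepointRestoreSketch {

	public static void main(String[] args) throws Exception {
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

		// A trivial placeholder topology; any streaming program works here.
		env.fromElements(1, 2, 3).print();

		// Translate the program into a JobGraph, as several examples below do.
		JobGraph jobGraph = env.getStreamGraph().getJobGraph();

		// Point the job at an existing savepoint. The second argument controls
		// whether state that cannot be mapped onto the new job ("non-restored
		// state") is tolerated. The path below is a placeholder.
		jobGraph.setSavepointRestoreSettings(
			SavepointRestoreSettings.forPath("file:///tmp/savepoints/savepoint-abc123", false));

		// The graph would then be submitted through a ClusterClient or MiniCluster,
		// as the examples below demonstrate.
	}
}

SavepointRestoreSettings.forPath also has a single-argument overload that defaults allowNonRestoredState to false, which several of the examples below use.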
Example 1
Source File: ClusterClient.java    From flink with Apache License 2.0
public static JobGraph getJobGraph(Configuration flinkConfig, FlinkPlan optPlan, List<URL> jarFiles, List<URL> classpaths, SavepointRestoreSettings savepointSettings) {
	JobGraph job;
	if (optPlan instanceof StreamingPlan) {
		job = ((StreamingPlan) optPlan).getJobGraph();
		job.setSavepointRestoreSettings(savepointSettings);
	} else {
		JobGraphGenerator gen = new JobGraphGenerator(flinkConfig);
		job = gen.compileJobGraph((OptimizedPlan) optPlan);
	}

	for (URL jar : jarFiles) {
		try {
			job.addJar(new Path(jar.toURI()));
		} catch (URISyntaxException e) {
			throw new RuntimeException("URL is invalid. This should not happen.", e);
		}
	}

	job.setClasspaths(classpaths);

	return job;
}
 
Example 2
Source File: ClassPathJobGraphRetriever.java    From flink with Apache License 2.0
@Override
public JobGraph retrieveJobGraph(Configuration configuration) throws FlinkException {
	final PackagedProgram packagedProgram = createPackagedProgram();
	final int defaultParallelism = configuration.getInteger(CoreOptions.DEFAULT_PARALLELISM);
	try {
		final JobGraph jobGraph = PackagedProgramUtils.createJobGraph(
			packagedProgram,
			configuration,
			defaultParallelism,
			jobId);
		jobGraph.setAllowQueuedScheduling(true);
		jobGraph.setSavepointRestoreSettings(savepointRestoreSettings);

		return jobGraph;
	} catch (Exception e) {
		throw new FlinkException("Could not create the JobGraph from the provided user code jar.", e);
	}
}
 
Example 3
Source File: JobMasterTest.java    From flink with Apache License 2.0
@Nonnull
private JobGraph createJobGraphFromJobVerticesWithCheckpointing(SavepointRestoreSettings savepointRestoreSettings, JobVertex... jobVertices) {
	final JobGraph jobGraph = new JobGraph(jobVertices);

	// enable checkpointing which is required to resume from a savepoint
	final CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration = new CheckpointCoordinatorConfiguration(
		1000L,
		1000L,
		1000L,
		1,
		CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
		true,
		false,
		0);
	final JobCheckpointingSettings checkpointingSettings = new JobCheckpointingSettings(
		Collections.emptyList(),
		Collections.emptyList(),
		Collections.emptyList(),
		checkpointCoordinatorConfiguration,
		null);
	jobGraph.setSnapshotSettings(checkpointingSettings);
	jobGraph.setSavepointRestoreSettings(savepointRestoreSettings);

	return jobGraph;
}
 
Example 4
Source File: DFCusterClient.java    From df_data_service with Apache License 2.0
private JobGraph getJobGraph(FlinkPlan optPlan, List<URL> jarFiles, List<URL> classpaths, SavepointRestoreSettings savepointSettings) {
	JobGraph job;
	if (optPlan instanceof StreamingPlan) {
		job = ((StreamingPlan) optPlan).getJobGraph();
		job.setSavepointRestoreSettings(savepointSettings);
	} else {
		JobGraphGenerator gen = new JobGraphGenerator(this.flinkConfig);
		job = gen.compileJobGraph((OptimizedPlan) optPlan);
	}

	for (URL jar : jarFiles) {
		try {
			job.addJar(new Path(jar.toURI()));
		} catch (URISyntaxException e) {
			throw new RuntimeException("URL is invalid. This should not happen.", e);
		}
	}

	job.setClasspaths(classpaths);

	return job;
}
 
Example 5
Source File: AbstractOperatorRestoreTestBase.java    From flink with Apache License 2.0
private void restoreJob(ClusterClient<?> clusterClient, Deadline deadline, String savepointPath) throws Exception {
	JobGraph jobToRestore = createJobGraph(ExecutionMode.RESTORE);
	jobToRestore.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, allowNonRestoredState));

	assertNotNull("Job doesn't have a JobID.", jobToRestore.getJobID());

	ClientUtils.submitJob(clusterClient, jobToRestore);

	CompletableFuture<JobStatus> jobStatusFuture = FutureUtils.retrySuccessfulWithDelay(
		() -> clusterClient.getJobStatus(jobToRestore.getJobID()),
		Time.milliseconds(50),
		deadline,
		(jobStatus) -> jobStatus == JobStatus.FINISHED,
		TestingUtils.defaultScheduledExecutor());
	assertEquals(
		JobStatus.FINISHED,
		jobStatusFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
}
 
Example 6
Source File: JobMasterTest.java    From Flink-CEPplus with Apache License 2.0
@Nonnull
private JobGraph createJobGraphFromJobVerticesWithCheckpointing(SavepointRestoreSettings savepointRestoreSettings, JobVertex... jobVertices) {
	final JobGraph jobGraph = new JobGraph(jobVertices);

	// enable checkpointing which is required to resume from a savepoint
	final CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration = new CheckpointCoordinatorConfiguration(
		1000L,
		1000L,
		1000L,
		1,
		CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
		true);
	final JobCheckpointingSettings checkpointingSettings = new JobCheckpointingSettings(
		Collections.emptyList(),
		Collections.emptyList(),
		Collections.emptyList(),
		checkpointCoordinatorConfiguration,
		null);
	jobGraph.setSnapshotSettings(checkpointingSettings);
	jobGraph.setSavepointRestoreSettings(savepointRestoreSettings);

	return jobGraph;
}
 
Example 7
Source File: ClassPathJobGraphRetriever.java    From Flink-CEPplus with Apache License 2.0
@Override
public JobGraph retrieveJobGraph(Configuration configuration) throws FlinkException {
	final PackagedProgram packagedProgram = createPackagedProgram();
	final int defaultParallelism = configuration.getInteger(CoreOptions.DEFAULT_PARALLELISM);
	try {
		final JobGraph jobGraph = PackagedProgramUtils.createJobGraph(
			packagedProgram,
			configuration,
			defaultParallelism,
			jobId);
		jobGraph.setAllowQueuedScheduling(true);
		jobGraph.setSavepointRestoreSettings(savepointRestoreSettings);

		return jobGraph;
	} catch (Exception e) {
		throw new FlinkException("Could not create the JobGraph from the provided user code jar.", e);
	}
}
 
Example 8
Source File: PipelineExecutorUtils.java    From flink with Apache License 2.0
/**
 * Creates the {@link JobGraph} corresponding to the provided {@link Pipeline}.
 *
 * @param pipeline the pipeline whose job graph we are computing
 * @param configuration the configuration with the necessary information such as jars and
 *                         classpaths to be included, the parallelism of the job, and potential
 *                         savepoint settings used to bootstrap its state.
 * @return the corresponding {@link JobGraph}.
 */
public static JobGraph getJobGraph(@Nonnull final Pipeline pipeline, @Nonnull final Configuration configuration) throws MalformedURLException {
	checkNotNull(pipeline);
	checkNotNull(configuration);

	final ExecutionConfigAccessor executionConfigAccessor = ExecutionConfigAccessor.fromConfiguration(configuration);
	final JobGraph jobGraph = FlinkPipelineTranslationUtil
			.getJobGraph(pipeline, configuration, executionConfigAccessor.getParallelism());

	configuration
			.getOptional(PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID)
			.ifPresent(strJobID -> jobGraph.setJobID(JobID.fromHexString(strJobID)));

	jobGraph.addJars(executionConfigAccessor.getJars());
	jobGraph.setClasspaths(executionConfigAccessor.getClasspaths());
	jobGraph.setSavepointRestoreSettings(executionConfigAccessor.getSavepointRestoreSettings());

	return jobGraph;
}
 
Example 9
Source File: StatefulFunctionsJobGraphRetriever.java    From stateful-functions with Apache License 2.0
@Override
public JobGraph retrieveJobGraph(Configuration configuration) throws FlinkException {
  final PackagedProgram packagedProgram = createPackagedProgram();

  final int defaultParallelism = configuration.getInteger(CoreOptions.DEFAULT_PARALLELISM);
  try {
    final JobGraph jobGraph =
        PackagedProgramUtils.createJobGraph(
            packagedProgram, configuration, defaultParallelism, jobId, false);
    jobGraph.setSavepointRestoreSettings(savepointRestoreSettings);

    return jobGraph;
  } catch (Exception e) {
    throw new FlinkException("Could not create the JobGraph from the provided user code jar.", e);
  }
}
 
Example 10
Source File: AbstractOperatorRestoreTestBase.java    From Flink-CEPplus with Apache License 2.0
private void restoreJob(ClassLoader classLoader, ClusterClient<?> clusterClient, Deadline deadline, String savepointPath) throws Exception {
	JobGraph jobToRestore = createJobGraph(ExecutionMode.RESTORE);
	jobToRestore.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, allowNonRestoredState));

	assertNotNull("Job doesn't have a JobID.", jobToRestore.getJobID());

	clusterClient.submitJob(jobToRestore, classLoader);

	CompletableFuture<JobStatus> jobStatusFuture = FutureUtils.retrySuccessfulWithDelay(
		() -> clusterClient.getJobStatus(jobToRestore.getJobID()),
		Time.milliseconds(50),
		deadline,
		(jobStatus) -> jobStatus == JobStatus.FINISHED,
		TestingUtils.defaultScheduledExecutor());
	assertEquals(
		JobStatus.FINISHED,
		jobStatusFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
}
 
Example 11
Source File: SavepointITCase.java    From Flink-CEPplus with Apache License 2.0
private void restoreJobAndVerifyState(String savepointPath, MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
	final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
	jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
	final JobID jobId = jobGraph.getJobID();
	StatefulCounter.resetForTest(parallelism);

	MiniClusterWithClientResource cluster = clusterFactory.get();
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {
		client.setDetached(true);
		client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

		// Await state is restored
		StatefulCounter.getRestoreLatch().await();

		// Await some progress after restore
		StatefulCounter.getProgressLatch().await();

		client.cancel(jobId);

		FutureUtils.retrySuccessfulWithDelay(
			() -> client.getJobStatus(jobId),
			Time.milliseconds(50),
			Deadline.now().plus(Duration.ofSeconds(30)),
			status -> status == JobStatus.CANCELED,
			TestingUtils.defaultScheduledExecutor()
		);

		client.disposeSavepoint(savepointPath)
			.get();

		assertFalse("Savepoint not properly cleaned up.", new File(savepointPath).exists());
	} finally {
		cluster.after();
		StatefulCounter.resetForTest(parallelism);
	}
}
 
Example 12
Source File: FlinkRequiresStableInputTest.java    From beam with Apache License 2.0
private JobID restoreFromSavepoint(Pipeline pipeline, String savepointDir)
    throws ExecutionException, InterruptedException {
  JobGraph jobGraph = getJobGraph(pipeline);
  SavepointRestoreSettings savepointSettings = SavepointRestoreSettings.forPath(savepointDir);
  jobGraph.setSavepointRestoreSettings(savepointSettings);
  return flinkCluster.submitJob(jobGraph).get().getJobID();
}
 
Example 13
Source File: SavepointITCase.java    From flink with Apache License 2.0
private void restoreJobAndVerifyState(String savepointPath, MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
	final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
	jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
	final JobID jobId = jobGraph.getJobID();
	StatefulCounter.resetForTest(parallelism);

	MiniClusterWithClientResource cluster = clusterFactory.get();
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {
		client.setDetached(true);
		client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

		// Await state is restored
		StatefulCounter.getRestoreLatch().await();

		// Await some progress after restore
		StatefulCounter.getProgressLatch().await();

		client.cancel(jobId);

		FutureUtils.retrySuccessfulWithDelay(
			() -> client.getJobStatus(jobId),
			Time.milliseconds(50),
			Deadline.now().plus(Duration.ofSeconds(30)),
			status -> status == JobStatus.CANCELED,
			TestingUtils.defaultScheduledExecutor()
		);

		client.disposeSavepoint(savepointPath)
			.get();

		assertFalse("Savepoint not properly cleaned up.", new File(savepointPath).exists());
	} finally {
		cluster.after();
		StatefulCounter.resetForTest(parallelism);
	}
}
 
Example 14
Source File: SavepointWriterITCase.java    From flink with Apache License 2.0
private void validateModification(String savepointPath) throws ProgramInvocationException {
	StreamExecutionEnvironment sEnv = StreamExecutionEnvironment.getExecutionEnvironment();
	sEnv.setStateBackend(backend);

	CollectSink.accountList.clear();

	DataStream<Account> stream = sEnv.fromCollection(accounts)
		.keyBy(acc -> acc.id)
		.flatMap(new UpdateAndGetAccount())
		.uid(ACCOUNT_UID);

	stream.addSink(new CollectSink());

	stream
		.map(acc -> acc.id)
		.map(new StatefulOperator())
		.uid(MODIFY_UID)
		.addSink(new DiscardingSink<>());

	JobGraph jobGraph = sEnv.getStreamGraph().getJobGraph();
	jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, false));

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	client.submitJob(jobGraph, SavepointWriterITCase.class.getClassLoader());

	Assert.assertEquals("Unexpected output", 3, CollectSink.accountList.size());
}
 
Example 15
Source File: JobMasterTest.java    From Flink-CEPplus with Apache License 2.0
/**
 * Tests that a JobMaster will only restore a modified JobGraph if non
 * restored state is allowed.
 */
@Test
public void testRestoringModifiedJobFromSavepoint() throws Exception {

	// create savepoint data
	final long savepointId = 42L;
	final OperatorID operatorID = new OperatorID();
	final File savepointFile = createSavepointWithOperatorState(savepointId, operatorID);

	// set savepoint settings which don't allow non restored state
	final SavepointRestoreSettings savepointRestoreSettings = SavepointRestoreSettings.forPath(
		savepointFile.getAbsolutePath(),
		false);

	// create a new operator
	final JobVertex jobVertex = new JobVertex("New operator");
	jobVertex.setInvokableClass(NoOpInvokable.class);
	final JobGraph jobGraphWithNewOperator = createJobGraphFromJobVerticesWithCheckpointing(savepointRestoreSettings, jobVertex);

	final StandaloneCompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
	final TestingCheckpointRecoveryFactory testingCheckpointRecoveryFactory = new TestingCheckpointRecoveryFactory(completedCheckpointStore, new StandaloneCheckpointIDCounter());
	haServices.setCheckpointRecoveryFactory(testingCheckpointRecoveryFactory);

	try {
		createJobMaster(
			configuration,
			jobGraphWithNewOperator,
			haServices,
			new TestingJobManagerSharedServicesBuilder().build());
		fail("Should fail because we cannot resume the changed JobGraph from the savepoint.");
	} catch (IllegalStateException expected) {
		// that was expected :-)
	}

	// allow for non restored state
	jobGraphWithNewOperator.setSavepointRestoreSettings(
		SavepointRestoreSettings.forPath(
			savepointFile.getAbsolutePath(),
			true));

	final JobMaster jobMaster = createJobMaster(
		configuration,
		jobGraphWithNewOperator,
		haServices,
		new TestingJobManagerSharedServicesBuilder().build());

	try {
		// starting the JobMaster should have read the savepoint
		final CompletedCheckpoint savepointCheckpoint = completedCheckpointStore.getLatestCheckpoint();

		assertThat(savepointCheckpoint, Matchers.notNullValue());

		assertThat(savepointCheckpoint.getCheckpointID(), is(savepointId));
	} finally {
		RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
	}
}
 
Example 16
Source File: AbstractOperatorRestoreTestBase.java    From Flink-CEPplus with Apache License 2.0
private String migrateJob(ClassLoader classLoader, ClusterClient<?> clusterClient, Deadline deadline) throws Throwable {

		URL savepointResource = AbstractOperatorRestoreTestBase.class.getClassLoader().getResource("operatorstate/" + getMigrationSavepointName());
		if (savepointResource == null) {
			throw new IllegalArgumentException("Savepoint file does not exist.");
		}
		JobGraph jobToMigrate = createJobGraph(ExecutionMode.MIGRATE);
		jobToMigrate.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointResource.getFile()));

		assertNotNull(jobToMigrate.getJobID());

		clusterClient.submitJob(jobToMigrate, classLoader);

		CompletableFuture<JobStatus> jobRunningFuture = FutureUtils.retrySuccessfulWithDelay(
			() -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
			Time.milliseconds(50),
			deadline,
			(jobStatus) -> jobStatus == JobStatus.RUNNING,
			TestingUtils.defaultScheduledExecutor());
		assertEquals(
			JobStatus.RUNNING,
			jobRunningFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

		// Trigger savepoint
		File targetDirectory = tmpFolder.newFolder();
		String savepointPath = null;

		// FLINK-6918: Retry cancel with savepoint message in case that StreamTasks were not running
		// TODO: The retry logic should be removed once the StreamTask lifecycle has been fixed (see FLINK-4714)
		while (deadline.hasTimeLeft() && savepointPath == null) {
			try {
				savepointPath = clusterClient.cancelWithSavepoint(
					jobToMigrate.getJobID(),
					targetDirectory.getAbsolutePath());
			} catch (Exception e) {
				String exceptionString = ExceptionUtils.stringifyException(e);
				if (!(exceptionString.matches("(.*\n)*.*savepoint for the job .* failed(.*\n)*") // legacy
						|| exceptionString.matches("(.*\n)*.*was not running(.*\n)*")
						|| exceptionString.matches("(.*\n)*.*Not all required tasks are currently running(.*\n)*") // new
						|| exceptionString.matches("(.*\n)*.*Checkpoint was declined \\(tasks not ready\\)(.*\n)*"))) { // new
					throw e;
				}
			}
		}

		assertNotNull("Could not take savepoint.", savepointPath);

		CompletableFuture<JobStatus> jobCanceledFuture = FutureUtils.retrySuccessfulWithDelay(
			() -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
			Time.milliseconds(50),
			deadline,
			(jobStatus) -> jobStatus == JobStatus.CANCELED,
			TestingUtils.defaultScheduledExecutor());
		assertEquals(
			JobStatus.CANCELED,
			jobCanceledFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

		return savepointPath;
	}
 
Example 17
Source File: AbstractOperatorRestoreTestBase.java    From flink with Apache License 2.0
private String migrateJob(ClassLoader classLoader, ClusterClient<?> clusterClient, Deadline deadline) throws Throwable {

		URL savepointResource = AbstractOperatorRestoreTestBase.class.getClassLoader().getResource("operatorstate/" + getMigrationSavepointName());
		if (savepointResource == null) {
			throw new IllegalArgumentException("Savepoint file does not exist.");
		}
		JobGraph jobToMigrate = createJobGraph(ExecutionMode.MIGRATE);
		jobToMigrate.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointResource.getFile()));

		assertNotNull(jobToMigrate.getJobID());

		clusterClient.submitJob(jobToMigrate, classLoader);

		CompletableFuture<JobStatus> jobRunningFuture = FutureUtils.retrySuccessfulWithDelay(
			() -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
			Time.milliseconds(50),
			deadline,
			(jobStatus) -> jobStatus == JobStatus.RUNNING,
			TestingUtils.defaultScheduledExecutor());
		assertEquals(
			JobStatus.RUNNING,
			jobRunningFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

		// Trigger savepoint
		File targetDirectory = tmpFolder.newFolder();
		String savepointPath = null;

		// FLINK-6918: Retry cancel with savepoint message in case that StreamTasks were not running
		// TODO: The retry logic should be removed once the StreamTask lifecycle has been fixed (see FLINK-4714)
		while (deadline.hasTimeLeft() && savepointPath == null) {
			try {
				savepointPath = clusterClient.cancelWithSavepoint(
					jobToMigrate.getJobID(),
					targetDirectory.getAbsolutePath());
			} catch (Exception e) {
				String exceptionString = ExceptionUtils.stringifyException(e);
				if (!PATTERN_CANCEL_WITH_SAVEPOINT_TOLERATED_EXCEPTIONS.matcher(exceptionString).find()) {
					throw e;
				}
			}
		}

		assertNotNull("Could not take savepoint.", savepointPath);

		CompletableFuture<JobStatus> jobCanceledFuture = FutureUtils.retrySuccessfulWithDelay(
			() -> clusterClient.getJobStatus(jobToMigrate.getJobID()),
			Time.milliseconds(50),
			deadline,
			(jobStatus) -> jobStatus == JobStatus.CANCELED,
			TestingUtils.defaultScheduledExecutor());
		assertEquals(
			JobStatus.CANCELED,
			jobCanceledFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));

		return savepointPath;
	}
 
Example 18
Source File: JobMasterTest.java    From flink with Apache License 2.0
/**
 * Tests that a JobMaster will only restore a modified JobGraph if non
 * restored state is allowed.
 */
@Test
public void testRestoringModifiedJobFromSavepoint() throws Exception {

	// create savepoint data
	final long savepointId = 42L;
	final OperatorID operatorID = new OperatorID();
	final File savepointFile = createSavepointWithOperatorState(savepointId, operatorID);

	// set savepoint settings which don't allow non restored state
	final SavepointRestoreSettings savepointRestoreSettings = SavepointRestoreSettings.forPath(
		savepointFile.getAbsolutePath(),
		false);

	// create a new operator
	final JobVertex jobVertex = new JobVertex("New operator");
	jobVertex.setInvokableClass(NoOpInvokable.class);
	final JobGraph jobGraphWithNewOperator = createJobGraphFromJobVerticesWithCheckpointing(savepointRestoreSettings, jobVertex);

	final StandaloneCompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
	final TestingCheckpointRecoveryFactory testingCheckpointRecoveryFactory = new TestingCheckpointRecoveryFactory(completedCheckpointStore, new StandaloneCheckpointIDCounter());
	haServices.setCheckpointRecoveryFactory(testingCheckpointRecoveryFactory);

	try {
		createJobMaster(
			configuration,
			jobGraphWithNewOperator,
			haServices,
			new TestingJobManagerSharedServicesBuilder().build());
		fail("Should fail because we cannot resume the changed JobGraph from the savepoint.");
	} catch (IllegalStateException expected) {
		// that was expected :-)
	}

	// allow for non restored state
	jobGraphWithNewOperator.setSavepointRestoreSettings(
		SavepointRestoreSettings.forPath(
			savepointFile.getAbsolutePath(),
			true));

	final JobMaster jobMaster = createJobMaster(
		configuration,
		jobGraphWithNewOperator,
		haServices,
		new TestingJobManagerSharedServicesBuilder().build());

	try {
		// starting the JobMaster should have read the savepoint
		final CompletedCheckpoint savepointCheckpoint = completedCheckpointStore.getLatestCheckpoint(false);

		assertThat(savepointCheckpoint, Matchers.notNullValue());

		assertThat(savepointCheckpoint.getCheckpointID(), is(savepointId));
	} finally {
		RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
	}
}
 
Example 19
Source File: RescalingITCase.java    From Flink-CEPplus with Apache License 2.0
/**
 * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
 * rescaled operator has non-partitioned state.
 *
 * @throws Exception
 */
@Test
public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
	final int parallelism = numSlots / 2;
	final int parallelism2 = numSlots;
	final int maxParallelism = 13;

	Duration timeout = Duration.ofMinutes(3);
	Deadline deadline = Deadline.now().plus(timeout);

	ClusterClient<?> client = cluster.getClusterClient();

	try {
		JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

		final JobID jobID = jobGraph.getJobID();

		client.setDetached(true);
		client.submitJob(jobGraph, RescalingITCase.class.getClassLoader());

		// wait until the operator is started
		StateSourceBase.workStartedLatch.await();

		CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null);

		final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

		client.cancel(jobID);

		while (!getRunningJobs(client).isEmpty()) {
			Thread.sleep(50);
		}

		// job successfully removed
		JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

		scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));

		client.setDetached(false);
		client.submitJob(scaledJobGraph, RescalingITCase.class.getClassLoader());
	} catch (JobExecutionException exception) {
		if (exception.getCause() instanceof IllegalStateException) {
			// we expect an IllegalStateException wrapped
			// in a JobExecutionException, because the job containing non-partitioned state
			// is being rescaled
		} else {
			throw exception;
		}
	}
}
 
Example 20
Source File: FlinkPravegaReaderSavepointITCase.java    From flink-connectors with Apache License 2.0
@Test
public void testPravegaWithSavepoint() throws Exception {
    final int sourceParallelism = 4;
    final int numPravegaSegments = 4;
    final int numElements = NUM_STREAM_ELEMENTS;

    // set up the stream
    final String streamName = RandomStringUtils.randomAlphabetic(20);
    SETUP_UTILS.createTestStream(streamName, numPravegaSegments);

    // we create two independent Flink jobs (that come from the same program)
    final JobGraph program1 = getFlinkJob(sourceParallelism, streamName, numElements);

    try (
            final EventStreamWriter<Integer> eventWriter = SETUP_UTILS.getIntegerWriter(streamName);

            // create the producer that writes to the stream
            final ThrottledIntegerWriter producer = new ThrottledIntegerWriter(
                    eventWriter,
                    numElements,
                    numElements / 2,  // the latest point at which the thread must be un-throttled
                    1,                 // the initial sleep time per element
                    false
            )

    ) {
        // the object on which we block while waiting for the checkpoint completion
        final OneShotLatch sync = new OneShotLatch();
        NotifyingMapper.TO_CALL_ON_COMPLETION.set(sync::trigger);

        // launch the Flink program from a separate thread
        final CheckedThread flinkRunner = new CheckedThread() {
            @Override
            public void go() throws Exception {
                MINI_CLUSTER.submitJob(program1);
            }
        };

        producer.start();
        flinkRunner.start();

        // wait until at least one checkpoint is complete before triggering the savepoint
        sync.await();

        // now that we are comfortably into the program, trigger a savepoint
        String savepointPath = null;

        // since with the short timeouts we configure in these tests, Pravega Checkpoints
        // sometimes don't complete in time, we retry a bit here
        for (int attempt = 1; savepointPath == null && attempt <= 5; attempt++) {
            savepointPath = MINI_CLUSTER.triggerSavepoint(program1.getJobID(), tmpFolder.newFolder().getAbsolutePath(), false).get();
        }

        assertNotNull("Failed to trigger a savepoint", savepointPath);

        // now cancel the job and relaunch a new one
        MINI_CLUSTER.cancelJob(program1.getJobID());

        try {
            // this throws an exception that the job was cancelled
            flinkRunner.sync();
        } catch (JobCancellationException ignored) {
        }

        producer.unthrottle();

        // now, resume with a new program
        final JobGraph program2 = getFlinkJob(sourceParallelism, streamName, numElements);
        program2.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath, false));

        // if these calls complete without exception, then the test passes
        try {
            MINI_CLUSTER.executeJobBlocking(program2);
        } catch (Exception e) {
            if (!(ExceptionUtils.getRootCause(e) instanceof SuccessException)) {
                throw e;
            }
        }
    }
}