Java Code Examples for org.apache.flink.client.program.ClusterClient#submitJob()

The following examples show how to use org.apache.flink.client.program.ClusterClient#submitJob(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also check out related API usage in the sidebar.
Example 1
Source File: AbstractOperatorRestoreTestBase.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Submits the {@code RESTORE}-mode job graph, configured to start from the given savepoint,
 * and asserts that the job reports {@link JobStatus#FINISHED} before the deadline expires.
 *
 * @param classLoader class loader handed to the cluster client on submission
 * @param clusterClient client used to submit the job and to query its status
 * @param deadline upper bound for the status polling and for the final blocking wait
 * @param savepointPath savepoint the job is restored from
 * @throws Exception if submission, polling or the final wait fails
 */
private void restoreJob(ClassLoader classLoader, ClusterClient<?> clusterClient, Deadline deadline, String savepointPath) throws Exception {
	final JobGraph restoreGraph = createJobGraph(ExecutionMode.RESTORE);
	final SavepointRestoreSettings restoreSettings = SavepointRestoreSettings.forPath(savepointPath, allowNonRestoredState);
	restoreGraph.setSavepointRestoreSettings(restoreSettings);

	assertNotNull("Job doesn't have a JobID.", restoreGraph.getJobID());

	clusterClient.submitJob(restoreGraph, classLoader);

	// Keep re-querying the job status (50 ms delay argument) until it is FINISHED.
	final CompletableFuture<JobStatus> finishedFuture = FutureUtils.retrySuccessfulWithDelay(
		() -> clusterClient.getJobStatus(restoreGraph.getJobID()),
		Time.milliseconds(50),
		deadline,
		status -> JobStatus.FINISHED == status,
		TestingUtils.defaultScheduledExecutor());

	final long remainingMillis = deadline.timeLeft().toMillis();
	final JobStatus observedStatus = finishedFuture.get(remainingMillis, TimeUnit.MILLISECONDS);
	assertEquals(JobStatus.FINISHED, observedStatus);
}
 
Example 2
Source File: SavepointITCase.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Starts a cluster obtained from the factory, submits a freshly created job in detached mode,
 * waits on the {@code StatefulCounter} progress latch and then cancels the job with a savepoint.
 * The cluster is torn down and the counter reset in all cases.
 *
 * @param clusterFactory supplies the mini cluster to run on
 * @param parallelism parallelism of the job and of the counter reset
 * @return the result of {@code cancelWithSavepoint} (presumably the savepoint path)
 * @throws Exception if any step fails
 */
private String submitJobAndTakeSavepoint(MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
	final JobGraph graph = createJobGraph(parallelism, 0, 1000);
	final JobID submittedJobId = graph.getJobID();
	StatefulCounter.resetForTest(parallelism);

	final MiniClusterWithClientResource miniCluster = clusterFactory.get();
	miniCluster.before();
	final ClusterClient<?> clusterClient = miniCluster.getClusterClient();

	try {
		clusterClient.setDetached(true);
		final ClassLoader userClassLoader = SavepointITCase.class.getClassLoader();
		clusterClient.submitJob(graph, userClassLoader);

		// Do not take the savepoint before the job has signalled progress.
		StatefulCounter.getProgressLatch().await();

		final String savepointPath = clusterClient.cancelWithSavepoint(submittedJobId, null);
		return savepointPath;
	} finally {
		miniCluster.after();
		StatefulCounter.resetForTest(parallelism);
	}
}
 
Example 3
Source File: SavepointITCase.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Runs a new job on a cluster from the given factory (detached submission), blocks until the
 * {@code StatefulCounter} progress latch fires, and cancels the job while taking a savepoint.
 * Cluster shutdown and counter reset happen in the {@code finally} block.
 *
 * @param clusterFactory supplies the mini cluster to run on
 * @param parallelism parallelism of the job and of the counter reset
 * @return the result of {@code cancelWithSavepoint} (presumably the savepoint path)
 * @throws Exception if any step fails
 */
private String submitJobAndTakeSavepoint(MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
	final JobGraph counterJob = createJobGraph(parallelism, 0, 1000);
	final JobID counterJobId = counterJob.getJobID();
	StatefulCounter.resetForTest(parallelism);

	final MiniClusterWithClientResource clusterResource = clusterFactory.get();
	clusterResource.before();
	final ClusterClient<?> clusterClient = clusterResource.getClusterClient();

	try {
		clusterClient.setDetached(true);
		final ClassLoader testClassLoader = SavepointITCase.class.getClassLoader();
		clusterClient.submitJob(counterJob, testClassLoader);

		// Wait for the job to make progress before cancelling it with a savepoint.
		StatefulCounter.getProgressLatch().await();

		final String takenSavepoint = clusterClient.cancelWithSavepoint(counterJobId, null);
		return takenSavepoint;
	} finally {
		clusterResource.after();
		StatefulCounter.resetForTest(parallelism);
	}
}
 
Example 4
Source File: SavepointITCase.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Restores a job from the given savepoint on a cluster obtained from the factory, waits until
 * the state was restored and some progress was made, cancels the job, waits for it to reach
 * {@link JobStatus#CANCELED}, disposes the savepoint and asserts that the savepoint path no
 * longer exists on disk.
 *
 * @param savepointPath savepoint to restore from and to dispose afterwards
 * @param clusterFactory supplies the mini cluster to run on
 * @param parallelism parallelism of the job and of the counter reset
 * @throws Exception if any step fails, including the job not reaching CANCELED in time
 */
private void restoreJobAndVerifyState(String savepointPath, MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
	final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
	jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
	final JobID jobId = jobGraph.getJobID();
	StatefulCounter.resetForTest(parallelism);

	MiniClusterWithClientResource cluster = clusterFactory.get();
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {
		client.setDetached(true);
		client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

		// Await state is restored
		StatefulCounter.getRestoreLatch().await();

		// Await some progress after restore
		StatefulCounter.getProgressLatch().await();

		client.cancel(jobId);

		// Block on the polling future: previously it was created and discarded, so the
		// savepoint could be disposed while the job was still being cancelled and a job that
		// never reached CANCELED went unnoticed.
		FutureUtils.retrySuccessfulWithDelay(
			() -> client.getJobStatus(jobId),
			Time.milliseconds(50),
			Deadline.now().plus(Duration.ofSeconds(30)),
			status -> status == JobStatus.CANCELED,
			TestingUtils.defaultScheduledExecutor()
		).get();

		client.disposeSavepoint(savepointPath)
			.get();

		assertFalse("Savepoint not properly cleaned up.", new File(savepointPath).exists());
	} finally {
		cluster.after();
		StatefulCounter.resetForTest(parallelism);
	}
}
 
Example 5
Source File: NetworkStackThroughputITCase.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the throughput job in attached mode on the given cluster and logs the achieved
 * throughput in MBit/s, derived from the configured data volume and the job's net runtime.
 *
 * @param cluster cluster resource providing the client
 * @param dataVolumeGb data volume in gigabytes passed to {@code createJobGraph}
 * @param useForwarder forwarded to {@code createJobGraph}
 * @param isSlowSender forwarded to {@code createJobGraph}
 * @param isSlowReceiver forwarded to {@code createJobGraph}
 * @param parallelism forwarded to {@code createJobGraph}
 * @throws Exception if job submission or execution fails
 */
private void testProgram(
		final MiniClusterWithClientResource cluster,
		final int dataVolumeGb,
		final boolean useForwarder,
		final boolean isSlowSender,
		final boolean isSlowReceiver,
		final int parallelism) throws Exception {
	ClusterClient<?> client = cluster.getClusterClient();
	// Attached mode: the submit call is expected to hand back the execution result.
	client.setDetached(false);
	client.setPrintStatusDuringExecution(false);

	JobExecutionResult jer = (JobExecutionResult) client.submitJob(
		createJobGraph(
			dataVolumeGb,
			useForwarder,
			isSlowSender,
			isSlowReceiver,
			parallelism),
		getClass().getClassLoader());

	// Multiply in long arithmetic: "dataVolumeGb * 8192" was an int multiplication that
	// could overflow before being widened to long.
	long dataVolumeMbit = dataVolumeGb * 8192L;
	long runtimeSecs = jer.getNetRuntime(TimeUnit.SECONDS);

	// NOTE: a runtime that rounds down to 0 seconds makes the double division non-finite;
	// the int cast then saturates instead of throwing.
	int mbitPerSecond = (int) (((double) dataVolumeMbit) / runtimeSecs);

	LOG.info(String.format("Test finished with throughput of %d MBit/s (runtime [secs]: %d, " +
		"data volume [gb/mbits]: %d/%d)", mbitPerSecond, runtimeSecs, dataVolumeGb, dataVolumeMbit));
}
 
Example 6
Source File: SavepointWriterITCase.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a streaming job with two uid-tagged pipelines (accounts and currency rates), restores
 * it from the given savepoint, submits it, and asserts that the collecting sink received
 * exactly three elements.
 *
 * @param savepointPath savepoint the job is restored from (non-restored state is not allowed)
 * @throws ProgramInvocationException declared for the job submission
 */
private void validateBootstrap(String savepointPath) throws ProgramInvocationException {
	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setStateBackend(backend);

	// Start from an empty result list so the size assertion only sees this run's output.
	CollectSink.accountList.clear();

	// Keyed account pipeline whose output is collected for verification.
	env.fromCollection(accounts)
		.keyBy(account -> account.id)
		.flatMap(new UpdateAndGetAccount())
		.uid(ACCOUNT_UID)
		.addSink(new CollectSink());

	// Currency pipeline connected to its own broadcast stream; output is discarded.
	env
		.fromCollection(currencyRates)
		.connect(env.fromCollection(currencyRates).broadcast(descriptor))
		.process(new CurrencyValidationFunction())
		.uid(CURRENCY_UID)
		.addSink(new DiscardingSink<>());

	final JobGraph restoreGraph = env.getStreamGraph().getJobGraph();
	final SavepointRestoreSettings restoreSettings = SavepointRestoreSettings.forPath(savepointPath, false);
	restoreGraph.setSavepointRestoreSettings(restoreSettings);

	final ClusterClient<?> clusterClient = miniClusterResource.getClusterClient();
	final ClassLoader testClassLoader = SavepointWriterITCase.class.getClassLoader();
	clusterClient.submitJob(restoreGraph, testClassLoader);

	final int collectedAccounts = CollectSink.accountList.size();
	Assert.assertEquals("Unexpected output", 3, collectedAccounts);
}
 
Example 7
Source File: SavepointITCase.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Restores a job from the given savepoint on a cluster obtained from the factory, waits until
 * the state was restored and some progress was made, cancels the job, waits for it to reach
 * {@link JobStatus#CANCELED}, disposes the savepoint and asserts that the savepoint path no
 * longer exists on disk.
 *
 * @param savepointPath savepoint to restore from and to dispose afterwards
 * @param clusterFactory supplies the mini cluster to run on
 * @param parallelism parallelism of the job and of the counter reset
 * @throws Exception if any step fails, including the job not reaching CANCELED in time
 */
private void restoreJobAndVerifyState(String savepointPath, MiniClusterResourceFactory clusterFactory, int parallelism) throws Exception {
	final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
	jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
	final JobID jobId = jobGraph.getJobID();
	StatefulCounter.resetForTest(parallelism);

	MiniClusterWithClientResource cluster = clusterFactory.get();
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {
		client.setDetached(true);
		client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());

		// Await state is restored
		StatefulCounter.getRestoreLatch().await();

		// Await some progress after restore
		StatefulCounter.getProgressLatch().await();

		client.cancel(jobId);

		// Block on the polling future: previously it was created and discarded, so the
		// savepoint could be disposed while the job was still being cancelled and a job that
		// never reached CANCELED went unnoticed.
		FutureUtils.retrySuccessfulWithDelay(
			() -> client.getJobStatus(jobId),
			Time.milliseconds(50),
			Deadline.now().plus(Duration.ofSeconds(30)),
			status -> status == JobStatus.CANCELED,
			TestingUtils.defaultScheduledExecutor()
		).get();

		client.disposeSavepoint(savepointPath)
			.get();

		assertFalse("Savepoint not properly cleaned up.", new File(savepointPath).exists());
	} finally {
		cluster.after();
		StatefulCounter.resetForTest(parallelism);
	}
}
 
Example 8
Source File: CancelingTestBase.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Submits the job built from the plan in detached mode, waits (up to two minutes) for it to be
 * RUNNING, sleeps for {@code msecsTillCanceling}, cancels the job and waits (up to
 * {@code maxTimeTillCanceled} milliseconds) for it to be CANCELED. Fails the test if either
 * state is not observed in time.
 *
 * @param plan plan the job graph is created from
 * @param msecsTillCanceling time to let the job run before cancelling, in milliseconds
 * @param maxTimeTillCanceled maximum time to wait for the CANCELED state, in milliseconds
 * @throws Exception if submission, status queries or cancellation fail
 */
protected void runAndCancelJob(Plan plan, final int msecsTillCanceling, int maxTimeTillCanceled) throws Exception {
	// submit job
	final JobGraph graph = getJobGraph(plan);

	final ClusterClient<?> clusterClient = CLUSTER.getClusterClient();
	clusterClient.setDetached(true);

	final JobSubmissionResult submission = clusterClient.submitJob(graph, CancelingTestBase.class.getClassLoader());

	final Deadline runningDeadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();

	// Poll every 50 ms until the job runs or the deadline has passed.
	JobStatus statusBeforeCancel;
	for (;;) {
		statusBeforeCancel = clusterClient.getJobStatus(submission.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
		if (statusBeforeCancel == JobStatus.RUNNING || !runningDeadline.hasTimeLeft()) {
			break;
		}
		Thread.sleep(50);
	}
	if (statusBeforeCancel != JobStatus.RUNNING) {
		Assert.fail("Job not in state RUNNING.");
	}

	Thread.sleep(msecsTillCanceling);

	clusterClient.cancel(submission.getJobID());

	final Deadline canceledDeadline = new FiniteDuration(maxTimeTillCanceled, TimeUnit.MILLISECONDS).fromNow();

	// Same polling scheme, this time waiting for the CANCELED state.
	JobStatus statusAfterCancel;
	for (;;) {
		statusAfterCancel = clusterClient.getJobStatus(submission.getJobID()).get(GET_FUTURE_TIMEOUT, TimeUnit.MILLISECONDS);
		if (statusAfterCancel == JobStatus.CANCELED || !canceledDeadline.hasTimeLeft()) {
			break;
		}
		Thread.sleep(50);
	}
	if (statusAfterCancel != JobStatus.CANCELED) {
		Assert.fail("Failed to cancel job with ID " + submission.getJobID() + '.');
	}
}
 
Example 9
Source File: SavepointReaderKeyedStateITCase.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Submits the given job in detached mode, waits (up to five minutes) until
 * {@code SavepointSource} reports that it is finished, then triggers a savepoint into a fresh
 * temporary directory and returns the path the savepoint future completes with. The job is
 * cancelled afterwards in all cases.
 *
 * @param jobGraph job to submit and take a savepoint of
 * @return the path returned by the savepoint future
 * @throws Exception if submission, waiting or the savepoint fails
 */
private String takeSavepoint(JobGraph jobGraph) throws Exception {
	SavepointSource.initializeForTest();

	ClusterClient<?> client = miniClusterResource.getClusterClient();
	// Set once; the original repeated this call inside the try block without any change in between.
	client.setDetached(true);

	JobID jobId = jobGraph.getJobID();

	Deadline deadline = Deadline.fromNow(Duration.ofMinutes(5));

	String dirPath = getTempDirPath(new AbstractID().toHexString());

	try {
		JobSubmissionResult result = client.submitJob(jobGraph, getClass().getClassLoader());

		boolean finished = false;
		while (deadline.hasTimeLeft()) {
			if (SavepointSource.isFinished()) {
				finished = true;

				break;
			}

			// Back off between checks instead of spinning on a core until the deadline.
			Thread.sleep(50);
		}

		if (!finished) {
			Assert.fail("Failed to initialize state within deadline");
		}

		CompletableFuture<String> path = client.triggerSavepoint(result.getJobID(), dirPath);
		return path.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
	} finally {
		client.cancel(jobId);
	}
}
 
Example 10
Source File: NetworkStackThroughputITCase.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Runs the throughput job in attached mode on the given cluster and logs the achieved
 * throughput in MBit/s, derived from the configured data volume and the job's net runtime.
 *
 * @param cluster cluster resource providing the client
 * @param dataVolumeGb data volume in gigabytes passed to {@code createJobGraph}
 * @param useForwarder forwarded to {@code createJobGraph}
 * @param isSlowSender forwarded to {@code createJobGraph}
 * @param isSlowReceiver forwarded to {@code createJobGraph}
 * @param parallelism forwarded to {@code createJobGraph}
 * @throws Exception if job submission or execution fails
 */
private void testProgram(
		final MiniClusterWithClientResource cluster,
		final int dataVolumeGb,
		final boolean useForwarder,
		final boolean isSlowSender,
		final boolean isSlowReceiver,
		final int parallelism) throws Exception {
	ClusterClient<?> client = cluster.getClusterClient();
	// Attached mode: the submit call is expected to hand back the execution result.
	client.setDetached(false);
	client.setPrintStatusDuringExecution(false);

	JobExecutionResult jer = (JobExecutionResult) client.submitJob(
		createJobGraph(
			dataVolumeGb,
			useForwarder,
			isSlowSender,
			isSlowReceiver,
			parallelism),
		getClass().getClassLoader());

	// Multiply in long arithmetic: "dataVolumeGb * 8192" was an int multiplication that
	// could overflow before being widened to long.
	long dataVolumeMbit = dataVolumeGb * 8192L;
	long runtimeSecs = jer.getNetRuntime(TimeUnit.SECONDS);

	// NOTE: a runtime that rounds down to 0 seconds makes the double division non-finite;
	// the int cast then saturates instead of throwing.
	int mbitPerSecond = (int) (((double) dataVolumeMbit) / runtimeSecs);

	LOG.info(String.format("Test finished with throughput of %d MBit/s (runtime [secs]: %d, " +
		"data volume [gb/mbits]: %d/%d)", mbitPerSecond, runtimeSecs, dataVolumeGb, dataVolumeMbit));
}
 
Example 11
Source File: SavepointITCase.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Submits a job whose restore settings point to a non-existing savepoint path and verifies
 * that the attached submission fails with a {@code JobExecutionException} that has a
 * {@code FileNotFoundException} in its cause chain.
 *
 * @throws Exception if the cluster cannot be started or submission fails in an unexpected way
 */
@Test
public void testSubmitWithUnknownSavepointPath() throws Exception {
	// Config
	int numTaskManagers = 1;
	int numSlotsPerTaskManager = 1;
	int parallelism = numTaskManagers * numSlotsPerTaskManager;

	final Configuration config = new Configuration();
	config.setString(CheckpointingOptions.SAVEPOINT_DIRECTORY, savepointDir.toURI().toString());

	MiniClusterWithClientResource cluster = new MiniClusterWithClientResource(
		new MiniClusterResourceConfiguration.Builder()
			.setConfiguration(config)
			.setNumberTaskManagers(numTaskManagers)
			.setNumberSlotsPerTaskManager(numSlotsPerTaskManager)
			.build());
	cluster.before();
	ClusterClient<?> client = cluster.getClusterClient();

	try {

		// High value to ensure timeouts if restarted.
		int numberOfRetries = 1000;
		// Submit the job
		// Long delay to ensure that the test times out if the job
		// manager tries to restart the job.
		final JobGraph jobGraph = createJobGraph(parallelism, numberOfRetries, 3600000);

		// Set non-existing savepoint path
		jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath("unknown path"));
		assertEquals("unknown path", jobGraph.getSavepointRestoreSettings().getRestorePath());

		// The client is switched to attached mode below, so the message says so.
		LOG.info("Submitting job " + jobGraph.getJobID() + " in attached mode.");

		try {
			client.setDetached(false);
			client.submitJob(jobGraph, SavepointITCase.class.getClassLoader());
		} catch (Exception e) {
			Optional<JobExecutionException> expectedJobExecutionException = ExceptionUtils.findThrowable(e, JobExecutionException.class);
			Optional<FileNotFoundException> expectedFileNotFoundException = ExceptionUtils.findThrowable(e, FileNotFoundException.class);
			if (!(expectedJobExecutionException.isPresent() && expectedFileNotFoundException.isPresent())) {
				throw e;
			}
			// Expected failure observed.
			return;
		}

		// Reaching this point means the submission returned normally; previously the test
		// passed silently in that case. AssertionError is not caught by the catch above.
		throw new AssertionError("Job submission with an unknown savepoint path did not fail.");
	} finally {
		cluster.after();
	}
}
 
Example 12
Source File: AbstractOperatorRestoreTestBase.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Submits the {@code MIGRATE}-mode job restored from the bundled migration savepoint, waits
 * until it is RUNNING, cancels it with a savepoint into a new temporary folder (retrying on
 * tolerated exceptions) and waits until the job is CANCELED.
 *
 * @param classLoader class loader handed to the cluster client on submission
 * @param clusterClient client used to submit, query and cancel the job
 * @param deadline upper bound for all waiting and retrying
 * @return path of the savepoint taken while cancelling
 * @throws Throwable if any step fails or a non-tolerated exception occurs while cancelling
 */
private String migrateJob(ClassLoader classLoader, ClusterClient<?> clusterClient, Deadline deadline) throws Throwable {

	final String resourceName = "operatorstate/" + getMigrationSavepointName();
	final URL migrationSavepoint = AbstractOperatorRestoreTestBase.class.getClassLoader().getResource(resourceName);
	if (migrationSavepoint == null) {
		throw new IllegalArgumentException("Savepoint file does not exist.");
	}
	final JobGraph migrationGraph = createJobGraph(ExecutionMode.MIGRATE);
	migrationGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(migrationSavepoint.getFile()));

	assertNotNull(migrationGraph.getJobID());

	clusterClient.submitJob(migrationGraph, classLoader);

	final CompletableFuture<JobStatus> runningFuture = FutureUtils.retrySuccessfulWithDelay(
		() -> clusterClient.getJobStatus(migrationGraph.getJobID()),
		Time.milliseconds(50),
		deadline,
		status -> JobStatus.RUNNING == status,
		TestingUtils.defaultScheduledExecutor());
	final JobStatus statusAfterSubmit = runningFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
	assertEquals(JobStatus.RUNNING, statusAfterSubmit);

	// Trigger savepoint
	final File savepointTarget = tmpFolder.newFolder();
	String takenSavepoint = null;

	// FLINK-6918: Retry cancel with savepoint message in case that StreamTasks were not running
	// TODO: The retry logic should be removed once the StreamTask lifecycle has been fixed (see FLINK-4714)
	while (deadline.hasTimeLeft() && takenSavepoint == null) {
		try {
			takenSavepoint = clusterClient.cancelWithSavepoint(
				migrationGraph.getJobID(),
				savepointTarget.getAbsolutePath());
		} catch (Exception e) {
			final String stackTrace = ExceptionUtils.stringifyException(e);
			final boolean tolerated = PATTERN_CANCEL_WITH_SAVEPOINT_TOLERATED_EXCEPTIONS.matcher(stackTrace).find();
			if (!tolerated) {
				throw e;
			}
		}
	}

	assertNotNull("Could not take savepoint.", takenSavepoint);

	final CompletableFuture<JobStatus> canceledFuture = FutureUtils.retrySuccessfulWithDelay(
		() -> clusterClient.getJobStatus(migrationGraph.getJobID()),
		Time.milliseconds(50),
		deadline,
		status -> JobStatus.CANCELED == status,
		TestingUtils.defaultScheduledExecutor());
	final JobStatus statusAfterCancel = canceledFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
	assertEquals(JobStatus.CANCELED, statusAfterCancel);

	return takenSavepoint;
}
 
Example 13
Source File: AccumulatorLiveITCase.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Submits the job from a separate checked thread and, once the mapper has notified, polls the
 * job's accumulators until exactly one accumulator named {@code ACCUMULATOR_NAME} with value
 * {@code NUM_ITERATIONS} is reported (30 second deadline). The shutdown latch is triggered and
 * the submission thread is synced in all cases.
 *
 * @param jobGraph job to submit
 * @throws Exception if the accumulators are not observed in time or the submission thread failed
 */
private static void submitJobAndVerifyResults(JobGraph jobGraph) throws Exception {
	final Deadline verificationDeadline = Deadline.now().plus(Duration.ofSeconds(30));

	final ClusterClient<?> clusterClient = MINI_CLUSTER_RESOURCE.getClusterClient();

	// Submission happens on its own thread so this thread can inspect the job meanwhile.
	final CheckedThread submitter = new CheckedThread() {
		@Override
		public void go() throws Exception {
			clusterClient.submitJob(jobGraph, AccumulatorLiveITCase.class.getClassLoader());
		}
	};

	submitter.start();

	try {
		NotifyingMapper.notifyLatch.await();

		FutureUtils.retrySuccessfulWithDelay(
			() -> {
				// Turn the synchronous accumulator query into a completed (or failed) future.
				try {
					return CompletableFuture.completedFuture(clusterClient.getAccumulators(jobGraph.getJobID()));
				} catch (Exception queryFailure) {
					return FutureUtils.completedExceptionally(queryFailure);
				}
			},
			Time.milliseconds(20),
			verificationDeadline,
			reported -> reported.size() == 1
				&& reported.containsKey(ACCUMULATOR_NAME)
				&& (int) reported.get(ACCUMULATOR_NAME).getUnchecked() == NUM_ITERATIONS,
			TestingUtils.defaultScheduledExecutor()
		).get(verificationDeadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

		NotifyingMapper.shutdownLatch.trigger();
	} finally {
		NotifyingMapper.shutdownLatch.trigger();

		// wait for the job to have terminated
		submitter.sync();
	}
}
 
Example 14
Source File: JMXJobManagerMetricTest.java    From Flink-CEPplus with Apache License 2.0 4 votes vote down vote up
/**
 * Verifies that a JobManager metric ({@code lastCheckpointSize} of the submitted job) can be
 * looked up through the platform MBean server once the job is running.
 */
@Test
public void testJobManagerJMXMetricAccess() throws Exception {
	final Deadline testDeadline = Deadline.now().plus(Duration.ofMinutes(2));

	try {
		final JobVertex blockingSource = new JobVertex("Source");
		blockingSource.setInvokableClass(BlockingInvokable.class);

		final JobGraph testingJob = new JobGraph("TestingJob", blockingSource);
		final JobCheckpointingSettings checkpointingSettings = new JobCheckpointingSettings(
			Collections.<JobVertexID>emptyList(),
			Collections.<JobVertexID>emptyList(),
			Collections.<JobVertexID>emptyList(),
			new CheckpointCoordinatorConfiguration(
				500,
				500,
				50,
				5,
				CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
				true),
			null);
		testingJob.setSnapshotSettings(checkpointingSettings);

		final ClusterClient<?> clusterClient = MINI_CLUSTER_RESOURCE.getClusterClient();
		clusterClient.setDetached(true);
		clusterClient.submitJob(testingJob, JMXJobManagerMetricTest.class.getClassLoader());

		// The metric is only queried after the job has been seen RUNNING.
		FutureUtils.retrySuccessfulWithDelay(
			() -> clusterClient.getJobStatus(testingJob.getJobID()),
			Time.milliseconds(10),
			testDeadline,
			jobStatus -> JobStatus.RUNNING == jobStatus,
			TestingUtils.defaultScheduledExecutor()
		).get(testDeadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

		final MBeanServer platformServer = ManagementFactory.getPlatformMBeanServer();
		final ObjectName metricQuery = new ObjectName("org.apache.flink.jobmanager.job.lastCheckpointSize:job_name=TestingJob,*");
		final Set<ObjectName> matchingNames = platformServer.queryNames(metricQuery, null);
		Assert.assertEquals(1, matchingNames.size());

		final ObjectName metricName = matchingNames.iterator().next();
		final Object metricValue = platformServer.getAttribute(metricName, "Value");
		assertEquals(-1L, metricValue);

		BlockingInvokable.unblock();
	} finally {
		BlockingInvokable.unblock();
	}
}
 
Example 15
Source File: JMXJobManagerMetricTest.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Verifies that a JobManager metric ({@code lastCheckpointSize} of the submitted job) can be
 * looked up through the platform MBean server once the job is running.
 */
@Test
public void testJobManagerJMXMetricAccess() throws Exception {
	final Deadline testDeadline = Deadline.now().plus(Duration.ofMinutes(2));

	try {
		final JobVertex blockingSource = new JobVertex("Source");
		blockingSource.setInvokableClass(BlockingInvokable.class);

		final JobGraph testingJob = new JobGraph("TestingJob", blockingSource);
		final JobCheckpointingSettings checkpointingSettings = new JobCheckpointingSettings(
			Collections.<JobVertexID>emptyList(),
			Collections.<JobVertexID>emptyList(),
			Collections.<JobVertexID>emptyList(),
			new CheckpointCoordinatorConfiguration(
				500,
				500,
				50,
				5,
				CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
				true,
				false,
				0),
			null);
		testingJob.setSnapshotSettings(checkpointingSettings);

		final ClusterClient<?> clusterClient = MINI_CLUSTER_RESOURCE.getClusterClient();
		clusterClient.setDetached(true);
		clusterClient.submitJob(testingJob, JMXJobManagerMetricTest.class.getClassLoader());

		// The metric is only queried after the job has been seen RUNNING.
		FutureUtils.retrySuccessfulWithDelay(
			() -> clusterClient.getJobStatus(testingJob.getJobID()),
			Time.milliseconds(10),
			testDeadline,
			jobStatus -> JobStatus.RUNNING == jobStatus,
			TestingUtils.defaultScheduledExecutor()
		).get(testDeadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

		final MBeanServer platformServer = ManagementFactory.getPlatformMBeanServer();
		final ObjectName metricQuery = new ObjectName("org.apache.flink.jobmanager.job.lastCheckpointSize:job_name=TestingJob,*");
		final Set<ObjectName> matchingNames = platformServer.queryNames(metricQuery, null);
		Assert.assertEquals(1, matchingNames.size());

		final ObjectName metricName = matchingNames.iterator().next();
		final Object metricValue = platformServer.getAttribute(metricName, "Value");
		assertEquals(-1L, metricValue);

		BlockingInvokable.unblock();
	} finally {
		BlockingInvokable.unblock();
	}
}
 
Example 16
Source File: SavepointMigrationTestBase.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Submits the environment's job in detached mode, polls its accumulators until every expected
 * (name, value) pair is reported, triggers a savepoint and moves the resulting file or
 * directory to {@code savepointPath}.
 *
 * @param env environment whose stream graph is submitted
 * @param savepointPath destination the taken savepoint is moved to
 * @param expectedAccumulators accumulator names with the values that must be reported
 * @throws Exception if submission, polling, the savepoint or the move fails
 */
@SafeVarargs
protected final void executeAndSavepoint(
		StreamExecutionEnvironment env,
		String savepointPath,
		Tuple2<String, Integer>... expectedAccumulators) throws Exception {

	final ClusterClient<?> clusterClient = miniClusterResource.getClusterClient();
	clusterClient.setDetached(true);

	// Submit the job
	final JobGraph graph = env.getStreamGraph().getJobGraph();

	final JobSubmissionResult submission = clusterClient.submitJob(graph, SavepointMigrationTestBase.class.getClassLoader());

	LOG.info("Submitted job {} and waiting...", submission.getJobID());

	// Re-read the accumulators every 100 ms until all expectations hold or time runs out.
	boolean sawExpectedResults = false;
	while (!sawExpectedResults && DEADLINE.hasTimeLeft()) {
		Thread.sleep(100);
		final Map<String, OptionalFailure<Object>> reportedAccumulators = clusterClient.getAccumulators(submission.getJobID());

		boolean everyAccumulatorMatches = true;
		for (Tuple2<String, Integer> expected : expectedAccumulators) {
			final OptionalFailure<Object> reported = reportedAccumulators.get(expected.f0);
			if (reported == null) {
				everyAccumulatorMatches = false;
				break;
			}

			final Integer reportedValue = (Integer) reported.get();
			if (reportedValue == null || !reportedValue.equals(expected.f1)) {
				everyAccumulatorMatches = false;
				break;
			}
		}
		sawExpectedResults = everyAccumulatorMatches;
	}

	if (!sawExpectedResults) {
		fail("Did not see the expected accumulator results within time limit.");
	}

	LOG.info("Triggering savepoint.");

	final CompletableFuture<String> pendingSavepoint = clusterClient.triggerSavepoint(submission.getJobID(), null);

	final String reportedSavepointPath = pendingSavepoint.get(DEADLINE.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

	final File takenSavepoint = new File(new URI(reportedSavepointPath).getPath());
	final File destination = new File(savepointPath);
	// savepoints were changed to be directories in Flink 1.3
	if (takenSavepoint.isDirectory()) {
		FileUtils.moveDirectory(takenSavepoint, destination);
	} else {
		FileUtils.moveFile(takenSavepoint, destination);
	}
}
 
Example 17
Source File: SavepointMigrationTestBase.java    From Flink-CEPplus with Apache License 2.0 4 votes vote down vote up
/**
 * Submits the environment's job in detached mode, polls its accumulators until every expected
 * (name, value) pair is reported, triggers a savepoint and moves the resulting file or
 * directory to {@code savepointPath}.
 *
 * @param env environment whose stream graph is submitted
 * @param savepointPath destination the taken savepoint is moved to
 * @param expectedAccumulators accumulator names with the values that must be reported
 * @throws Exception if submission, polling, the savepoint or the move fails
 */
@SafeVarargs
protected final void executeAndSavepoint(
		StreamExecutionEnvironment env,
		String savepointPath,
		Tuple2<String, Integer>... expectedAccumulators) throws Exception {

	final ClusterClient<?> clusterClient = miniClusterResource.getClusterClient();
	clusterClient.setDetached(true);

	// Submit the job
	final JobGraph graph = env.getStreamGraph().getJobGraph();

	final JobSubmissionResult submission = clusterClient.submitJob(graph, SavepointMigrationTestBase.class.getClassLoader());

	LOG.info("Submitted job {} and waiting...", submission.getJobID());

	// Re-read the accumulators every 100 ms until all expectations hold or time runs out.
	boolean sawExpectedResults = false;
	while (!sawExpectedResults && DEADLINE.hasTimeLeft()) {
		Thread.sleep(100);
		final Map<String, OptionalFailure<Object>> reportedAccumulators = clusterClient.getAccumulators(submission.getJobID());

		boolean everyAccumulatorMatches = true;
		for (Tuple2<String, Integer> expected : expectedAccumulators) {
			final OptionalFailure<Object> reported = reportedAccumulators.get(expected.f0);
			if (reported == null) {
				everyAccumulatorMatches = false;
				break;
			}

			final Integer reportedValue = (Integer) reported.get();
			if (reportedValue == null || !reportedValue.equals(expected.f1)) {
				everyAccumulatorMatches = false;
				break;
			}
		}
		sawExpectedResults = everyAccumulatorMatches;
	}

	if (!sawExpectedResults) {
		fail("Did not see the expected accumulator results within time limit.");
	}

	LOG.info("Triggering savepoint.");

	final CompletableFuture<String> pendingSavepoint = clusterClient.triggerSavepoint(submission.getJobID(), null);

	final String reportedSavepointPath = pendingSavepoint.get(DEADLINE.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

	final File takenSavepoint = new File(new URI(reportedSavepointPath).getPath());
	final File destination = new File(savepointPath);
	// savepoints were changed to be directories in Flink 1.3
	if (takenSavepoint.isDirectory()) {
		FileUtils.moveDirectory(takenSavepoint, destination);
	} else {
		FileUtils.moveFile(takenSavepoint, destination);
	}
}
 
Example 18
Source File: RescalingITCase.java    From Flink-CEPplus with Apache License 2.0 4 votes vote down vote up
/**
 * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
 * rescaled operator has non-partitioned state.
 *
 * <p>The job is first run with {@code numSlots / 2} parallelism, a savepoint is taken and the
 * job is cancelled. It is then resubmitted in attached mode with {@code numSlots} parallelism
 * from that savepoint; this submission must fail with a {@code JobExecutionException} caused
 * by an {@code IllegalStateException}.
 *
 * @throws Exception if any step fails in an unexpected way
 */
@Test
public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
	final int parallelism = numSlots / 2;
	final int parallelism2 = numSlots;
	final int maxParallelism = 13;

	Duration timeout = Duration.ofMinutes(3);
	Deadline deadline = Deadline.now().plus(timeout);

	ClusterClient<?> client = cluster.getClusterClient();

	try {
		JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

		final JobID jobID = jobGraph.getJobID();

		client.setDetached(true);
		client.submitJob(jobGraph, RescalingITCase.class.getClassLoader());

		// wait until the operator is started
		StateSourceBase.workStartedLatch.await();

		CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null);

		final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

		client.cancel(jobID);

		while (!getRunningJobs(client).isEmpty()) {
			Thread.sleep(50);
		}

		// job successfully removed
		JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);

		scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));

		client.setDetached(false);
		client.submitJob(scaledJobGraph, RescalingITCase.class.getClassLoader());

		// The attached submission returned normally, i.e. the expected failure did not
		// happen. Previously the test passed silently in this case.
		throw new AssertionError("Rescaling a job with non-partitioned state from a savepoint did not fail.");
	} catch (JobExecutionException exception) {
		if (exception.getCause() instanceof IllegalStateException) {
			// we expect a IllegalStateException wrapped
			// in a JobExecutionException, because the job containing non-partitioned state
			// is being rescaled
		} else {
			throw exception;
		}
	}
}
 
Example 19
Source File: SavepointMigrationTestBase.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Submits the environment's job in detached mode, restored from {@code savepointPath}, and
 * polls until every expected (name, value) accumulator pair is reported. Each polling round
 * first checks that the job status can be fetched and is not FAILED.
 *
 * @param env environment whose stream graph is submitted
 * @param savepointPath savepoint the job is restored from
 * @param expectedAccumulators accumulator names with the values that must be reported
 * @throws Exception if submission or polling fails
 */
@SafeVarargs
protected final void restoreAndExecute(
		StreamExecutionEnvironment env,
		String savepointPath,
		Tuple2<String, Integer>... expectedAccumulators) throws Exception {

	final ClusterClient<?> clusterClient = miniClusterResource.getClusterClient();
	clusterClient.setDetached(true);

	// Submit the job
	final JobGraph graph = env.getStreamGraph().getJobGraph();

	graph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));

	final JobSubmissionResult submission = clusterClient.submitJob(graph, SavepointMigrationTestBase.class.getClassLoader());

	boolean sawExpectedResults = false;
	while (!sawExpectedResults && DEADLINE.hasTimeLeft()) {

		final JobID submittedJobId = submission.getJobID();

		// try and get a job result, this will fail if the job already failed. Use this
		// to get out of this loop
		try {
			final CompletableFuture<JobStatus> pendingStatus = clusterClient.getJobStatus(submission.getJobID());
			final JobStatus currentStatus = pendingStatus.get(5, TimeUnit.SECONDS);

			assertNotEquals(JobStatus.FAILED, currentStatus);
		} catch (Exception e) {
			fail("Could not connect to job: " + e);
		}

		Thread.sleep(100);
		final Map<String, OptionalFailure<Object>> reportedAccumulators = clusterClient.getAccumulators(submittedJobId);

		boolean everyAccumulatorMatches = true;
		for (Tuple2<String, Integer> expected : expectedAccumulators) {
			final OptionalFailure<Object> reported = reportedAccumulators.get(expected.f0);
			if (reported == null || !reported.get().equals(expected.f1)) {
				everyAccumulatorMatches = false;
				break;
			}
		}

		sawExpectedResults = everyAccumulatorMatches;
	}

	if (!sawExpectedResults) {
		fail("Did not see the expected accumulator results within time limit.");
	}
}
 
Example 20
Source File: AbstractOperatorRestoreTestBase.java    From Flink-CEPplus with Apache License 2.0 4 votes vote down vote up
/**
 * Submits the {@code MIGRATE}-mode job restored from the bundled migration savepoint, waits
 * until it is RUNNING, cancels it with a savepoint into a new temporary folder (retrying on a
 * fixed set of tolerated exception messages) and waits until the job is CANCELED.
 *
 * @param classLoader class loader handed to the cluster client on submission
 * @param clusterClient client used to submit, query and cancel the job
 * @param deadline upper bound for all waiting and retrying
 * @return path of the savepoint taken while cancelling
 * @throws Throwable if any step fails or a non-tolerated exception occurs while cancelling
 */
private String migrateJob(ClassLoader classLoader, ClusterClient<?> clusterClient, Deadline deadline) throws Throwable {

	final String resourceName = "operatorstate/" + getMigrationSavepointName();
	final URL migrationSavepoint = AbstractOperatorRestoreTestBase.class.getClassLoader().getResource(resourceName);
	if (migrationSavepoint == null) {
		throw new IllegalArgumentException("Savepoint file does not exist.");
	}
	final JobGraph migrationGraph = createJobGraph(ExecutionMode.MIGRATE);
	migrationGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(migrationSavepoint.getFile()));

	assertNotNull(migrationGraph.getJobID());

	clusterClient.submitJob(migrationGraph, classLoader);

	final CompletableFuture<JobStatus> runningFuture = FutureUtils.retrySuccessfulWithDelay(
		() -> clusterClient.getJobStatus(migrationGraph.getJobID()),
		Time.milliseconds(50),
		deadline,
		status -> JobStatus.RUNNING == status,
		TestingUtils.defaultScheduledExecutor());
	final JobStatus statusAfterSubmit = runningFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
	assertEquals(JobStatus.RUNNING, statusAfterSubmit);

	// Trigger savepoint
	final File savepointTarget = tmpFolder.newFolder();
	String takenSavepoint = null;

	// FLINK-6918: Retry cancel with savepoint message in case that StreamTasks were not running
	// TODO: The retry logic should be removed once the StreamTask lifecycle has been fixed (see FLINK-4714)
	while (deadline.hasTimeLeft() && takenSavepoint == null) {
		try {
			takenSavepoint = clusterClient.cancelWithSavepoint(
				migrationGraph.getJobID(),
				savepointTarget.getAbsolutePath());
		} catch (Exception e) {
			final String stackTrace = ExceptionUtils.stringifyException(e);
			// Messages that indicate the tasks were simply not ready yet; anything else is rethrown.
			final boolean tolerated =
				stackTrace.matches("(.*\n)*.*savepoint for the job .* failed(.*\n)*") // legacy
					|| stackTrace.matches("(.*\n)*.*was not running(.*\n)*")
					|| stackTrace.matches("(.*\n)*.*Not all required tasks are currently running(.*\n)*") // new
					|| stackTrace.matches("(.*\n)*.*Checkpoint was declined \\(tasks not ready\\)(.*\n)*"); // new
			if (!tolerated) {
				throw e;
			}
		}
	}

	assertNotNull("Could not take savepoint.", takenSavepoint);

	final CompletableFuture<JobStatus> canceledFuture = FutureUtils.retrySuccessfulWithDelay(
		() -> clusterClient.getJobStatus(migrationGraph.getJobID()),
		Time.milliseconds(50),
		deadline,
		status -> JobStatus.CANCELED == status,
		TestingUtils.defaultScheduledExecutor());
	final JobStatus statusAfterCancel = canceledFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
	assertEquals(JobStatus.CANCELED, statusAfterCancel);

	return takenSavepoint;
}