Java Code Examples for org.apache.flink.api.java.utils.ParameterTool#get()

The following examples show how to use org.apache.flink.api.java.utils.ParameterTool#get(). Each example is taken from an open-source project; the source file, project, and license are noted above the code.
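Before looking at the examples, here is a minimal, self-contained sketch of the typical ParameterTool#get() pattern: parse the command-line arguments, read optional values with or without a default, and read mandatory values with getRequired(). The class and argument names below are illustrative, not taken from any of the projects listed here.

import org.apache.flink.api.java.utils.ParameterTool;

public class ParameterToolGetSketch {
	public static void main(String[] args) {
		// parse "--key value" style command-line arguments
		ParameterTool params = ParameterTool.fromArgs(args);

		// get(key) returns null when the key is absent
		String mode = params.get("mode");

		// get(key, defaultValue) falls back to the given default
		String host = params.get("host", "localhost");

		// getRequired(key) throws a RuntimeException when the key is missing
		String jobName = params.getRequired("job-name");

		System.out.println("mode=" + mode + ", host=" + host + ", job=" + jobName);
	}
}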
Example 1
Source File: Simplify.java    From Flink-CEPplus with Apache License 2.0
@Override
public void configure(ParameterTool parameterTool) {
	// read the optional "simplify" argument (null when not provided)
	String ordering = parameterTool.get("simplify");

	if (ordering == null) {
		value = Ordering.NONE;
	} else {
		switch (ordering.toLowerCase()) {
			case "directed":
				value = Ordering.DIRECTED;
				break;
			case "undirected":
				value = parameterTool.has("clip_and_flip") ? Ordering.UNDIRECTED_CLIP_AND_FLIP : Ordering.UNDIRECTED;
				break;
			default:
				throw new ProgramParametrizationException(
					"Expected 'directed' or 'undirected' ordering but received '" + ordering + "'");
		}
	}
}
 
Example 2
Source File: S3UtilProgram.java    From flink with Apache License 2.0
private static void numberOfLinesInFilesWithFullAndNamePrefix(ParameterTool params) {
	final String bucket = params.getRequired("bucket");
	final String s3prefix = params.getRequired("s3prefix");
	final String s3filePrefix = params.get("s3filePrefix", "");
	int parallelism = params.getInt("parallelism", 10);

	List<String> files = listByFullPathPrefix(bucket, s3prefix);

	ExecutorService executor = Executors.newFixedThreadPool(parallelism);
	AmazonS3 s3client = AmazonS3ClientBuilder.defaultClient();
	List<CompletableFuture<Integer>> requests =
		submitLineCountingRequestsForFilesAsync(executor, s3client, bucket, files, s3filePrefix);
	int count = waitAndComputeTotalLineCountResult(requests);

	executor.shutdownNow();
	s3client.shutdown();
	System.out.print(count);
}
 
Example 3
Source File: StreamingNoop.java    From flink with Apache License 2.0
public static void main(String[] args) throws Exception {
	ParameterTool params = ParameterTool.fromArgs(args);

	// define the dataflow
	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(2);
	env.setRestartStrategy(RestartStrategies.fixedDelayRestart(10, 1000));
	env.readFileStream("input/", 60000, FileMonitoringFunction.WatchType.ONLY_NEW_FILES)
		.addSink(new DiscardingSink<String>());

	// generate a job graph
	final JobGraph jobGraph = env.getStreamGraph().getJobGraph();
	File jobGraphFile = new File(params.get("output", "job.graph"));
	try (FileOutputStream output = new FileOutputStream(jobGraphFile);
		ObjectOutputStream obOutput = new ObjectOutputStream(output)){
		obOutput.writeObject(jobGraph);
	}
}
 
Example 4
Source File: S3UtilProgram.java    From flink with Apache License 2.0
private static void downloadByFullPathAndFileNamePrefix(ParameterTool params) {
	final String bucket = params.getRequired("bucket");
	final String s3prefix = params.getRequired("s3prefix");
	final String localFolder = params.getRequired("localFolder");
	final String s3filePrefix = params.get("s3filePrefix", "");
	TransferManager tx = TransferManagerBuilder.defaultTransferManager();
	Predicate<String> keyPredicate = getKeyFilterByFileNamePrefix(s3filePrefix);
	KeyFilter keyFilter = s3filePrefix.isEmpty() ? KeyFilter.INCLUDE_ALL :
		objectSummary -> keyPredicate.test(objectSummary.getKey());
	try {
		tx.downloadDirectory(bucket, s3prefix, new File(localFolder), keyFilter).waitForCompletion();
	} catch (InterruptedException e) {
		System.out.println("Transfer interrupted");
	} finally {
		tx.shutdownNow();
	}
}
 
Example 5
Source File: S3UtilProgram.java    From Flink-CEPplus with Apache License 2.0
private static void downloadByFullPathAndFileNamePrefix(ParameterTool params) {
	final String bucket = params.getRequired("bucket");
	final String s3prefix = params.getRequired("s3prefix");
	final String localFolder = params.getRequired("localFolder");
	final String s3filePrefix = params.get("s3filePrefix", "");
	TransferManager tx = TransferManagerBuilder.defaultTransferManager();
	Predicate<String> keyPredicate = getKeyFilterByFileNamePrefix(s3filePrefix);
	KeyFilter keyFilter = s3filePrefix.isEmpty() ? KeyFilter.INCLUDE_ALL :
		objectSummary -> keyPredicate.test(objectSummary.getKey());
	try {
		tx.downloadDirectory(bucket, s3prefix, new File(localFolder), keyFilter).waitForCompletion();
	} catch (InterruptedException e) {
		System.out.println("Transfer interrupted");
	} finally {
		tx.shutdownNow();
	}
}
 
Example 6
Source File: DataStreamAllroundTestJobFactory.java    From flink with Apache License 2.0
private static void setupCheckpointing(final StreamExecutionEnvironment env, final ParameterTool pt) {
	String semantics = pt.get(TEST_SEMANTICS.key(), TEST_SEMANTICS.defaultValue());
	long checkpointInterval = pt.getLong(ENVIRONMENT_CHECKPOINT_INTERVAL.key(), ENVIRONMENT_CHECKPOINT_INTERVAL.defaultValue());
	CheckpointingMode checkpointingMode = semantics.equalsIgnoreCase("exactly-once")
		? CheckpointingMode.EXACTLY_ONCE
		: CheckpointingMode.AT_LEAST_ONCE;

	env.enableCheckpointing(checkpointInterval, checkpointingMode);

	boolean enableExternalizedCheckpoints = pt.getBoolean(
		ENVIRONMENT_EXTERNALIZE_CHECKPOINT.key(),
		ENVIRONMENT_EXTERNALIZE_CHECKPOINT.defaultValue());

	if (enableExternalizedCheckpoints) {
		String cleanupModeConfig = pt.get(
			ENVIRONMENT_EXTERNALIZE_CHECKPOINT_CLEANUP.key(),
			ENVIRONMENT_EXTERNALIZE_CHECKPOINT_CLEANUP.defaultValue());

		CheckpointConfig.ExternalizedCheckpointCleanup cleanupMode;
		switch (cleanupModeConfig) {
			case "retain":
				cleanupMode = CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION;
				break;
			case "delete":
				cleanupMode = CheckpointConfig.ExternalizedCheckpointCleanup.DELETE_ON_CANCELLATION;
				break;
			default:
				throw new IllegalArgumentException("Unknown clean up mode for externalized checkpoints: " + cleanupModeConfig);
		}
		env.getCheckpointConfig().enableExternalizedCheckpoints(cleanupMode);

		final int tolerableDeclinedCheckpointNumber = pt.getInt(
			ENVIRONMENT_TOLERABLE_DECLINED_CHECKPOINT_NUMBER.key(),
			ENVIRONMENT_TOLERABLE_DECLINED_CHECKPOINT_NUMBER.defaultValue());
		env.getCheckpointConfig().setTolerableCheckpointFailureNumber(tolerableDeclinedCheckpointNumber);
	}
}
 
Example 7
Source File: ClickEventCount.java    From flink with Apache License 2.0
public static void main(String[] args) throws Exception {
	final ParameterTool params = ParameterTool.fromArgs(args);

	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

	configureEnvironment(params, env);

	String inputTopic = params.get("input-topic", "input");
	String outputTopic = params.get("output-topic", "output");
	String brokers = params.get("bootstrap.servers", "localhost:9092");
	Properties kafkaProps = new Properties();
	kafkaProps.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
	kafkaProps.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "click-event-count");

	env.addSource(new FlinkKafkaConsumer<>(inputTopic, new ClickEventDeserializationSchema(), kafkaProps))
		.name("ClickEvent Source")
		.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<ClickEvent>(Time.of(200, TimeUnit.MILLISECONDS)) {
			@Override
			public long extractTimestamp(final ClickEvent element) {
				return element.getTimestamp().getTime();
			}
		})
		.keyBy(ClickEvent::getPage)
		.timeWindow(WINDOW_SIZE)
		.aggregate(new CountingAggregator(),
			new ClickEventStatisticsCollector())
		.name("ClickEvent Counter")
		.addSink(new FlinkKafkaProducer<>(
			outputTopic,
			new ClickEventStatisticsSerializationSchema(outputTopic),
			kafkaProps,
			FlinkKafkaProducer.Semantic.AT_LEAST_ONCE))
		.name("ClickEventStatistics Sink");

	env.execute("Click Event Count");
}
 
Example 8
Source File: PopularPlacesSolution.java    From flink-training-exercises with Apache License 2.0
public static void main(String[] args) throws Exception {

		ParameterTool params = ParameterTool.fromArgs(args);
		final String input = params.get("input", ExerciseBase.pathToRideData);
		final int popThreshold = params.getInt("threshold", 20);

		final int maxEventDelay = 60;       // events are out of order by max 60 seconds
		final int servingSpeedFactor = 600; // events of 10 minutes are served in 1 second

		// set up streaming execution environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
		env.setParallelism(ExerciseBase.parallelism);

		// start the data generator
		DataStream<TaxiRide> rides = env.addSource(rideSourceOrTest(new TaxiRideSource(input, maxEventDelay, servingSpeedFactor)));

		// find popular places
		DataStream<Tuple5<Float, Float, Long, Boolean, Integer>> popularSpots = rides
				// remove all rides which are not within NYC
				.filter(new NYCFilter())
				// match ride to grid cell and event type (start or end)
				.map(new GridCellMatcher())
				// partition by cell id and event type
				.<KeyedStream<Tuple2<Integer, Boolean>, Tuple2<Integer, Boolean>>>keyBy(0, 1)
				// build sliding window
				.timeWindow(Time.minutes(15), Time.minutes(5))
				// count ride events in window
				.apply(new RideCounter())
				// filter by popularity threshold
				.filter((Tuple4<Integer, Long, Boolean, Integer> count) -> (count.f3 >= popThreshold))
				// map grid cell to coordinates
				.map(new GridToCoordinates());

		// print result on stdout
		printOrTest(popularSpots);

		// execute the transformation pipeline
		env.execute("Popular Places");
	}
 
Example 9
Source File: Utils.java    From flink-tutorials with Apache License 2.0
public static boolean isSensitive(String key, ParameterTool params) {
	Preconditions.checkNotNull(key, "key is null");
	// SENSITIVE_KEYS_KEY holds a comma-separated list of key fragments to treat as sensitive
	final String value = params.get(SENSITIVE_KEYS_KEY);
	if (value == null) { return false; }
	String keyInLower = key.toLowerCase();
	String[] sensitiveKeys = value.split(",");

	for (int i = 0; i < sensitiveKeys.length; ++i) {
		String hideKey = sensitiveKeys[i];
		if (keyInLower.length() >= hideKey.length() && keyInLower.contains(hideKey)) {
			return true;
		}
	}
	return false;
}
 
Example 10
Source File: StatefulStreamJobUpgradeTestProgram.java    From flink with Apache License 2.0
private static boolean isOriginalJobVariant(final ParameterTool pt) {
	switch (pt.get(TEST_JOB_VARIANT.key())) {
		case TEST_JOB_VARIANT_ORIGINAL:
			return true;
		case TEST_JOB_VARIANT_UPGRADED:
			return false;
		default:
			throw new IllegalArgumentException(String.format("'--test.job.variant' can be either '%s' or '%s'",
				TEST_JOB_VARIANT_ORIGINAL, TEST_JOB_VARIANT_UPGRADED));
	}
}
 
Example 11
Source File: LongRidesCEPExercise.java    From flink-training-exercises with Apache License 2.0
public static void main(String[] args) throws Exception {

		ParameterTool params = ParameterTool.fromArgs(args);
		final String input = params.get("input", ExerciseBase.pathToRideData);

		final int servingSpeedFactor = 600; // events of 10 minutes are served in 1 second

		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
		env.setParallelism(ExerciseBase.parallelism);

		// CheckpointedTaxiRideSource delivers events in order
		DataStream<TaxiRide> rides = env.addSource(rideSourceOrTest(new CheckpointedTaxiRideSource(input, servingSpeedFactor)));

		DataStream<TaxiRide> keyedRides = rides
			.keyBy("rideId");

		// A complete taxi ride has a START event followed by an END event
		// This pattern is incomplete ...
		Pattern<TaxiRide, TaxiRide> completedRides = Pattern.<TaxiRide>begin("start");

		// We want to find rides that have NOT been completed within 120 minutes.
		// This pattern matches rides that ARE completed.
		// Below we will ignore rides that match this pattern, and emit those that timeout.
		PatternStream<TaxiRide> patternStream = CEP.pattern(keyedRides, completedRides.within(Time.minutes(120)));

		OutputTag<TaxiRide> timedout = new OutputTag<TaxiRide>("timedout"){};

		SingleOutputStreamOperator<TaxiRide> longRides = patternStream.flatSelect(
				timedout,
				new TaxiRideTimedOut<TaxiRide>(),
				new FlatSelectNothing<TaxiRide>()
		);

		printOrTest(longRides.getSideOutput(timedout));

		throw new MissingSolutionException();

//		env.execute("Long Taxi Rides (CEP)");
	}
 
Example 12
Source File: AbstractHandler.java    From pravega-samples with Apache License 2.0
public AbstractHandler(String ... args) {
    // read stream configuration from the command line, falling back to defaults
    ParameterTool params = ParameterTool.fromArgs(args);
    this.scope = params.get("scope", DEFAULT_SCOPE);
    this.stream = params.get("stream", DEFAULT_STREAM);
    this.controllerUri = params.get("controllerUri", DEFAULT_CONTROLLER_URI);
    this.create = params.getBoolean("create-stream", CREATE_STREAM);
    this.limit = params.getInt("threshold", DEFAULT_POPULAR_DEST_THRESHOLD);
}
 
Example 13
Source File: TaxiQueryExercise.java    From flink-training-exercises with Apache License 2.0
public static void main(String[] args) throws Exception {

		ParameterTool params = ParameterTool.fromArgs(args);
		final String input = params.get("input", ExerciseBase.pathToRideData);

		final int maxEventDelay = 60;       	// events are out of order by at most 60 seconds
		final int servingSpeedFactor = 1800; 	// 30 minutes worth of events are served every second

		// set up streaming execution environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
		env.setParallelism(ExerciseBase.parallelism);

		// setup a stream of taxi rides
		DataStream<TaxiRide> rides = env.addSource(rideSourceOrTest(new TaxiRideSource(input, maxEventDelay, servingSpeedFactor)));

		// add a socket source for the query stream
		BroadcastStream<String> queryStream = env
				.addSource(stringSourceOrTest(new SocketTextStreamFunction("localhost", 9999, "\n", -1)))
				.broadcast(queryDescriptor);

		// connect the two streams and process queries
		DataStream<Tuple2<String, String>> results = rides
				.keyBy((TaxiRide ride) -> ride.taxiId)
				.connect(queryStream)
				.process(new QueryProcessor());

		printOrTest(results);

		env.execute("Taxi Query");
	}
 
Example 14
Source File: ExpiringStateSolution.java    From flink-training-exercises with Apache License 2.0
public static void main(String[] args) throws Exception {

		ParameterTool params = ParameterTool.fromArgs(args);
		final String ridesFile = params.get("rides", ExerciseBase.pathToRideData);
		final String faresFile = params.get("fares", ExerciseBase.pathToFareData);

		final int maxEventDelay = 60;           // events are out of order by max 60 seconds
		final int servingSpeedFactor = 600; 	// 10 minutes worth of events are served every second

		// set up streaming execution environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
		env.setParallelism(ExerciseBase.parallelism);

		DataStream<TaxiRide> rides = env
				.addSource(rideSourceOrTest(new TaxiRideSource(ridesFile, maxEventDelay, servingSpeedFactor)))
				.filter((TaxiRide ride) -> (ride.isStart && (ride.rideId % 1000 != 0)))
				.keyBy(ride -> ride.rideId);

		DataStream<TaxiFare> fares = env
				.addSource(fareSourceOrTest(new TaxiFareSource(faresFile, maxEventDelay, servingSpeedFactor)))
				.keyBy(fare -> fare.rideId);

		SingleOutputStreamOperator processed = rides
				.connect(fares)
				.process(new EnrichmentFunction());

		printOrTest(processed.getSideOutput(unmatchedFares));

		env.execute("ExpiringStateSolution (java)");
	}
 
Example 15
Source File: QsStateClient.java    From Flink-CEPplus with Apache License 2.0
public static void main(final String[] args) throws Exception {

		ParameterTool parameters = ParameterTool.fromArgs(args);

		// setup values
		String jobId = parameters.getRequired("job-id");
		String host = parameters.get("host", "localhost");
		int port = parameters.getInt("port", 9069);
		int numIterations = parameters.getInt("iterations", 1500);

		QueryableStateClient client = new QueryableStateClient(host, port);
		client.setExecutionConfig(new ExecutionConfig());

		MapStateDescriptor<EmailId, EmailInformation> stateDescriptor =
				new MapStateDescriptor<>(
						QsConstants.STATE_NAME,
						TypeInformation.of(new TypeHint<EmailId>() {

						}),
						TypeInformation.of(new TypeHint<EmailInformation>() {

						})
				);

		// wait for state to exist
		for (int i = 0; i < BOOTSTRAP_RETRIES; i++) { // ~120s
			try {
				getMapState(jobId, client, stateDescriptor);
				break;
			} catch (ExecutionException e) {
				if (e.getCause() instanceof UnknownKeyOrNamespaceException) {
					System.err.println("State does not exist yet; sleeping 500ms");
					Thread.sleep(500L);
				} else {
					throw e;
				}
			}

			if (i == (BOOTSTRAP_RETRIES - 1)) {
				throw new RuntimeException("Timeout: state doesn't exist after 120s");
			}
		}

		// query state
		for (int iterations = 0; iterations < numIterations; iterations++) {

			MapState<EmailId, EmailInformation> mapState =
				getMapState(jobId, client, stateDescriptor);

			int counter = 0;
			for (Map.Entry<EmailId, EmailInformation> entry: mapState.entries()) {
				// this is to force deserialization
				entry.getKey();
				entry.getValue();
				counter++;
			}
			System.out.println("MapState has " + counter + " entries"); // we look for it in the test

			Thread.sleep(100L);
		}
	}
 
Example 16
Source File: NearestTaxiWithCleanupSolution.java    From flink-training-exercises with Apache License 2.0
public static void main(String[] args) throws Exception {

		ParameterTool params = ParameterTool.fromArgs(args);
		final String input = params.get("input", ExerciseBase.pathToRideData);

		final int maxEventDelay = 60;       	// events are out of order by at most 60 seconds
		final int servingSpeedFactor = 600; 	// 10 minutes worth of events are served every second

		// set up streaming execution environment
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

		DataStream<TaxiRide> rides = env.addSource(rideSourceOrTest(new TaxiRideSource(input, maxEventDelay, servingSpeedFactor)));

		// add a socket source
		BroadcastStream<Query> queryStream = env.socketTextStream("localhost", 9999)
				.assignTimestampsAndWatermarks(new QueryStreamAssigner())
				.map(new MapFunction<String, Query>() {
					@Override
					public Query map(String msg) throws Exception {
						String[] parts = msg.split(",\\s*");
						return new Query(
								Float.valueOf(parts[0]),	// longitude
								Float.valueOf(parts[1]));	// latitude
					}
				})
				.broadcast(queryDescriptor);

		DataStream<Tuple3<Long, Long, Float>> reports = rides
				.keyBy((TaxiRide ride) -> ride.taxiId)
				.connect(queryStream)
				.process(new QueryFunction());

		DataStream<Tuple3<Long, Long, Float>> nearest = reports
				// key by the queryId
				.keyBy(new KeySelector<Tuple3<Long, Long, Float>, Long>() {
					@Override
					public Long getKey(Tuple3<Long, Long, Float> value) throws Exception {
						return value.f0;
					}
				})
				.process(new ClosestTaxi());

		printOrTest(nearest);

		env.execute("Nearest Available Taxi");
	}
 
Example 17
Source File: DataStreamAllroundTestJobFactory.java    From Flink-CEPplus with Apache License 2.0
public static void setupEnvironment(StreamExecutionEnvironment env, ParameterTool pt) throws Exception {

		// set checkpointing semantics
		String semantics = pt.get(TEST_SEMANTICS.key(), TEST_SEMANTICS.defaultValue());
		long checkpointInterval = pt.getLong(ENVIRONMENT_CHECKPOINT_INTERVAL.key(), ENVIRONMENT_CHECKPOINT_INTERVAL.defaultValue());
		CheckpointingMode checkpointingMode = semantics.equalsIgnoreCase("exactly-once")
			? CheckpointingMode.EXACTLY_ONCE
			: CheckpointingMode.AT_LEAST_ONCE;

		env.enableCheckpointing(checkpointInterval, checkpointingMode);

		// use event time
		env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

		// parallelism
		env.setParallelism(pt.getInt(ENVIRONMENT_PARALLELISM.key(), ENVIRONMENT_PARALLELISM.defaultValue()));
		env.setMaxParallelism(pt.getInt(ENVIRONMENT_MAX_PARALLELISM.key(), ENVIRONMENT_MAX_PARALLELISM.defaultValue()));

		// restart strategy
		String restartStrategyConfig = pt.get(ENVIRONMENT_RESTART_STRATEGY.key());
		if (restartStrategyConfig != null) {
			RestartStrategies.RestartStrategyConfiguration restartStrategy;
			switch (restartStrategyConfig) {
				case "fixed_delay":
					restartStrategy = RestartStrategies.fixedDelayRestart(
						pt.getInt(
							ENVIRONMENT_RESTART_STRATEGY_FIXED_ATTEMPTS.key(),
							ENVIRONMENT_RESTART_STRATEGY_FIXED_ATTEMPTS.defaultValue()),
						pt.getLong(
							ENVIRONMENT_RESTART_STRATEGY_FIXED_DELAY.key(),
							ENVIRONMENT_RESTART_STRATEGY_FIXED_DELAY.defaultValue()));
					break;
				case "no_restart":
					restartStrategy = RestartStrategies.noRestart();
					break;
				default:
					throw new IllegalArgumentException("Unkown restart strategy: " + restartStrategyConfig);
			}
			env.setRestartStrategy(restartStrategy);
		}

		// state backend
		final String stateBackend = pt.get(
			STATE_BACKEND.key(),
			STATE_BACKEND.defaultValue());

		final String checkpointDir = pt.getRequired(STATE_BACKEND_CHECKPOINT_DIR.key());

		if ("file".equalsIgnoreCase(stateBackend)) {
			boolean asyncCheckpoints = pt.getBoolean(
				STATE_BACKEND_FILE_ASYNC.key(),
				STATE_BACKEND_FILE_ASYNC.defaultValue());

			env.setStateBackend((StateBackend) new FsStateBackend(checkpointDir, asyncCheckpoints));
		} else if ("rocks".equalsIgnoreCase(stateBackend)) {
			boolean incrementalCheckpoints = pt.getBoolean(
				STATE_BACKEND_ROCKS_INCREMENTAL.key(),
				STATE_BACKEND_ROCKS_INCREMENTAL.defaultValue());

			env.setStateBackend((StateBackend) new RocksDBStateBackend(checkpointDir, incrementalCheckpoints));
		} else {
			throw new IllegalArgumentException("Unknown backend requested: " + stateBackend);
		}

		boolean enableExternalizedCheckpoints = pt.getBoolean(
			ENVIRONMENT_EXTERNALIZE_CHECKPOINT.key(),
			ENVIRONMENT_EXTERNALIZE_CHECKPOINT.defaultValue());

		if (enableExternalizedCheckpoints) {
			String cleanupModeConfig = pt.get(
				ENVIRONMENT_EXTERNALIZE_CHECKPOINT_CLEANUP.key(),
				ENVIRONMENT_EXTERNALIZE_CHECKPOINT_CLEANUP.defaultValue());

			CheckpointConfig.ExternalizedCheckpointCleanup cleanupMode;
			switch (cleanupModeConfig) {
				case "retain":
					cleanupMode = CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION;
					break;
				case "delete":
					cleanupMode = CheckpointConfig.ExternalizedCheckpointCleanup.DELETE_ON_CANCELLATION;
					break;
				default:
					throw new IllegalArgumentException("Unknown clean up mode for externalized checkpoints: " + cleanupModeConfig);
			}

			env.getCheckpointConfig().enableExternalizedCheckpoints(cleanupMode);
		}

		// make parameters available in the web interface
		env.getConfig().setGlobalJobParameters(pt);
	}
 
Example 18
Source File: QsStateClient.java    From flink with Apache License 2.0
public static void main(final String[] args) throws Exception {

		ParameterTool parameters = ParameterTool.fromArgs(args);

		// setup values
		String jobId = parameters.getRequired("job-id");
		String host = parameters.get("host", "localhost");
		int port = parameters.getInt("port", 9069);
		int numIterations = parameters.getInt("iterations", 1500);

		QueryableStateClient client = new QueryableStateClient(host, port);
		client.setExecutionConfig(new ExecutionConfig());

		MapStateDescriptor<EmailId, EmailInformation> stateDescriptor =
				new MapStateDescriptor<>(
						QsConstants.STATE_NAME,
						TypeInformation.of(new TypeHint<EmailId>() {

						}),
						TypeInformation.of(new TypeHint<EmailInformation>() {

						})
				);

		// wait for state to exist
		for (int i = 0; i < BOOTSTRAP_RETRIES; i++) { // ~120s
			try {
				getMapState(jobId, client, stateDescriptor);
				break;
			} catch (ExecutionException e) {
				if (e.getCause() instanceof UnknownKeyOrNamespaceException) {
					System.err.println("State does not exist yet; sleeping 500ms");
					Thread.sleep(500L);
				} else {
					throw e;
				}
			}

			if (i == (BOOTSTRAP_RETRIES - 1)) {
				throw new RuntimeException("Timeout: state doesn't exist after 120s");
			}
		}

		// query state
		for (int iterations = 0; iterations < numIterations; iterations++) {

			MapState<EmailId, EmailInformation> mapState =
				getMapState(jobId, client, stateDescriptor);

			int counter = 0;
			for (Map.Entry<EmailId, EmailInformation> entry: mapState.entries()) {
				// this is to force deserialization
				entry.getKey();
				entry.getValue();
				counter++;
			}
			System.out.println("MapState has " + counter + " entries"); // we look for it in the test

			Thread.sleep(100L);
		}
	}
 
Example 19
Source File: KafkaEventsGeneratorJob.java    From flink with Apache License 2.0
public static void main(String[] args) throws Exception {

		final ParameterTool params = ParameterTool.fromArgs(args);

		double errorRate = params.getDouble("error-rate", 0.0);
		int sleep = params.getInt("sleep", 1);

		String kafkaTopic = params.get("kafka-topic");
		String brokers = params.get("brokers", "localhost:9092");

		System.out.printf("Generating events to Kafka with standalone source with error rate %f and sleep delay %s millis\n", errorRate, sleep);
		System.out.println();

		final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

		env
			.addSource(new EventsGeneratorSource(errorRate, sleep))
			.addSink(new FlinkKafkaProducer<>(brokers, kafkaTopic, new EventDeSerializer()));

		// trigger program execution
		env.execute("State machine example Kafka events generator job");
	}
 
Example 20
Source File: StreamingETL.java    From flink-streaming-etl with Apache License 2.0
public static void main(String[] args) throws Exception {
	// parse arguments
	ParameterTool params = ParameterTool.fromPropertiesFile(args[0]);

	// create streaming environment
	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

	// enable event time processing
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

	// enable fault-tolerance
	env.enableCheckpointing(1000);

	// enable restarts
	env.setRestartStrategy(RestartStrategies.fixedDelayRestart(50, 500L));

	env.setStateBackend(new FsStateBackend("file:///home/robert/flink-workdir/flink-streaming-etl/state-backend"));

	// run each operator separately
	env.disableOperatorChaining();

	// get data from Kafka
	Properties kParams = params.getProperties();
	kParams.setProperty("group.id", UUID.randomUUID().toString());
	DataStream<ObjectNode> inputStream = env.addSource(new FlinkKafkaConsumer09<>(params.getRequired("topic"), new JSONDeserializationSchema(), kParams)).name("Kafka 0.9 Source")
		.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<ObjectNode>(Time.minutes(1L)) {
			@Override
			public long extractTimestamp(ObjectNode jsonNodes) {
				return jsonNodes.get("timestamp_ms").asLong();
			}
		}).name("Timestamp extractor");

	// filter out records without lang field
	DataStream<ObjectNode> tweetsWithLang = inputStream.filter(jsonNode -> jsonNode.has("user") && jsonNode.get("user").has("lang")).name("Filter records without 'lang' field");

	// select only lang = "en" tweets
	DataStream<ObjectNode> englishTweets = tweetsWithLang.filter(jsonNode -> jsonNode.get("user").get("lang").asText().equals("en")).name("Select 'lang'=en tweets");

	// write to file system
	RollingSink<ObjectNode> rollingSink = new RollingSink<>(params.get("sinkPath", "/home/robert/flink-workdir/flink-streaming-etl/rolling-sink"));
	rollingSink.setBucketer(new DateTimeBucketer("yyyy-MM-dd-HH-mm")); // do a bucket for each minute
	englishTweets.addSink(rollingSink).name("Rolling FileSystem Sink");

	// build aggregates (count per language) using window (10 seconds tumbling):
	DataStream<Tuple3<Long, String, Long>> languageCounts = tweetsWithLang.keyBy(jsonNode -> jsonNode.get("user").get("lang").asText())
		.timeWindow(Time.seconds(10))
		.apply(new Tuple3<>(0L, "", 0L), new JsonFoldCounter(), new CountEmitter()).name("Count per Language (10 seconds tumbling)");

	// write window aggregate to ElasticSearch
	List<InetSocketAddress> transportNodes = ImmutableList.of(new InetSocketAddress(InetAddress.getByName("localhost"), 9300));
	ElasticsearchSink<Tuple3<Long, String, Long>> elasticsearchSink = new ElasticsearchSink<>(params.toMap(), transportNodes, new ESRequest());

	languageCounts.addSink(elasticsearchSink).name("ElasticSearch2 Sink");

	// word-count on the tweet stream
	DataStream<Tuple2<Date, List<Tuple2<String, Long>>>> topWordCount = tweetsWithLang
		// get text from tweets
		.map(tweet -> tweet.get("text").asText()).name("Get text from Tweets")
		// split text into (word, 1) tuples
		.flatMap(new FlatMapFunction<String, Tuple2<String, Long>>() {
			@Override
			public void flatMap(String s, Collector<Tuple2<String, Long>> collector) throws Exception {
				String[] splits = s.split(" ");
				for (String sp : splits) {
					collector.collect(new Tuple2<>(sp, 1L));
				}
			}
		}).name("Tokenize words")
		// group by word
		.keyBy(0)
		// build 1 min windows, compute every 10 seconds --> count word frequency
		.timeWindow(Time.minutes(1L), Time.seconds(10L)).apply(new WordCountingWindow()).name("Count word frequency (1 min, 10 sec sliding window)")
		// build top n every 10 seconds
		.timeWindowAll(Time.seconds(10L)).apply(new TopNWords(10)).name("TopN Window (10s)");

	// write top Ns to Kafka topic
	topWordCount.addSink(new FlinkKafkaProducer09<>(params.getRequired("wc-topic"), new ListSerSchema(), params.getProperties())).name("Write topN to Kafka");

	env.execute("Streaming ETL");

}