Java Code Examples for org.apache.flink.streaming.api.datastream.DataStreamUtils

The following examples show how to use org.apache.flink.streaming.api.datastream.DataStreamUtils. These examples are extracted from open source projects; you can go to the original project or source file by following the links above each example, and check out related API usage in the sidebar.
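Most of the examples below exercise two methods on this class: DataStreamUtils.collect, which hands a stream's results back to the client as a local iterator, and DataStreamUtils.reinterpretAsKeyedStream, which treats an already-partitioned stream as a KeyedStream without introducing another shuffle. The snippet below is a minimal sketch of both calls, not code from any of the projects listed; the pipeline, the element values, and the even/odd key are illustrative assumptions.

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);

	DataStream<Long> numbers = env.generateSequence(1, 5);

	// Declare that 'numbers' is already partitioned the way the key selector implies
	// (trivially true here because the parallelism is 1), so keyed operations can be
	// applied without paying for another keyBy() shuffle.
	DataStream<Long> runningSums = DataStreamUtils.reinterpretAsKeyedStream(
			numbers,
			(KeySelector<Long, Long>) value -> value % 2,
			TypeInformation.of(Long.class))
		.reduce((a, b) -> a + b);

	// collect() registers a collecting sink, triggers execution, and returns a
	// client-side iterator over the results.
	Iterator<Long> it = DataStreamUtils.collect(runningSums);
	while (it.hasNext()) {
		System.out.println(it.next());
	}

Note that reinterpretAsKeyedStream is only safe when the stream really is partitioned exactly as the key selector implies; otherwise keyed state and timers can end up in the wrong subtasks. The integration tests further down show the intended use cases: partition files and Kafka partitions that were written by a keyBy.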
Example 1
Source Project: Flink-CEPplus   Source File: CollectITCase.java    License: Apache License 2.0
@Test
public void testCollect() throws Exception {
	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);

	final long n = 10;
	DataStream<Long> stream = env.generateSequence(1, n);

	long i = 1;
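	// DataStreamUtils.collect() starts the job and returns its results to the client as a local iterator.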
	for (Iterator<Long> it = DataStreamUtils.collect(stream); it.hasNext(); ) {
		long x = it.next();
		assertEquals("received wrong element", i, x);
		i++;
	}

	assertEquals("received wrong number of elements", n + 1, i);
}
 
Example 2
Source Project: flink   Source File: CollectITCase.java    License: Apache License 2.0
@Test
public void testCollect() throws Exception {
	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);

	final long n = 10;
	DataStream<Long> stream = env.generateSequence(1, n);

	long i = 1;
	for (Iterator<Long> it = DataStreamUtils.collect(stream); it.hasNext(); ) {
		long x = it.next();
		assertEquals("received wrong element", i, x);
		i++;
	}

	assertEquals("received wrong number of elements", n + 1, i);
}
 
Example 3
Source Project: flink   Source File: RegionFailoverITCase.java    License: Apache License 2.0
private JobGraph createJobGraph() {

		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(NUM_OF_REGIONS);
		env.setMaxParallelism(MAX_PARALLELISM);
		env.enableCheckpointing(200, CheckpointingMode.EXACTLY_ONCE);
		env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
		env.disableOperatorChaining();
		env.getConfig().disableSysoutLogging();

		// Use DataStreamUtils#reinterpretAsKeyedStream to avoid merging regions, so that this stream graph consists of 'NUM_OF_REGIONS' individual regions.
		DataStreamUtils.reinterpretAsKeyedStream(
			env.addSource(new StringGeneratingSourceFunction(NUM_ELEMENTS, NUM_ELEMENTS / NUM_OF_RESTARTS))
				.name(MULTI_REGION_SOURCE_NAME)
				.setParallelism(NUM_OF_REGIONS),
			(KeySelector<Tuple2<Integer, Integer>, Integer>) value -> value.f0,
			TypeInformation.of(Integer.class))
			.map(new FailingMapperFunction(NUM_OF_RESTARTS))
			.setParallelism(NUM_OF_REGIONS)
			.addSink(new ValidatingSink())
			.setParallelism(NUM_OF_REGIONS);

		// another stream graph, totally disconnected from the one above.
		env.addSource(new StringGeneratingSourceFunction(NUM_ELEMENTS, NUM_ELEMENTS / NUM_OF_RESTARTS))
			.name(SINGLE_REGION_SOURCE_NAME).setParallelism(1)
			.map((MapFunction<Tuple2<Integer, Integer>, Object>) value -> value).setParallelism(1);

		return env.getStreamGraph().getJobGraph();
	}
 
Example 4
Source Project: stateful-functions   Source File: FlinkUniverse.java    License: Apache License 2.0
private SingleOutputStreamOperator<Message> functionOperator(
    DataStream<Message> input, Map<EgressIdentifier<?>, OutputTag<Object>> sideOutputs) {

  TypeInformation<Message> typeInfo = input.getType();

  FunctionGroupDispatchFactory operatorFactory = new FunctionGroupDispatchFactory(sideOutputs);

  return DataStreamUtils.reinterpretAsKeyedStream(input, new MessageKeySelector())
      .transform(StatefulFunctionsJobConstants.FUNCTION_OPERATOR_NAME, typeInfo, operatorFactory)
      .uid(StatefulFunctionsJobConstants.FUNCTION_OPERATOR_UID);
}
 
Example 5
Source Project: flink-statefun   Source File: FlinkUniverse.java    License: Apache License 2.0
private SingleOutputStreamOperator<Message> functionOperator(
    DataStream<Message> input, Map<EgressIdentifier<?>, OutputTag<Object>> sideOutputs) {

  TypeInformation<Message> typeInfo = input.getType();

  FunctionGroupDispatchFactory operatorFactory =
      new FunctionGroupDispatchFactory(configuration, sideOutputs);

  return DataStreamUtils.reinterpretAsKeyedStream(input, new MessageKeySelector())
      .transform(StatefulFunctionsJobConstants.FUNCTION_OPERATOR_NAME, typeInfo, operatorFactory)
      .uid(StatefulFunctionsJobConstants.FUNCTION_OPERATOR_UID);
}
 
Example 6
Source Project: incubator-iotdb   Source File: FlinkTsFileStreamSource.java    License: Apache License 2.0
public static void main(String[] args) throws IOException {
	String path = "test.tsfile";
	TsFileUtils.writeTsFile(path);
	new File(path).deleteOnExit();
	String[] fieldNames = {
		QueryConstant.RESERVED_TIME,
		"device_1.sensor_1",
		"device_1.sensor_2",
		"device_1.sensor_3",
		"device_2.sensor_1",
		"device_2.sensor_2",
		"device_2.sensor_3"
	};
	TypeInformation[] typeInformations = new TypeInformation[] {
		Types.LONG,
		Types.LONG,
		Types.LONG,
		Types.LONG,
		Types.LONG,
		Types.LONG,
		Types.LONG
	};
	List<Path> paths = Arrays.stream(fieldNames)
		.filter(s -> !s.equals(QueryConstant.RESERVED_TIME))
		.map(Path::new)
		.collect(Collectors.toList());
	RowTypeInfo rowTypeInfo = new RowTypeInfo(typeInformations, fieldNames);
	QueryExpression queryExpression = QueryExpression.create(paths, null);
	RowRowRecordParser parser = RowRowRecordParser.create(rowTypeInfo, queryExpression.getSelectedSeries());
	TsFileInputFormat<Row> inputFormat = new TsFileInputFormat<>(queryExpression, parser);
	StreamExecutionEnvironment senv = StreamExecutionEnvironment.getExecutionEnvironment();
	inputFormat.setFilePath(path);
	DataStream<Row> source = senv.createInput(inputFormat);
	DataStream<String> rowString = source.map(Row::toString);
	Iterator<String> result = DataStreamUtils.collect(rowString);
	while (result.hasNext()) {
		System.out.println(result.next());
	}
}
 
Example 7
@Test
public void testStreamExecution() throws Exception {
	// read files in a directory
	TsFileInputFormat<Row> inputFormat = prepareInputFormat(tmpDir);
	DataStream<Row> source = senv.createInput(inputFormat);
	Iterator<String> rowStringIterator = DataStreamUtils.collect(source.map(Row::toString));
	String[] result = StreamSupport.stream(
		Spliterators.spliteratorUnknownSize(rowStringIterator, 0),
		false).sorted().toArray(String[]::new);
	String[] expected = {
		"1,1.2,20,null,2.3,11,19",
		"10,null,20,50,25.4,10,21",
		"11,1.4,21,null,null,null,null",
		"12,1.2,20,51,null,null,null",
		"14,7.2,10,11,null,null,null",
		"15,6.2,20,21,null,null,null",
		"16,9.2,30,31,null,null,null",
		"2,null,20,50,25.4,10,21",
		"3,1.4,21,null,null,null,null",
		"4,1.2,20,51,null,null,null",
		"6,7.2,10,11,null,null,null",
		"7,6.2,20,21,null,null,null",
		"8,9.2,30,31,null,null,null",
		"9,1.2,20,null,2.3,11,19"
	};
	assertArrayEquals(expected, result);
}
 
Example 8
@Override
public void run() {

    JMetalLogger.logger.info("Run Flink method in the streaming data source invoked");
    JMetalLogger.logger.info("TOPIC: " + topic);

   // environment.getConfig().setRestartStrategy(RestartStrategies.fixedDelayRestart(1,0));
    //environment.enableCheckpointing(10);

    DataStream<String> data = environment.addSource(
            new FlinkKafkaConsumer010<String>(topic, new SimpleStringSchema(), kafkaParams));

    try {
        if (data != null) {
            Iterator<String> it = DataStreamUtils.collect(data);

            while (it.hasNext()) {
                byte[] bytes = it.next().getBytes();
                DataDeserializer<Counter> dataDeserializer = new DataDeserializer<>();
                Counter counter = dataDeserializer.deserialize(bytes, "avsc/Counter.avsc");
                Integer number = (Integer) counter.get(0);
                observable.setChanged();
                observable.notifyObservers(new ObservedValue<Integer>(number));
            }

        }
    } catch (Exception e) {
        e.printStackTrace();
    }


}
 
Example 9
Source Project: flink   Source File: RegionFailoverITCase.java    License: Apache License 2.0
private JobGraph createJobGraph() {

		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		env.setParallelism(NUM_OF_REGIONS);
		env.setMaxParallelism(MAX_PARALLELISM);
		env.enableCheckpointing(200, CheckpointingMode.EXACTLY_ONCE);
		env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
		env.disableOperatorChaining();

		// Use DataStreamUtils#reinterpretAsKeyedStream to avoid merging regions, so that this stream graph consists of 'NUM_OF_REGIONS' individual regions.
		DataStreamUtils.reinterpretAsKeyedStream(
			env.addSource(new StringGeneratingSourceFunction(NUM_ELEMENTS, NUM_ELEMENTS / NUM_OF_RESTARTS))
				.name(MULTI_REGION_SOURCE_NAME)
				.setParallelism(NUM_OF_REGIONS),
			(KeySelector<Tuple2<Integer, Integer>, Integer>) value -> value.f0,
			TypeInformation.of(Integer.class))
			.map(new FailingMapperFunction(NUM_OF_RESTARTS))
			.setParallelism(NUM_OF_REGIONS)
			.addSink(new ValidatingSink())
			.setParallelism(NUM_OF_REGIONS);

		// another stream graph, totally disconnected from the one above.
		env.addSource(new StringGeneratingSourceFunction(NUM_ELEMENTS, NUM_ELEMENTS / NUM_OF_RESTARTS))
			.name(SINGLE_REGION_SOURCE_NAME).setParallelism(1)
			.map((MapFunction<Tuple2<Integer, Integer>, Object>) value -> value).setParallelism(1);

		return env.getStreamGraph().getJobGraph();
	}
 
Example 10
/**
 * This test checks that reinterpreting a data stream as a keyed stream works as expected. The test consists of
 * two jobs. The first job materializes a keyBy into files, one file per partition. The second job opens the
 * files created by the first job as sources (doing the correct assignment of files to partitions) and
 * reinterprets the sources as keyed, because we know they have been partitioned by a keyBy in the first job.
 */
@Test
public void testReinterpretAsKeyedStream() throws Exception {

	final int maxParallelism = 8;
	final int numEventsPerInstance = 100;
	final int parallelism = 3;
	final int numTotalEvents = numEventsPerInstance * parallelism;
	final int numUniqueKeys = 100;

	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
	env.setMaxParallelism(maxParallelism);
	env.setParallelism(parallelism);
	env.enableCheckpointing(100);
	env.setRestartStrategy(RestartStrategies.fixedDelayRestart(1, 0L));

	final List<File> partitionFiles = new ArrayList<>(parallelism);
	for (int i = 0; i < parallelism; ++i) {
		File partitionFile = temporaryFolder.newFile();
		partitionFiles.add(i, partitionFile);
	}

	env.addSource(new RandomTupleSource(numEventsPerInstance, numUniqueKeys))
		.keyBy(0)
		.addSink(new ToPartitionFileSink(partitionFiles));

	env.execute();

	DataStreamUtils.reinterpretAsKeyedStream(
		env.addSource(new FromPartitionFileSource(partitionFiles)),
		(KeySelector<Tuple2<Integer, Integer>, Integer>) value -> value.f0,
		TypeInformation.of(Integer.class))
		.timeWindow(Time.seconds(1)) // test that also timers and aggregated state work as expected
		.reduce((ReduceFunction<Tuple2<Integer, Integer>>) (value1, value2) ->
			new Tuple2<>(value1.f0, value1.f1 + value2.f1))
		.addSink(new ValidatingSink(numTotalEvents)).setParallelism(1);

	env.execute();
}
 
Example 11
Source Project: flink   Source File: DataStreamConversionUtilTest.java    License: Apache License 2.0
@Test
public void testBasicConvert() throws Exception {
	StreamExecutionEnvironment env = MLEnvironmentFactory.getDefault().getStreamExecutionEnvironment();
	DataStream<Row> input = env.fromElements(Row.of("a"));
	Table table1 = DataStreamConversionUtil.toTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, input, new String[]{"word"});
	Assert.assertEquals(
		new TableSchema(new String[]{"word"}, new TypeInformation[]{TypeInformation.of(String.class)}),
		table1.getSchema()
	);
	DataStream<Row> rowDataStream = DataStreamConversionUtil.fromTable(MLEnvironmentFactory.DEFAULT_ML_ENVIRONMENT_ID, table1);
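	// Pull the converted rows back to the client so the Table -> DataStream round trip can be asserted on.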
	Iterator<Row> result = DataStreamUtils.collect(rowDataStream);
	Assert.assertEquals(Row.of("a"), result.next());
	Assert.assertFalse(result.hasNext());
}
 
Example 12
Source Project: yauaa   Source File: TestUserAgentAnalysisMapperClass.java    License: Apache License 2.0
@Test
public void testClassDefinitionDataStream() throws Exception {
    StreamExecutionEnvironment environment = LocalStreamEnvironment.getExecutionEnvironment();

    DataStream<TestRecord> resultDataStream = environment
        .fromElements(
            "Mozilla/5.0 (X11; Linux x86_64) " +
                "AppleWebKit/537.36 (KHTML, like Gecko) " +
                "Chrome/48.0.2564.82 Safari/537.36",

            "Mozilla/5.0 (Linux; Android 7.0; Nexus 6 Build/NBD90Z) " +
                "AppleWebKit/537.36 (KHTML, like Gecko) " +
                "Chrome/53.0.2785.124 Mobile Safari/537.36"
        )

        .map((MapFunction<String, TestRecord>) TestRecord::new)

        .map(new MyUserAgentAnalysisMapper());

    List<TestRecord> result = new ArrayList<>(5);
    DataStreamUtils
        .collect(resultDataStream)
        .forEachRemaining(result::add);

    assertEquals(2, result.size());

    assertThat(result, hasItems(
        new TestRecord(
            "Mozilla/5.0 (X11; Linux x86_64) " +
                "AppleWebKit/537.36 (KHTML, like Gecko) " +
                "Chrome/48.0.2564.82 Safari/537.36",
            "Desktop",
            "Chrome 48.0.2564.82",
            null),

        new TestRecord(
            "Mozilla/5.0 (Linux; Android 7.0; Nexus 6 Build/NBD90Z) " +
                "AppleWebKit/537.36 (KHTML, like Gecko) " +
                "Chrome/53.0.2785.124 Mobile Safari/537.36",
            "Phone",
            "Chrome 53.0.2785.124",
            null)
    ));
}
 
Example 13
Source Project: flink   Source File: FlinkKafkaShuffle.java    License: Apache License 2.0
/**
 * The read side of {@link FlinkKafkaShuffle#persistentKeyBy}.
 *
 * <p>Each consumer task should read the Kafka partitions that correspond to the key group indices it is assigned.
 * The number of Kafka partitions is the maximum parallelism of the consumer.
 * This version only supports numberOfPartitions = consumerParallelism.
 * When {@link TimeCharacteristic#EventTime} is used, a consumer task is responsible for emitting
 * watermarks. Watermarks are read from the corresponding Kafka partitions. Notice that a consumer task only starts
 * to emit a watermark after receiving at least one watermark from each producer task, to make sure watermarks
 * are monotonically increasing. Hence a consumer task needs to know `producerParallelism` as well.
 *
 * <p>Attention: make sure kafkaProperties include
 * {@link FlinkKafkaShuffle#PRODUCER_PARALLELISM} and {@link FlinkKafkaShuffle#PARTITION_NUMBER} explicitly.
 * {@link FlinkKafkaShuffle#PRODUCER_PARALLELISM} is the parallelism of the producer.
 * {@link FlinkKafkaShuffle#PARTITION_NUMBER} is the number of partitions.
 * They are not necessarily the same and allowed to be set independently.
 *
 * @see FlinkKafkaShuffle#persistentKeyBy
 * @see FlinkKafkaShuffle#writeKeyBy
 *
 * @param topic 			The topic of Kafka where data is persisted
 * @param env 				Execution environment. readKeyBy's environment can be different from writeKeyBy's
 * @param typeInformation 	Type information of the data persisted in Kafka
 * @param kafkaProperties 	kafka properties for Kafka Consumer
 * @param keySelector 		key selector to retrieve key
 * @param <T> 				Schema type
 * @param <K> 				Key type
 * @return Keyed data stream
 */
public static <T, K> KeyedStream<T, K> readKeyBy(
		String topic,
		StreamExecutionEnvironment env,
		TypeInformation<T> typeInformation,
		Properties kafkaProperties,
		KeySelector<T, K> keySelector) {

	TypeSerializer<T> typeSerializer = typeInformation.createSerializer(env.getConfig());
	TypeInformationSerializationSchema<T> schema =
		new TypeInformationSerializationSchema<>(typeInformation, typeSerializer);

	SourceFunction<T> kafkaConsumer  =
		new FlinkKafkaShuffleConsumer<>(topic, schema, typeSerializer, kafkaProperties);

	// TODO: consider situations where numberOfPartitions != consumerParallelism
	Preconditions.checkArgument(
		kafkaProperties.getProperty(PARTITION_NUMBER) != null,
		"Missing partition number for Kafka Shuffle");
	int numberOfPartitions = PropertiesUtil.getInt(kafkaProperties, PARTITION_NUMBER, Integer.MIN_VALUE);
	DataStream<T> outputDataStream = env.addSource(kafkaConsumer).setParallelism(numberOfPartitions);

	return DataStreamUtils.reinterpretAsKeyedStream(outputDataStream, keySelector);
}
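As the Javadoc above points out, the caller must put FlinkKafkaShuffle#PRODUCER_PARALLELISM and FlinkKafkaShuffle#PARTITION_NUMBER into kafkaProperties before calling readKeyBy. The following is a hedged caller-side sketch, not taken from the Flink sources: the broker address, topic name, parallelism values, and the Tuple2 record type are illustrative assumptions, and it presumes the two property-key constants are accessible on FlinkKafkaShuffle as the Javadoc suggests.

	Properties kafkaProperties = new Properties();
	kafkaProperties.setProperty("bootstrap.servers", "localhost:9092");        // assumed broker address
	kafkaProperties.setProperty(FlinkKafkaShuffle.PRODUCER_PARALLELISM, "4");  // parallelism of the writing job
	kafkaProperties.setProperty(FlinkKafkaShuffle.PARTITION_NUMBER, "8");      // number of Kafka partitions

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(8);  // this version expects consumerParallelism == numberOfPartitions

	KeyedStream<Tuple2<Integer, Integer>, Integer> keyed = FlinkKafkaShuffle.readKeyBy(
		"shuffle-topic",                                                       // assumed topic written by writeKeyBy
		env,
		TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {}),
		kafkaProperties,
		(KeySelector<Tuple2<Integer, Integer>, Integer>) value -> value.f0);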
 
Example 14
@Override
public void run() {

    JMetalLogger.logger.info("Run Flink method in the streaming data source invoked");
    JMetalLogger.logger.info("TOPIC: " + topic);

   // environment.getConfig().setRestartStrategy(RestartStrategies.fixedDelayRestart(1,0));
    //environment.enableCheckpointing(10);

    DataStream<String> data = environment.addSource(
            new FlinkKafkaConsumer010<String>(topic, new SimpleStringSchema(), kafkaParams));

    try {
        Iterator<String> it = DataStreamUtils.collect(data);
        while (it.hasNext()) {
            Integer number = Integer.parseInt(it.next());
            observable.setChanged();
            observable.notifyObservers(new ObservedValue<Integer>(number));
        }

    } catch (Exception e) {
        e.printStackTrace();
    }


}