Java Code Examples for org.apache.flink.streaming.api.datastream.DataStream#flatMap()

The following examples show how to use org.apache.flink.streaming.api.datastream.DataStream#flatMap(). The source project and file are noted above each example.
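Before diving into the project examples, here is a minimal, self-contained sketch of the operator: flatMap() passes each input record to a user function, which may emit zero, one, or many output records through a Collector. The class and stream names below are illustrative only and are not taken from any of the projects listed on this page.

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class FlatMapWordSplit {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStream<String> lines = env.fromElements("to be or not to be", "that is the question");

        // one input line fans out into several output words
        DataStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String line, Collector<String> out) {
                for (String word : line.split("\\s+")) {
                    out.collect(word);
                }
            }
        });

        words.print();
        env.execute("flatMap word split");
    }
}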
Example 1
Source File: RescalingITCase.java    From flink with Apache License 2.0
private static JobGraph createJobGraphWithKeyedState(
		int parallelism,
		int maxParallelism,
		int numberKeys,
		int numberElements,
		boolean terminateAfterEmission,
		int checkpointingInterval) {

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(parallelism);
	if (0 < maxParallelism) {
		env.getConfig().setMaxParallelism(maxParallelism);
	}
	env.enableCheckpointing(checkpointingInterval);
	env.setRestartStrategy(RestartStrategies.noRestart());
	env.getConfig().setUseSnapshotCompression(true);

	DataStream<Integer> input = env.addSource(new SubtaskIndexSource(
			numberKeys,
			numberElements,
			terminateAfterEmission))
			.keyBy(new KeySelector<Integer, Integer>() {
				private static final long serialVersionUID = -7952298871120320940L;

				@Override
				public Integer getKey(Integer value) throws Exception {
					return value;
				}
			});

	SubtaskIndexFlatMapper.workCompletedLatch = new CountDownLatch(numberKeys);

	DataStream<Tuple2<Integer, Integer>> result = input.flatMap(new SubtaskIndexFlatMapper(numberElements));

	result.addSink(new CollectionSink<Tuple2<Integer, Integer>>());

	return env.getStreamGraph().getJobGraph();
}
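SubtaskIndexFlatMapper is a test helper from RescalingITCase and is not reproduced on this page. As a rough sketch of the pattern only (the class name, state layout, and counting logic below are assumptions, not the actual Flink test code), a keyed RichFlatMapFunction that counts elements per key in ValueState and emits a (key, subtask index) pair once a key has seen all of its elements could look like this:

import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Collector;

// hypothetical stand-in for SubtaskIndexFlatMapper, for illustration only
public class CountingSubtaskFlatMapper extends RichFlatMapFunction<Integer, Tuple2<Integer, Integer>> {

    private final int numberElements;               // elements expected per key
    private transient ValueState<Integer> counter;  // keyed state: elements seen so far for this key

    public CountingSubtaskFlatMapper(int numberElements) {
        this.numberElements = numberElements;
    }

    @Override
    public void open(Configuration parameters) {
        counter = getRuntimeContext().getState(
                new ValueStateDescriptor<>("counter", Integer.class));
    }

    @Override
    public void flatMap(Integer value, Collector<Tuple2<Integer, Integer>> out) throws Exception {
        Integer seen = counter.value();
        int newCount = (seen == null ? 0 : seen) + 1;
        counter.update(newCount);

        if (newCount == numberElements) {
            // emit (key, subtask index) once all elements for this key have arrived
            out.collect(Tuple2.of(value, getRuntimeContext().getIndexOfThisSubtask()));
        }
    }
}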
 
Example 2
Source File: TwitterIntoKafka.java    From flink-streaming-etl with Apache License 2.0
public static void main(String[] args) throws Exception {
	// set up the streaming execution environment
	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

	ParameterTool params = ParameterTool.fromPropertiesFile(args[0]);
	DataStream<String> twitterStreamString = env.addSource(new TwitterSource(params.getProperties()));
	DataStream<String> filteredStream = twitterStreamString.flatMap(new ParseJson());
	filteredStream.flatMap(new ThroughputLogger(5000L)).setParallelism(1);

	filteredStream.addSink(new FlinkKafkaProducer09<>("twitter", new SimpleStringSchema(), params.getProperties()));

	// execute program
	env.execute("Ingest data from Twitter to Kafka");
}
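ParseJson and ThroughputLogger are classes from the flink-streaming-etl project and are not shown here. Purely as an illustration of the idea (the class name and the Jackson-based validation are assumptions, not the project's actual implementation), a flatMap that forwards only valid JSON records could look like this:

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.flink.api.common.functions.RichFlatMapFunction;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.util.Collector;

// hypothetical JSON filter, assuming Jackson is on the classpath
public class JsonFilter extends RichFlatMapFunction<String, String> {

    private transient ObjectMapper mapper;

    @Override
    public void open(Configuration parameters) {
        mapper = new ObjectMapper();
    }

    @Override
    public void flatMap(String value, Collector<String> out) {
        try {
            mapper.readTree(value);  // throws if the record is not valid JSON
            out.collect(value);
        } catch (Exception e) {
            // invalid record: emit nothing, i.e. filter it out
        }
    }
}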
 
Example 3
Source File: SideStream.java    From alchemy with Apache License 2.0
public static DataStream<Row> buildStream(StreamTableEnvironment env, SqlSelect sqlSelect, Alias leftAlias,
    Alias sideAlias, SourceDescriptor sideSource) throws Exception {
    SqlSelect leftSelect = SideParser.newSelect(sqlSelect, leftAlias.getTable(), leftAlias.getAlias(), true, false);
    // register leftTable
    Table leftTable = env.sqlQuery(leftSelect.toString());
    DataStream<Row> leftStream = env.toAppendStream(leftTable, Row.class);
    SqlSelect rightSelect
        = SideParser.newSelect(sqlSelect, sideAlias.getTable(), sideAlias.getAlias(), false, false);
    SqlJoin sqlJoin = (SqlJoin)sqlSelect.getFrom();
    List<String> equalFields = SideParser.findConditionFields(sqlJoin.getCondition(), leftAlias.getAlias());
    if (sideSource.getSide().isPartition()) {
        leftStream = leftStream.keyBy(equalFields.toArray(new String[equalFields.size()]));
    }
    RowTypeInfo sideType = createSideType(rightSelect.getSelectList(), sideSource.getSchema());
    RowTypeInfo returnType = createReturnType(leftTable.getSchema(), sideType);
    SideTable sideTable = createSideTable(leftTable.getSchema(), sideType, sqlJoin.getJoinType(), rightSelect,
        equalFields, sideAlias, sideSource.getSide());
    DataStream<Row> returnStream;
    if (sideSource.getSide().isAsync()) {
        AbstractAsyncSideFunction reqRow = sideSource.transform(sideTable);
        returnStream = AsyncDataStream.orderedWait(leftStream, reqRow, sideSource.getSide().getTimeout(),
            TimeUnit.MILLISECONDS, sideSource.getSide().getCapacity());
    } else {
        AbstractSyncSideFunction syncReqRow = sideSource.transform(sideTable);
        returnStream = leftStream.flatMap(syncReqRow);
    }
    returnStream.getTransformation().setOutputType(returnType);
    return returnStream;
}
 
Example 4
Source File: StreamingOperatorsITCase.java    From flink with Apache License 2.0
@Test
public void testOperatorChainWithObjectReuseAndNoOutputOperators() throws Exception {
	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.getConfig().enableObjectReuse();
	DataStream<Integer> input = env.fromElements(1, 2, 3);
	input.flatMap(new FlatMapFunction<Integer, Integer>() {
		@Override
		public void flatMap(Integer value, Collector<Integer> out) throws Exception {
			out.collect(value << 1);
		}
	});
	env.execute();
}
 
Example 5
Source File: RescalingITCase.java    From flink with Apache License 2.0
private static JobGraph createJobGraphWithKeyedAndNonPartitionedOperatorState(
		int parallelism,
		int maxParallelism,
		int fixedParallelism,
		int numberKeys,
		int numberElements,
		boolean terminateAfterEmission,
		int checkpointingInterval) {

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(parallelism);
	env.getConfig().setMaxParallelism(maxParallelism);
	env.enableCheckpointing(checkpointingInterval);
	env.setRestartStrategy(RestartStrategies.noRestart());

	DataStream<Integer> input = env.addSource(new SubtaskIndexNonPartitionedStateSource(
			numberKeys,
			numberElements,
			terminateAfterEmission))
			.setParallelism(fixedParallelism)
			.keyBy(new KeySelector<Integer, Integer>() {
				private static final long serialVersionUID = -7952298871120320940L;

				@Override
				public Integer getKey(Integer value) throws Exception {
					return value;
				}
			});

	SubtaskIndexFlatMapper.workCompletedLatch = new CountDownLatch(numberKeys);

	DataStream<Tuple2<Integer, Integer>> result = input.flatMap(new SubtaskIndexFlatMapper(numberElements));

	result.addSink(new CollectionSink<Tuple2<Integer, Integer>>());

	return env.getStreamGraph().getJobGraph();
}
 
Example 6
Source File: KafkaConsumerTestBase.java    From flink with Apache License 2.0
/**
 * Test that ensures that DeserializationSchema.isEndOfStream() is properly evaluated.
 *
 * @throws Exception
 */
public void runEndOfStreamTest() throws Exception {

	final int elementCount = 300;
	final String topic = writeSequence("testEndOfStream", elementCount, 1, 1);

	// read using custom schema
	final StreamExecutionEnvironment env1 = StreamExecutionEnvironment.getExecutionEnvironment();
	env1.setParallelism(1);
	env1.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env1.getConfig().disableSysoutLogging();

	Properties props = new Properties();
	props.putAll(standardProps);
	props.putAll(secureProps);

	DataStream<Tuple2<Integer, Integer>> fromKafka = env1.addSource(kafkaServer.getConsumer(topic, new FixedNumberDeserializationSchema(elementCount), props));
	fromKafka.flatMap(new FlatMapFunction<Tuple2<Integer, Integer>, Void>() {
		@Override
		public void flatMap(Tuple2<Integer, Integer> value, Collector<Void> out) throws Exception {
			// noop ;)
		}
	});

	tryExecute(env1, "Consume " + elementCount + " elements from Kafka");

	deleteTestTopic(topic);
}
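FixedNumberDeserializationSchema is another test helper that is not shown above; the point of the test is that the Kafka source stops once isEndOfStream() returns true. A hedged sketch of a schema with that behaviour (the record encoding and counting logic are assumptions, not the actual Flink test class):

import org.apache.flink.api.common.serialization.DeserializationSchema;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.tuple.Tuple2;

import java.io.IOException;
import java.nio.ByteBuffer;

// hypothetical schema that ends the stream after a fixed number of records
public class CountingEndOfStreamSchema implements DeserializationSchema<Tuple2<Integer, Integer>> {

    private final int finalCount;  // stop after this many records (per parallel source instance)
    private int count = 0;

    public CountingEndOfStreamSchema(int finalCount) {
        this.finalCount = finalCount;
    }

    @Override
    public Tuple2<Integer, Integer> deserialize(byte[] message) throws IOException {
        // assumes each record is two big-endian ints; the real test uses a different encoding
        ByteBuffer buffer = ByteBuffer.wrap(message);
        return Tuple2.of(buffer.getInt(), buffer.getInt());
    }

    @Override
    public boolean isEndOfStream(Tuple2<Integer, Integer> nextElement) {
        // returning true tells the Kafka source to stop consuming
        return ++count >= finalCount;
    }

    @Override
    public TypeInformation<Tuple2<Integer, Integer>> getProducedType() {
        return TypeInformation.of(new TypeHint<Tuple2<Integer, Integer>>() {});
    }
}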
 
Example 7
Source File: StreamingOperatorsITCase.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testOperatorChainWithObjectReuseAndNoOutputOperators() throws Exception {
	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.getConfig().enableObjectReuse();
	DataStream<Integer> input = env.fromElements(1, 2, 3);
	input.flatMap(new FlatMapFunction<Integer, Integer>() {
		@Override
		public void flatMap(Integer value, Collector<Integer> out) throws Exception {
			out.collect(value << 1);
		}
	});
	env.execute();
}
 
Example 8
Source File: RescalingITCase.java    From Flink-CEPplus with Apache License 2.0
private static JobGraph createJobGraphWithKeyedAndNonPartitionedOperatorState(
		int parallelism,
		int maxParallelism,
		int fixedParallelism,
		int numberKeys,
		int numberElements,
		boolean terminateAfterEmission,
		int checkpointingInterval) {

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(parallelism);
	env.getConfig().setMaxParallelism(maxParallelism);
	env.enableCheckpointing(checkpointingInterval);
	env.setRestartStrategy(RestartStrategies.noRestart());

	DataStream<Integer> input = env.addSource(new SubtaskIndexNonPartitionedStateSource(
			numberKeys,
			numberElements,
			terminateAfterEmission))
			.setParallelism(fixedParallelism)
			.keyBy(new KeySelector<Integer, Integer>() {
				private static final long serialVersionUID = -7952298871120320940L;

				@Override
				public Integer getKey(Integer value) throws Exception {
					return value;
				}
			});

	SubtaskIndexFlatMapper.workCompletedLatch = new CountDownLatch(numberKeys);

	DataStream<Tuple2<Integer, Integer>> result = input.flatMap(new SubtaskIndexFlatMapper(numberElements));

	result.addSink(new CollectionSink<Tuple2<Integer, Integer>>());

	return env.getStreamGraph().getJobGraph();
}
 
Example 9
Source File: KafkaConsumerTestBase.java    From Flink-CEPplus with Apache License 2.0
/**
 * Test that ensures that DeserializationSchema.isEndOfStream() is properly evaluated.
 *
 * @throws Exception
 */
public void runEndOfStreamTest() throws Exception {

	final int elementCount = 300;
	final String topic = writeSequence("testEndOfStream", elementCount, 1, 1);

	// read using custom schema
	final StreamExecutionEnvironment env1 = StreamExecutionEnvironment.getExecutionEnvironment();
	env1.setParallelism(1);
	env1.getConfig().setRestartStrategy(RestartStrategies.noRestart());
	env1.getConfig().disableSysoutLogging();

	Properties props = new Properties();
	props.putAll(standardProps);
	props.putAll(secureProps);

	DataStream<Tuple2<Integer, Integer>> fromKafka = env1.addSource(kafkaServer.getConsumer(topic, new FixedNumberDeserializationSchema(elementCount), props));
	fromKafka.flatMap(new FlatMapFunction<Tuple2<Integer, Integer>, Void>() {
		@Override
		public void flatMap(Tuple2<Integer, Integer> value, Collector<Void> out) throws Exception {
			// noop ;)
		}
	});

	tryExecute(env1, "Consume " + elementCount + " elements from Kafka");

	deleteTestTopic(topic);
}
 
Example 10
Source File: KafkaConsumerTestBase.java    From flink with Apache License 2.0
public void runKeyValueTest() throws Exception {
	final String topic = "keyvaluetest";
	createTestTopic(topic, 1, 1);
	final int elementCount = 5000;

	// ----------- Write some data into Kafka -------------------

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.setRestartStrategy(RestartStrategies.noRestart());

	DataStream<Tuple2<Long, PojoValue>> kvStream = env.addSource(new SourceFunction<Tuple2<Long, PojoValue>>() {
		@Override
		public void run(SourceContext<Tuple2<Long, PojoValue>> ctx) throws Exception {
			Random rnd = new Random(1337);
			for (long i = 0; i < elementCount; i++) {
				PojoValue pojo = new PojoValue();
				pojo.when = new Date(rnd.nextLong());
				pojo.lon = rnd.nextLong();
				pojo.lat = i;
				// make every second key null to ensure proper "null" serialization
				Long key = (i % 2 == 0) ? null : i;
				ctx.collect(new Tuple2<>(key, pojo));
			}
		}

		@Override
		public void cancel() {
		}
	});

	KeyedSerializationSchema<Tuple2<Long, PojoValue>> schema = new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());
	Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
	producerProperties.setProperty("retries", "3");
	kafkaServer.produceIntoKafka(kvStream, topic, schema, producerProperties, null);
	env.execute("Write KV to Kafka");

	// ----------- Read the data again -------------------

	env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.setRestartStrategy(RestartStrategies.noRestart());

	KafkaDeserializationSchema<Tuple2<Long, PojoValue>> readSchema = new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());

	Properties props = new Properties();
	props.putAll(standardProps);
	props.putAll(secureProps);
	DataStream<Tuple2<Long, PojoValue>> fromKafka = env.addSource(kafkaServer.getConsumer(topic, readSchema, props));
	fromKafka.flatMap(new RichFlatMapFunction<Tuple2<Long, PojoValue>, Object>() {
		long counter = 0;
		@Override
		public void flatMap(Tuple2<Long, PojoValue> value, Collector<Object> out) throws Exception {
			// the elements should be in order.
			Assert.assertTrue("Wrong value " + value.f1.lat, value.f1.lat == counter);
			if (value.f1.lat % 2 == 0) {
				assertNull("key was not null", value.f0);
			} else {
				Assert.assertTrue("Wrong value " + value.f0, value.f0 == counter);
			}
			counter++;
			if (counter == elementCount) {
				// we got the right number of elements
				throw new SuccessException();
			}
		}
	});

	tryExecute(env, "Read KV from Kafka");

	deleteTestTopic(topic);
}
 
Example 11
Source File: KafkaConsumerTestBase.java    From Flink-CEPplus with Apache License 2.0
public void runKeyValueTest() throws Exception {
	final String topic = "keyvaluetest";
	createTestTopic(topic, 1, 1);
	final int elementCount = 5000;

	// ----------- Write some data into Kafka -------------------

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.setRestartStrategy(RestartStrategies.noRestart());
	env.getConfig().disableSysoutLogging();

	DataStream<Tuple2<Long, PojoValue>> kvStream = env.addSource(new SourceFunction<Tuple2<Long, PojoValue>>() {
		@Override
		public void run(SourceContext<Tuple2<Long, PojoValue>> ctx) throws Exception {
			Random rnd = new Random(1337);
			for (long i = 0; i < elementCount; i++) {
				PojoValue pojo = new PojoValue();
				pojo.when = new Date(rnd.nextLong());
				pojo.lon = rnd.nextLong();
				pojo.lat = i;
				// make every second key null to ensure proper "null" serialization
				Long key = (i % 2 == 0) ? null : i;
				ctx.collect(new Tuple2<>(key, pojo));
			}
		}

		@Override
		public void cancel() {
		}
	});

	KeyedSerializationSchema<Tuple2<Long, PojoValue>> schema = new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());
	Properties producerProperties = FlinkKafkaProducerBase.getPropertiesFromBrokerList(brokerConnectionStrings);
	producerProperties.setProperty("retries", "3");
	kafkaServer.produceIntoKafka(kvStream, topic, schema, producerProperties, null);
	env.execute("Write KV to Kafka");

	// ----------- Read the data again -------------------

	env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.setRestartStrategy(RestartStrategies.noRestart());
	env.getConfig().disableSysoutLogging();

	KafkaDeserializationSchema<Tuple2<Long, PojoValue>> readSchema = new TypeInformationKeyValueSerializationSchema<>(Long.class, PojoValue.class, env.getConfig());

	Properties props = new Properties();
	props.putAll(standardProps);
	props.putAll(secureProps);
	DataStream<Tuple2<Long, PojoValue>> fromKafka = env.addSource(kafkaServer.getConsumer(topic, readSchema, props));
	fromKafka.flatMap(new RichFlatMapFunction<Tuple2<Long, PojoValue>, Object>() {
		long counter = 0;
		@Override
		public void flatMap(Tuple2<Long, PojoValue> value, Collector<Object> out) throws Exception {
			// the elements should be in order.
			Assert.assertTrue("Wrong value " + value.f1.lat, value.f1.lat == counter);
			if (value.f1.lat % 2 == 0) {
				assertNull("key was not null", value.f0);
			} else {
				Assert.assertTrue("Wrong value " + value.f0, value.f0 == counter);
			}
			counter++;
			if (counter == elementCount) {
				// we got the right number of elements
				throw new SuccessException();
			}
		}
	});

	tryExecute(env, "Read KV from Kafka");

	deleteTestTopic(topic);
}
 
Example 12
Source File: Driver.java    From OSTMap with Apache License 2.0
public void run(String pathToTwitterProperties, String pathToAccumuloProperties, ArrayList<String> tweet) throws Exception
{

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    // emit an automatic watermark every 1000 ms
    env.getConfig().setAutoWatermarkInterval(1000);

    // decide which stream source should be used
    DataStream<String> geoStream;
    if(tweet==null) {
        geoStream = env.addSource(new GeoTwitterSource(pathToTwitterProperties));
    }
    else {
        geoStream = env.fromCollection ( tweet );
    }

    // decide which configuration should be used
    RawTwitterDataSink rtdSink = new RawTwitterDataSink();
    TermIndexSink tiSink = new TermIndexSink();
    LanguageFrequencySink frqSink = new LanguageFrequencySink();
    GeoTemporalIndexSink gtiSink = new GeoTemporalIndexSink();
    SinkConfiguration sc;
    if(runOnMAC) {
        sc = SinkConfiguration.createConfigForMinicluster(accumuloInstanceName, accumuloZookeeper);
    }
    else {
        sc = SinkConfiguration.createConfigFromFile(pathToAccumuloProperties);
    }
    rtdSink.configure(sc, TableIdentifier.RAW_TWITTER_DATA.get());
    tiSink.configure(sc, TableIdentifier.TERM_INDEX.get());
    frqSink.configure(sc, TableIdentifier.TWEET_FREQUENCY.get());
    gtiSink.configure(sc, TableIdentifier.GEO_TEMPORAL_INDEX.get());


    // stream of tuples containing timestamp and tweet's json-String
    DataStream<Tuple2<Long, String>> dateStream = geoStream.flatMap(new DateExtraction());

    dateStream
            .flatMap(new LanguageFrequencyRowExtraction())
            .flatMap(new LanguageTagExtraction())
            .assignTimestampsAndWatermarks(new TimestampExtractorForDateStream())
            .windowAll(TumblingEventTimeWindows.of(Time.minutes(1)))
            .apply (new AllWindowFunctionLangFreq())
            .addSink(frqSink);

    // stream of tuples containing RawTwitterDataKey and tweet's json-String
    DataStream<Tuple2<RawTwitterDataKey, String>> rtdStream = dateStream.flatMap(new CalculateRawTwitterDataKey());

    /** write into rawTwitterData-table */
    rtdStream.addSink(rtdSink);

    /** write into geoTemporalIndex-table */
    rtdStream
            .flatMap(new GeoTemporalKeyExtraction())
            .addSink(gtiSink);

    /** write into termIndex-table */
    // processing for user
    rtdStream
            .flatMap(new UserExtraction())
            .addSink(tiSink);
    // processing for terms
    rtdStream
            .flatMap(new TermExtraction())
            .addSink(tiSink);

    env.execute("twitter stream");
}
 
Example 13
Source File: CsvSourceStreamOp.java    From Alink with Apache License 2.0
@Override
public Table initializeDataSource() {
    final String filePath = getFilePath();
    final String schemaStr = getSchemaStr();
    final String fieldDelim = getFieldDelimiter();
    final String rowDelim = getRowDelimiter();
    final Character quoteChar = getQuoteChar();
    final boolean skipBlankLine = getSkipBlankLine();

    final String[] colNames = CsvUtil.getColNames(schemaStr);
    final TypeInformation[] colTypes = CsvUtil.getColTypes(schemaStr);

    boolean ignoreFirstLine = getIgnoreFirstLine();
    String protocol = "";

    try {
        URL url = new URL(filePath);
        protocol = url.getProtocol();
    } catch (MalformedURLException ignored) {
    }

    DataStream<Row> rows;
    StreamExecutionEnvironment execEnv =
        MLEnvironmentFactory.get(getMLEnvironmentId()).getStreamExecutionEnvironment();
    TableSchema dummySchema = new TableSchema(new String[]{"f1"}, new TypeInformation[]{Types.STRING});

    if (protocol.equalsIgnoreCase("http") || protocol.equalsIgnoreCase("https")) {
        HttpFileSplitReader reader = new HttpFileSplitReader(filePath);
        rows = execEnv
            .createInput(new GenericCsvInputFormat(reader, dummySchema.getFieldTypes(), rowDelim, rowDelim, ignoreFirstLine),
                new RowTypeInfo(dummySchema.getFieldTypes(), dummySchema.getFieldNames()))
            .name("http_csv_source");
    } else {
        RowCsvInputFormat inputFormat = new RowCsvInputFormat(
            new Path(filePath), dummySchema.getFieldTypes(), rowDelim, rowDelim, new int[]{0}, true);
        inputFormat.setSkipFirstLineAsHeader(ignoreFirstLine);
        rows = execEnv.createInput(inputFormat).name("csv_source");
    }

    rows = rows.flatMap(new CsvUtil.ParseCsvFunc(colTypes, fieldDelim, quoteChar, skipBlankLine));

    return DataStreamConversionUtil.toTable(getMLEnvironmentId(), rows, colNames, colTypes);
}
 
Example 14
Source File: ConsumerSample.java    From aliyun-log-flink-connector with Apache License 2.0
public static void main(String[] args) throws Exception {
        final ParameterTool params = ParameterTool.fromArgs(args);
//        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // For local testing
        Configuration conf = new Configuration();
        conf.setString(CheckpointingOptions.CHECKPOINTS_DIRECTORY,
                "file:///Users/kel/Github/flink3/aliyun-log-flink-connector/flink2");
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(1, conf);
        env.getConfig().setGlobalJobParameters(params);
        env.setParallelism(1);
        env.enableCheckpointing(5000);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);

        env.setStateBackend(new FsStateBackend("file:///Users/kel/Github/flink3/aliyun-log-flink-connector/flink"));
        Properties configProps = new Properties();
        configProps.put(ConfigConstants.LOG_ENDPOINT, SLS_ENDPOINT);
        configProps.put(ConfigConstants.LOG_ACCESSSKEYID, ACCESS_KEY_ID);
        configProps.put(ConfigConstants.LOG_ACCESSKEY, ACCESS_KEY_SECRET);
        configProps.put(ConfigConstants.LOG_MAX_NUMBER_PER_FETCH, "10");
        configProps.put(ConfigConstants.LOG_CONSUMER_BEGIN_POSITION, Consts.LOG_FROM_CHECKPOINT);
        configProps.put(ConfigConstants.LOG_CONSUMERGROUP, "23_ots_sla_etl_product1");
        configProps.put(ConfigConstants.LOG_CHECKPOINT_MODE, CheckpointMode.ON_CHECKPOINTS.name());
        configProps.put(ConfigConstants.LOG_COMMIT_INTERVAL_MILLIS, "10000");

        FastLogGroupDeserializer deserializer = new FastLogGroupDeserializer();
        DataStream<FastLogGroupList> stream = env.addSource(
                new FlinkLogConsumer<>(SLS_PROJECT, SLS_LOGSTORE, deserializer, configProps));

        stream.flatMap((FlatMapFunction<FastLogGroupList, String>) (value, out) -> {
            for (FastLogGroup logGroup : value.getLogGroups()) {
                int logCount = logGroup.getLogsCount();
                for (int i = 0; i < logCount; i++) {
                    FastLog log = logGroup.getLogs(i);
                    // processing log
                }
            }
        });
        stream.writeAsText("log-" + System.nanoTime());
        env.execute("Flink consumer");
    }
 
Example 15
Source File: ExactlyOnceChecker.java    From pravega-samples with Apache License 2.0
public static void main(String[] args) throws Exception {
    LOG.info("Starting ExactlyOnce checker ...");

    // initialize the parameter utility tool in order to retrieve input parameters
    ParameterTool params = ParameterTool.fromArgs(args);

    PravegaConfig pravegaConfig = PravegaConfig
            .fromParams(params)
            .withControllerURI(URI.create(params.get(Constants.Default_URI_PARAM, Constants.Default_URI)))
            .withDefaultScope(params.get(Constants.SCOPE_PARAM, Constants.DEFAULT_SCOPE));

    // create the Pravega input stream (if necessary)
    Stream stream = Utils.createStream(
            pravegaConfig,
            params.get(Constants.STREAM_PARAM, Constants.DEFAULT_STREAM));

    // initialize Flink execution environment
    final StreamExecutionEnvironment env = StreamExecutionEnvironment
            .getExecutionEnvironment()
            .setParallelism(1);

    // create the Pravega source to read a stream of text
    FlinkPravegaReader<IntegerEvent> reader = FlinkPravegaReader.<IntegerEvent>builder()
            .withPravegaConfig(pravegaConfig)
            .forStream(stream)
            .withDeserializationSchema(PravegaSerialization.deserializationFor(IntegerEvent.class))
            .build();

    DataStream<IntegerEvent> dataStream = env
            .addSource(reader)
            .setParallelism(1);

    // optionally print the data read from Pravega
    //dataStream.print();

    DataStream<DuplicateEvent> duplicateStream = dataStream.flatMap(new FlatMapFunction<IntegerEvent, DuplicateEvent>() {
        @Override
        public void flatMap(IntegerEvent event, Collector<DuplicateEvent> out) throws Exception {

            if (event.isStart()) {
                // clear checker when the beginning of stream marker arrives
                checker.clear();
                duplicates.clear();
                System.out.println("\n============== Checker starts ===============");
            }
            if (event.isEnd()) {
                if (duplicates.size() == 0) {
                    System.out.println("No duplicate found. EXACTLY_ONCE!");
                } else {
                    System.out.println("Found duplicates");
                }
                System.out.println("============== Checker ends  ===============\n");
            }
            if (checker.contains(event)) {
                duplicates.add(event);
                DuplicateEvent dup = new DuplicateEvent(event.getValue());
                System.out.println(dup);
                out.collect(dup);
            } else {
                checker.add(event);
            }
        }
    });

    // create output sink to print duplicates
    //duplicateStream.print();

    // execute within the Flink environment
    env.execute("ExactlyOnceChecker");

    LOG.info("Ending ExactlyOnceChecker...");
}
 
Example 16
Source File: StreamExecutionEnvironment.java    From Flink-CEPplus with Apache License 2.0
/**
 * Creates a data stream that contains the contents of the files created while the system watches the given path. The files
 * are read with the system's default character set.
 *
 * @param filePath
 * 		The path of the file, as a URI (e.g., "file:///some/local/file" or "hdfs://host:port/file/path/")
 * @param intervalMillis
 * 		The interval of file watching in milliseconds
 * @param watchType
 * 		The watch type of file stream. When watchType is {@link org.apache.flink.streaming.api.functions.source.FileMonitoringFunction.WatchType#ONLY_NEW_FILES}, the system processes
 * 		only
 * 		new files. {@link org.apache.flink.streaming.api.functions.source.FileMonitoringFunction.WatchType#REPROCESS_WITH_APPENDED} means that the system re-processes all contents of
 * 		appended file. {@link org.apache.flink.streaming.api.functions.source.FileMonitoringFunction.WatchType#PROCESS_ONLY_APPENDED} means that the system processes only appended
 * 		contents
 * 		of files.
 * @return The DataStream containing the contents of the watched files.
 *
 * @deprecated Use {@link #readFile(FileInputFormat, String, FileProcessingMode, long)} instead.
 */
@Deprecated
@SuppressWarnings("deprecation")
public DataStream<String> readFileStream(String filePath, long intervalMillis, FileMonitoringFunction.WatchType watchType) {
	DataStream<Tuple3<String, Long, Long>> source = addSource(new FileMonitoringFunction(
			filePath, intervalMillis, watchType), "Read File Stream source");

	return source.flatMap(new FileReadFunction());
}
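As the @deprecated note says, readFile(FileInputFormat, String, FileProcessingMode, long) is the replacement for this method. A minimal usage sketch of that API, followed by a flatMap similar to the FileReadFunction chaining above (the path, the 10-second interval, and the word-splitting function are placeholders, not part of the Flink source file):

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.io.TextInputFormat;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.FileProcessingMode;
import org.apache.flink.util.Collector;

public class ReadFileExample {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        String path = "file:///tmp/input";  // placeholder path
        TextInputFormat format = new TextInputFormat(new Path(path));

        // re-scan the path every 10 seconds and emit the contents of new or modified files as lines
        DataStream<String> lines = env.readFile(
                format, path, FileProcessingMode.PROCESS_CONTINUOUSLY, 10_000L);

        // split each line into words, analogous to chaining a flatMap after the file source
        DataStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String line, Collector<String> out) {
                for (String word : line.split("\\s+")) {
                    out.collect(word);
                }
            }
        });

        words.print();
        env.execute("readFile example");
    }
}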
 
Example 17
Source File: BenchmarkJob.java    From scotty-window-processor with Apache License 2.0
public BenchmarkJob(List<Window> assigner, StreamExecutionEnvironment env, final long runtime,
					final int throughput, final List<Tuple2<Long, Long>> gaps) {


	Map<String, String> configMap = new HashMap<>();
	ParameterTool parameters = ParameterTool.fromMap(configMap);

	env.getConfig().setGlobalJobParameters(parameters);
	env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
	env.setParallelism(1);
	env.setMaxParallelism(1);


	KeyedScottyWindowOperator<Tuple, Tuple4<String, Integer, Long, Long>, Tuple4<String, Integer, Long, Long>> windowOperator =
			new KeyedScottyWindowOperator<>(new SumAggregation());

	for(Window w: assigner){
		windowOperator.addWindow(w);
	}


	DataStream<Tuple4<String, Integer, Long, Long>> messageStream = env
		.addSource(new de.tub.dima.scotty.flinkBenchmark.LoadGeneratorSource(runtime, throughput,  gaps));

	messageStream.flatMap(new de.tub.dima.scotty.flinkBenchmark.ThroughputLogger<>(200, throughput));



	final SingleOutputStreamOperator<Tuple4<String, Integer, Long, Long>> timestampsAndWatermarks = messageStream
		.assignTimestampsAndWatermarks(new TimestampsAndWatermarks());



	timestampsAndWatermarks
			.keyBy(0)
			.process(windowOperator)
			.addSink(new SinkFunction() {

				@Override
				public void invoke(final Object value) throws Exception {
					//System.out.println(value);
				}
			});

	try {
		env.execute();

	} catch (Exception e) {
		e.printStackTrace();
	}

}