org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink Java Examples

The following examples show how to use org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink. Each example is taken from an open-source project; the source file and originating project are listed above its code.
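Most of the examples below follow the same basic pattern: build a StreamingFileSink with either forRowFormat (a per-record Encoder such as SimpleStringEncoder) or forBulkFormat (a BulkWriter.Factory, as used for Parquet, ORC, Avro and compressed output), optionally configure a bucket assigner and rolling policy, and attach the sink to a DataStream via addSink. As a minimal sketch of that pattern, the snippet below shows the row-format variant in isolation; the class name, output path, checkpoint interval and input elements are illustrative placeholders rather than values taken from the projects referenced here.

import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.DateTimeBucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.OnCheckpointRollingPolicy;

public class StreamingFileSinkSketch {

	public static void main(String[] args) throws Exception {
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
		// Part files only move from pending to finished when a checkpoint completes,
		// so checkpointing must be enabled for the sink to produce readable output.
		env.enableCheckpointing(10_000);

		DataStream<String> lines = env.fromElements("a", "b", "c");

		// Row format: each record is passed to the Encoder individually.
		// For bulk formats (Parquet/ORC/Avro/compressed), use forBulkFormat(path, writerFactory) instead.
		StreamingFileSink<String> sink = StreamingFileSink
				.forRowFormat(new Path("/tmp/streaming-file-sink-out"), new SimpleStringEncoder<String>("UTF-8"))
				.withBucketAssigner(new DateTimeBucketAssigner<>("yyyy-MM-dd--HH"))  // one bucket directory per hour
				.withRollingPolicy(OnCheckpointRollingPolicy.build())                // roll a new part file on each checkpoint
				.build();

		lines.addSink(sink).name("FS Sink");
		env.execute("StreamingFileSink sketch");
	}
}
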
Example #1
Source File: OrcBulkWriterITCase.java    From flink with Apache License 2.0
@Test
public void testOrcBulkWriter() throws Exception {
	final File outDir = TEMPORARY_FOLDER.newFolder();
	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	final Properties writerProps = new Properties();
	writerProps.setProperty("orc.compress", "LZ4");

	final OrcBulkWriterFactory<Record> factory = new OrcBulkWriterFactory<>(
		new RecordVectorizer(schema), writerProps, new Configuration());

	env.setParallelism(1);
	env.enableCheckpointing(100);

	DataStream<Record> stream = env.addSource(new FiniteTestSource<>(testData), TypeInformation.of(Record.class));
	stream.map(str -> str)
		.addSink(StreamingFileSink
			.forBulkFormat(new Path(outDir.toURI()), factory)
			.build());

	env.execute();

	OrcBulkWriterTestUtil.validate(outDir, testData);
}
 
Example #2
Source File: ParquetStreamingFileSinkITCase.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testWriteParquetAvroReflect() throws Exception {

	final File folder = TEMPORARY_FOLDER.newFolder();

	final List<Datum> data = Arrays.asList(
			new Datum("a", 1), new Datum("b", 2), new Datum("c", 3));

	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.enableCheckpointing(100);

	DataStream<Datum> stream = env.addSource(
			new FiniteTestSource<>(data), TypeInformation.of(Datum.class));

	stream.addSink(
			StreamingFileSink.forBulkFormat(
					Path.fromLocalFile(folder),
					ParquetAvroWriters.forReflectRecord(Datum.class))
					.build());

	env.execute();

	validateResults(folder, ReflectData.get(), data);
}
 
Example #3
Source File: ParquetStreamingFileSinkITCase.java    From flink with Apache License 2.0
@Test
public void testWriteParquetAvroReflect() throws Exception {

	final File folder = TEMPORARY_FOLDER.newFolder();

	final List<Datum> data = Arrays.asList(
			new Datum("a", 1), new Datum("b", 2), new Datum("c", 3));

	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.enableCheckpointing(100);

	DataStream<Datum> stream = env.addSource(
			new FiniteTestSource<>(data), TypeInformation.of(Datum.class));

	stream.addSink(
			StreamingFileSink.forBulkFormat(
					Path.fromLocalFile(folder),
					ParquetAvroWriters.forReflectRecord(Datum.class))
					.build());

	env.execute();

	validateResults(folder, ReflectData.get(), data);
}
 
Example #4
Source File: AvroStreamingFileSinkITCase.java    From flink with Apache License 2.0
@Test
public void testWriteAvroReflect() throws Exception {
	File folder = TEMPORARY_FOLDER.newFolder();

	List<Datum> data = Arrays.asList(
		new Datum("a", 1),
		new Datum("b", 2),
		new Datum("c", 3));

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.enableCheckpointing(100);

	AvroWriterFactory<Datum> avroWriterFactory = AvroWriters.forReflectRecord(Datum.class);
	DataStream<Datum> stream = env.addSource(
		new FiniteTestSource<>(data),
		TypeInformation.of(Datum.class));
	stream.addSink(StreamingFileSink.forBulkFormat(
		Path.fromLocalFile(folder),
		avroWriterFactory).build());
	env.execute();

	validateResults(folder, new ReflectDatumReader<>(Datum.class), data);
}
 
Example #5
Source File: AvroStreamingFileSinkITCase.java    From flink with Apache License 2.0
@Test
public void testWriteAvroGeneric() throws Exception {
	File folder = TEMPORARY_FOLDER.newFolder();

	Schema schema = Address.getClassSchema();
	Collection<GenericRecord> data = new GenericTestDataCollection();

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.enableCheckpointing(100);

	AvroWriterFactory<GenericRecord> avroWriterFactory = AvroWriters.forGenericRecord(schema);
	DataStream<GenericRecord> stream = env.addSource(
		new FiniteTestSource<>(data),
		new GenericRecordAvroTypeInfo(schema));
	stream.addSink(StreamingFileSink.forBulkFormat(
		Path.fromLocalFile(folder),
		avroWriterFactory).build());
	env.execute();

	validateResults(folder, new GenericDatumReader<>(schema), new ArrayList<>(data));
}
 
Example #6
Source File: KafkaToHDFSSimpleJob.java    From flink-tutorials with Apache License 2.0
public static void main(String[] args) throws Exception {

		ParameterTool params = Utils.parseArgs(args);
		StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

		FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(params.getRequired("kafkaTopic"), new SimpleStringSchema(), Utils.readKafkaProperties(params));
		DataStream<String> source = env.addSource(consumer).name("Kafka Source").uid("Kafka Source");

		StreamingFileSink<String> sink = StreamingFileSink
				.forRowFormat(new Path(params.getRequired("hdfsOutput")), new SimpleStringEncoder<String>("UTF-8"))
				.build();

		source.addSink(sink).name("FS Sink").uid("FS Sink");
		source.print();

		env.execute("Flink Streaming Secured Job Sample");
	}
 
Example #7
Source File: AvroStreamingFileSinkITCase.java    From flink with Apache License 2.0
@Test
public void testWriteAvroSpecific() throws Exception {
	File folder = TEMPORARY_FOLDER.newFolder();

	List<Address> data = Arrays.asList(
		new Address(1, "a", "b", "c", "12345"),
		new Address(2, "p", "q", "r", "12345"),
		new Address(3, "x", "y", "z", "12345"));

	StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.enableCheckpointing(100);

	AvroWriterFactory<Address> avroWriterFactory = AvroWriters.forSpecificRecord(Address.class);
	DataStream<Address> stream = env.addSource(
		new FiniteTestSource<>(data),
		TypeInformation.of(Address.class));
	stream.addSink(StreamingFileSink.forBulkFormat(
		Path.fromLocalFile(folder),
		avroWriterFactory).build());
	env.execute();

	validateResults(folder, new SpecificDatumReader<>(Address.class), data);
}
 
Example #8
Source File: HdfsSink2.java    From sylph with Apache License 2.0
public LocalShuffle(int split, RichSinkFunction<T> userSink)
        throws IOException, ClassNotFoundException, IllegalAccessException, NoSuchFieldException
{
    this.sinks = new ArrayList<>(split);
    // Serialize the user's sink once, then deserialize one independent copy per split.
    SerializedValue<RichSinkFunction<T>> serializedValue = new SerializedValue<>(userSink);
    for (int i = 0; i < split; i++) {
        StreamingFileSink<T> sink = (StreamingFileSink<T>) serializedValue.deserializeValue(this.getClass().getClassLoader());
        // Access the private bucketsBuilder so a new sink can be built around the same configuration.
        Field field = StreamingFileSink.class.getDeclaredField("bucketsBuilder");
        field.setAccessible(true);
        StreamingFileSink<T> mockSink = new StreamingFileSink<T>((StreamingFileSink.BulkFormatBuilder<T, ?>) field.get(sink), 0)
        {
            @Override
            public RuntimeContext getRuntimeContext()
            {
                // Delegate to the enclosing operator's runtime context.
                return LocalShuffle.this.getRuntimeContext();
            }
        };
        // Keep each per-split copy so the wrapping sink can open and invoke it later.
        this.sinks.add(mockSink);
    }
}
 
Example #9
Source File: CompressionFactoryITCase.java    From flink with Apache License 2.0
@Test
public void testWriteCompressedFile() throws Exception {
	final File folder = TEMPORARY_FOLDER.newFolder();
	final Path testPath = Path.fromLocalFile(folder);

	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.enableCheckpointing(100);

	DataStream<String> stream = env.addSource(
			new FiniteTestSource<>(testData),
			TypeInformation.of(String.class)
	);

	stream.map(str -> str).addSink(
			StreamingFileSink.forBulkFormat(
					testPath,
					CompressWriters.forExtractor(new DefaultExtractor<String>()).withHadoopCompression(TEST_CODEC_NAME)
			).build());

	env.execute();

	validateResults(folder, testData, new CompressionCodecFactory(configuration).getCodecByName(TEST_CODEC_NAME));
}
 
Example #10
Source File: StreamingFileWriter.java    From flink with Apache License 2.0
public StreamingFileWriter(
		long bucketCheckInterval,
		StreamingFileSink.BucketsBuilder<RowData, String, ? extends
				StreamingFileSink.BucketsBuilder<RowData, String, ?>> bucketsBuilder) {
	this.bucketCheckInterval = bucketCheckInterval;
	this.bucketsBuilder = bucketsBuilder;
	setChainingStrategy(ChainingStrategy.ALWAYS);
}
 
Example #11
Source File: MatrixVectorMul.java    From flink with Apache License 2.0
public static void main(String[] args) throws Exception {

		// Checking input parameters
		final ParameterTool params = ParameterTool.fromArgs(args);
		System.out.println("Usage: MatrixVectorMul [--output <path>] [--dimension <dimension> --data-size <data_size>] [--resource-name <resource_name>]");

		// Set up the execution environment
		final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

		// Make parameters available in the web interface
		env.getConfig().setGlobalJobParameters(params);

		final int dimension = params.getInt("dimension", DEFAULT_DIM);
		final int dataSize = params.getInt("data-size", DEFAULT_DATA_SIZE);
		final String resourceName = params.get("resource-name", DEFAULT_RESOURCE_NAME);

		DataStream<List<Float>> result = env.addSource(new RandomVectorSource(dimension, dataSize))
						.map(new Multiplier(dimension, resourceName));

		// Emit result
		if (params.has("output")) {
			result.addSink(StreamingFileSink.forRowFormat(new Path(params.get("output")),
					new SimpleStringEncoder<List<Float>>()).build());
		} else {
			System.out.println("Printing result to stdout. Use --output to specify output path.");
			result.print();
		}
		// Execute program
		env.execute("Matrix-Vector Multiplication");
	}
 
Example #12
Source File: OrcBulkWriterTest.java    From flink with Apache License 2.0
@Test
public void testOrcBulkWriter() throws Exception {
	final File outDir = TEMPORARY_FOLDER.newFolder();
	final Properties writerProps = new Properties();
	writerProps.setProperty("orc.compress", "LZ4");

	final OrcBulkWriterFactory<Record> writer = new OrcBulkWriterFactory<>(
		new RecordVectorizer(schema), writerProps, new Configuration());

	StreamingFileSink<Record> sink = StreamingFileSink
		.forBulkFormat(new Path(outDir.toURI()), writer)
		.withBucketCheckInterval(10000)
		.build();

	try (OneInputStreamOperatorTestHarness<Record, Object> testHarness = new OneInputStreamOperatorTestHarness<>(
			new StreamSink<>(sink), 1, 1, 0)) {

		testHarness.setup();
		testHarness.open();

		int time = 0;
		for (final Record record : input) {
			testHarness.processElement(record, ++time);
		}

		testHarness.snapshot(1, ++time);
		testHarness.notifyOfCompletedCheckpoint(1);

		OrcBulkWriterTestUtil.validate(outDir, input);
	}
}
 
Example #13
Source File: ParquetStreamingFileSinkITCase.java    From flink with Apache License 2.0
@Test
public void testWriteParquetAvroGeneric() throws Exception {

	final File folder = TEMPORARY_FOLDER.newFolder();

	final Schema schema = Address.getClassSchema();

	final Collection<GenericRecord> data = new GenericTestDataCollection();

	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.enableCheckpointing(100);

	DataStream<GenericRecord> stream = env.addSource(
			new FiniteTestSource<>(data), new GenericRecordAvroTypeInfo(schema));

	stream.addSink(
			StreamingFileSink.forBulkFormat(
					Path.fromLocalFile(folder),
					ParquetAvroWriters.forGenericRecord(schema))
					.build());

	env.execute();

	List<Address> expected = Arrays.asList(
			new Address(1, "a", "b", "c", "12345"),
			new Address(2, "x", "y", "z", "98765"));

	validateResults(folder, SpecificData.get(), expected);
}
 
Example #14
Source File: ParquetStreamingFileSinkITCase.java    From flink with Apache License 2.0
@Test
public void testWriteParquetAvroSpecific() throws Exception {

	final File folder = TEMPORARY_FOLDER.newFolder();

	final List<Address> data = Arrays.asList(
			new Address(1, "a", "b", "c", "12345"),
			new Address(2, "p", "q", "r", "12345"),
			new Address(3, "x", "y", "z", "12345")
	);

	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.enableCheckpointing(100);

	DataStream<Address> stream = env.addSource(
			new FiniteTestSource<>(data), TypeInformation.of(Address.class));

	stream.addSink(
			StreamingFileSink.forBulkFormat(
					Path.fromLocalFile(folder),
					ParquetAvroWriters.forSpecificRecord(Address.class))
			.build());

	env.execute();

	validateResults(folder, SpecificData.get(), data);
}
 
Example #15
Source File: CompressWriterFactoryTest.java    From flink with Apache License 2.0
private File prepareCompressedFile(CompressWriterFactory<String> writer, List<String> lines) throws Exception {
	final File outDir = TEMPORARY_FOLDER.newFolder();

	final BucketAssigner<String, String> assigner = new BucketAssigner<String, String> () {
		@Override
		public String getBucketId(String element, BucketAssigner.Context context) {
			return "bucket";
		}

		@Override
		public SimpleVersionedSerializer<String> getSerializer() {
			return SimpleVersionedStringSerializer.INSTANCE;
		}
	};

	StreamingFileSink<String> sink = StreamingFileSink
		.forBulkFormat(new Path(outDir.toURI()), writer)
		.withBucketAssigner(assigner)
		.build();

	try (
		OneInputStreamOperatorTestHarness<String, Object> testHarness = new OneInputStreamOperatorTestHarness<>(new StreamSink<>(sink), 1, 1, 0)
	) {
		testHarness.setup();
		testHarness.open();

		int time = 0;
		for (String line: lines) {
			testHarness.processElement(new StreamRecord<>(line, ++time));
		}

		testHarness.snapshot(1, ++time);
		testHarness.notifyOfCompletedCheckpoint(1);
	}

	return outDir;
}
 
Example #16
Source File: SequenceStreamingFileSinkITCase.java    From flink with Apache License 2.0
@Test
public void testWriteSequenceFile() throws Exception {
	final File folder = TEMPORARY_FOLDER.newFolder();
	final Path testPath = Path.fromLocalFile(folder);

	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.enableCheckpointing(100);

	DataStream<Tuple2<Long, String>> stream = env.addSource(
			new FiniteTestSource<>(testData),
			TypeInformation.of(new TypeHint<Tuple2<Long, String>>() {})
	);

	stream.map(new MapFunction<Tuple2<Long, String>, Tuple2<LongWritable, Text>>() {
		@Override
		public Tuple2<LongWritable, Text> map(Tuple2<Long, String> value) throws Exception {
			return new Tuple2<>(new LongWritable(value.f0), new Text(value.f1));
		}
	}).addSink(
		StreamingFileSink.forBulkFormat(
			testPath,
			new SequenceFileWriterFactory<>(configuration, LongWritable.class, Text.class, "BZip2")
		).build());

	env.execute();

	validateResults(folder, testData);
}
 
Example #17
Source File: SequenceStreamingFileSinkITCase.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testWriteSequenceFile() throws Exception {
	final File folder = TEMPORARY_FOLDER.newFolder();
	final Path testPath = Path.fromLocalFile(folder);

	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.enableCheckpointing(100);

	DataStream<Tuple2<Long, String>> stream = env.addSource(
			new FiniteTestSource<>(testData),
			TypeInformation.of(new TypeHint<Tuple2<Long, String>>() {})
	);

	stream.map(new MapFunction<Tuple2<Long, String>, Tuple2<LongWritable, Text>>() {
		@Override
		public Tuple2<LongWritable, Text> map(Tuple2<Long, String> value) throws Exception {
			return new Tuple2<>(new LongWritable(value.f0), new Text(value.f1));
		}
	}).addSink(
		StreamingFileSink.forBulkFormat(
			testPath,
			new SequenceFileWriterFactory<>(configuration, LongWritable.class, Text.class, "BZip2")
		).build());

	env.execute();

	validateResults(folder, testData);
}
 
Example #18
Source File: KafkaToHDFSAvroJob.java    From flink-tutorials with Apache License 2.0
public static void main(String[] args) throws Exception {

        ParameterTool params = Utils.parseArgs(args);

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        KafkaDeserializationSchema<Message> schema = ClouderaRegistryKafkaDeserializationSchema
                .builder(Message.class)
                .setConfig(Utils.readSchemaRegistryProperties(params))
                .build();

        FlinkKafkaConsumer<Message> consumer = new FlinkKafkaConsumer<Message>(params.getRequired(K_KAFKA_TOPIC), schema, Utils.readKafkaProperties(params));

        DataStream<String> source = env.addSource(consumer)
                .name("Kafka Source")
                .uid("Kafka Source")
                .map(record -> record.getId() + "," + record.getName() + "," + record.getDescription())
                .name("ToOutputString");

        StreamingFileSink<String> sink = StreamingFileSink
                .forRowFormat(new Path(params.getRequired(K_HDFS_OUTPUT)), new SimpleStringEncoder<String>("UTF-8"))
                .build();

        source.addSink(sink)
                .name("FS Sink")
                .uid("FS Sink");

        source.print();

        env.execute("Flink Streaming Secured Job Sample");
    }
 
Example #19
Source File: ParquetStreamingFileSinkITCase.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testWriteParquetAvroGeneric() throws Exception {

	final File folder = TEMPORARY_FOLDER.newFolder();

	final Schema schema = Address.getClassSchema();

	final Collection<GenericRecord> data = new GenericTestDataCollection();

	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.enableCheckpointing(100);

	DataStream<GenericRecord> stream = env.addSource(
			new FiniteTestSource<>(data), new GenericRecordAvroTypeInfo(schema));

	stream.addSink(
			StreamingFileSink.forBulkFormat(
					Path.fromLocalFile(folder),
					ParquetAvroWriters.forGenericRecord(schema))
					.build());

	env.execute();

	List<Address> expected = Arrays.asList(
			new Address(1, "a", "b", "c", "12345"),
			new Address(2, "x", "y", "z", "98765"));

	validateResults(folder, SpecificData.get(), expected);
}
 
Example #20
Source File: ParquetStreamingFileSinkITCase.java    From Flink-CEPplus with Apache License 2.0
@Test
public void testWriteParquetAvroSpecific() throws Exception {

	final File folder = TEMPORARY_FOLDER.newFolder();

	final List<Address> data = Arrays.asList(
			new Address(1, "a", "b", "c", "12345"),
			new Address(2, "p", "q", "r", "12345"),
			new Address(3, "x", "y", "z", "12345")
	);

	final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
	env.setParallelism(1);
	env.enableCheckpointing(100);

	DataStream<Address> stream = env.addSource(
			new FiniteTestSource<>(data), TypeInformation.of(Address.class));

	stream.addSink(
			StreamingFileSink.forBulkFormat(
					Path.fromLocalFile(folder),
					ParquetAvroWriters.forSpecificRecord(Address.class))
			.build());

	env.execute();

	validateResults(folder, SpecificData.get(), data);
}
 
Example #21
Source File: HdfsSink2.java    From sylph with Apache License 2.0
@Override
public void run(DataStream<Row> stream)
{
    final RichSinkFunction<byte[]> sink = StreamingFileSink.forBulkFormat(
            new Path(writerDir),
            (BulkWriter.Factory<byte[]>) fsDataOutputStream -> new BulkWriter<byte[]>()
            {
                private final CompressionCodec codec = ReflectionUtils.newInstance(codecClass, new Configuration());
                private final CompressionOutputStream outputStream = codec.createOutputStream(fsDataOutputStream);
                private long bufferSize;

                @Override
                public void addElement(byte[] element)
                        throws IOException
                {
                    outputStream.write(element);
                    outputStream.write(10); //write \n
                    bufferSize += element.length;
                    if (bufferSize >= batchSize) {
                        outputStream.flush();
                        this.bufferSize = 0;
                    }
                }

                @Override
                public void flush()
                        throws IOException
                {
                    outputStream.flush();
                }

                @Override
                public void finish()
                        throws IOException
                {
                    outputStream.finish();
                    outputStream.close();
                }
            })
            .withBucketAssigner(new DateTimeBucketAssigner<>("yyyy-MM-dd--HH"))
            .build();
    stream.map(row -> {
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < row.getArity(); i++) {
            builder.append("\u0001").append(row.getField(i));
        }
        return builder.substring(1).getBytes(UTF_8);
    })
            .addSink(sink)
            .name(this.getClass().getSimpleName());
}
 
Example #22
Source File: StreamSQLTestProgram.java    From flink with Apache License 2.0
public static void main(String[] args) throws Exception {

		ParameterTool params = ParameterTool.fromArgs(args);
		String outputPath = params.getRequired("outputPath");
		String planner = params.get("planner", "old");

		final EnvironmentSettings.Builder builder = EnvironmentSettings.newInstance();
		builder.inStreamingMode();

		if (planner.equals("old")) {
			builder.useOldPlanner();
		} else if (planner.equals("blink")) {
			builder.useBlinkPlanner();
		}

		final EnvironmentSettings settings = builder.build();

		final StreamExecutionEnvironment sEnv = StreamExecutionEnvironment.getExecutionEnvironment();
		sEnv.setRestartStrategy(RestartStrategies.fixedDelayRestart(
			3,
			Time.of(10, TimeUnit.SECONDS)
		));
		sEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
		sEnv.enableCheckpointing(4000);
		sEnv.getConfig().setAutoWatermarkInterval(1000);

		final StreamTableEnvironment tEnv = StreamTableEnvironment.create(sEnv, settings);

		tEnv.registerTableSource("table1", new GeneratorTableSource(10, 100, 60, 0));
		tEnv.registerTableSource("table2", new GeneratorTableSource(5, 0.2f, 60, 5));

		int overWindowSizeSeconds = 1;
		int tumbleWindowSizeSeconds = 10;

		String overQuery = String.format(
			"SELECT " +
			"  key, " +
			"  rowtime, " +
			"  COUNT(*) OVER (PARTITION BY key ORDER BY rowtime RANGE BETWEEN INTERVAL '%d' SECOND PRECEDING AND CURRENT ROW) AS cnt " +
			"FROM table1",
			overWindowSizeSeconds);

		String tumbleQuery = String.format(
			"SELECT " +
			"  key, " +
			"  CASE SUM(cnt) / COUNT(*) WHEN 101 THEN 1 ELSE 99 END AS correct, " +
			"  TUMBLE_START(rowtime, INTERVAL '%d' SECOND) AS wStart, " +
			"  TUMBLE_ROWTIME(rowtime, INTERVAL '%d' SECOND) AS rowtime " +
			"FROM (%s) " +
			"WHERE rowtime > TIMESTAMP '1970-01-01 00:00:01' " +
			"GROUP BY key, TUMBLE(rowtime, INTERVAL '%d' SECOND)",
			tumbleWindowSizeSeconds,
			tumbleWindowSizeSeconds,
			overQuery,
			tumbleWindowSizeSeconds);

		String joinQuery = String.format(
			"SELECT " +
			"  t1.key, " +
			"  t2.rowtime AS rowtime, " +
			"  t2.correct," +
			"  t2.wStart " +
			"FROM table2 t1, (%s) t2 " +
			"WHERE " +
			"  t1.key = t2.key AND " +
			"  t1.rowtime BETWEEN t2.rowtime AND t2.rowtime + INTERVAL '%d' SECOND",
			tumbleQuery,
			tumbleWindowSizeSeconds);

		String finalAgg = String.format(
			"SELECT " +
			"  SUM(correct) AS correct, " +
			"  TUMBLE_START(rowtime, INTERVAL '20' SECOND) AS rowtime " +
			"FROM (%s) " +
			"GROUP BY TUMBLE(rowtime, INTERVAL '20' SECOND)",
			joinQuery);

		// get Table for SQL query
		Table result = tEnv.sqlQuery(finalAgg);
		// convert Table into append-only DataStream
		DataStream<Row> resultStream =
			tEnv.toAppendStream(result, Types.ROW(Types.INT, Types.SQL_TIMESTAMP));

		final StreamingFileSink<Row> sink = StreamingFileSink
			.forRowFormat(new Path(outputPath), (Encoder<Row>) (element, stream) -> {
				PrintStream out = new PrintStream(stream);
				out.println(element.toString());
			})
			.withBucketAssigner(new KeyBucketAssigner())
			.withRollingPolicy(OnCheckpointRollingPolicy.build())
			.build();

		resultStream
			// inject a KillMapper that forwards all records but terminates the first execution attempt
			.map(new KillMapper()).setParallelism(1)
			// add sink function
			.addSink(sink).setParallelism(1);

		sEnv.execute();
	}
 
Example #23
Source File: StreamSQLTestProgram.java    From Flink-CEPplus with Apache License 2.0
public static void main(String[] args) throws Exception {

		ParameterTool params = ParameterTool.fromArgs(args);
		String outputPath = params.getRequired("outputPath");

		StreamExecutionEnvironment sEnv = StreamExecutionEnvironment.getExecutionEnvironment();
		sEnv.setRestartStrategy(RestartStrategies.fixedDelayRestart(
			3,
			Time.of(10, TimeUnit.SECONDS)
		));
		sEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
		sEnv.enableCheckpointing(4000);
		sEnv.getConfig().setAutoWatermarkInterval(1000);

		StreamTableEnvironment tEnv = StreamTableEnvironment.create(sEnv);

		tEnv.registerTableSource("table1", new GeneratorTableSource(10, 100, 60, 0));
		tEnv.registerTableSource("table2", new GeneratorTableSource(5, 0.2f, 60, 5));

		int overWindowSizeSeconds = 1;
		int tumbleWindowSizeSeconds = 10;

		String overQuery = String.format(
			"SELECT " +
			"  key, " +
			"  rowtime, " +
			"  COUNT(*) OVER (PARTITION BY key ORDER BY rowtime RANGE BETWEEN INTERVAL '%d' SECOND PRECEDING AND CURRENT ROW) AS cnt " +
			"FROM table1",
			overWindowSizeSeconds);

		String tumbleQuery = String.format(
			"SELECT " +
			"  key, " +
			"  CASE SUM(cnt) / COUNT(*) WHEN 101 THEN 1 ELSE 99 END AS correct, " +
			"  TUMBLE_START(rowtime, INTERVAL '%d' SECOND) AS wStart, " +
			"  TUMBLE_ROWTIME(rowtime, INTERVAL '%d' SECOND) AS rowtime " +
			"FROM (%s) " +
			"WHERE rowtime > TIMESTAMP '1970-01-01 00:00:01' " +
			"GROUP BY key, TUMBLE(rowtime, INTERVAL '%d' SECOND)",
			tumbleWindowSizeSeconds,
			tumbleWindowSizeSeconds,
			overQuery,
			tumbleWindowSizeSeconds);

		String joinQuery = String.format(
			"SELECT " +
			"  t1.key, " +
			"  t2.rowtime AS rowtime, " +
			"  t2.correct," +
			"  t2.wStart " +
			"FROM table2 t1, (%s) t2 " +
			"WHERE " +
			"  t1.key = t2.key AND " +
			"  t1.rowtime BETWEEN t2.rowtime AND t2.rowtime + INTERVAL '%d' SECOND",
			tumbleQuery,
			tumbleWindowSizeSeconds);

		String finalAgg = String.format(
			"SELECT " +
			"  SUM(correct) AS correct, " +
			"  TUMBLE_START(rowtime, INTERVAL '20' SECOND) AS rowtime " +
			"FROM (%s) " +
			"GROUP BY TUMBLE(rowtime, INTERVAL '20' SECOND)",
			joinQuery);

		// get Table for SQL query
		Table result = tEnv.sqlQuery(finalAgg);
		// convert Table into append-only DataStream
		DataStream<Row> resultStream =
			tEnv.toAppendStream(result, Types.ROW(Types.INT, Types.SQL_TIMESTAMP));

		final StreamingFileSink<Row> sink = StreamingFileSink
			.forRowFormat(new Path(outputPath), (Encoder<Row>) (element, stream) -> {
				PrintStream out = new PrintStream(stream);
				out.println(element.toString());
			})
			.withBucketAssigner(new KeyBucketAssigner())
			.withRollingPolicy(OnCheckpointRollingPolicy.build())
			.build();

		resultStream
			// inject a KillMapper that forwards all records but terminates the first execution attempt
			.map(new KillMapper()).setParallelism(1)
			// add sink function
			.addSink(sink).setParallelism(1);

		sEnv.execute();
	}
 
Example #24
Source File: StreamSQLTestProgram.java    From flink with Apache License 2.0
public static void main(String[] args) throws Exception {

		ParameterTool params = ParameterTool.fromArgs(args);
		String outputPath = params.getRequired("outputPath");
		String planner = params.get("planner", "blink");

		final EnvironmentSettings.Builder builder = EnvironmentSettings.newInstance();
		builder.inStreamingMode();

		if (planner.equals("old")) {
			builder.useOldPlanner();
		} else if (planner.equals("blink")) {
			builder.useBlinkPlanner();
		}

		final EnvironmentSettings settings = builder.build();

		final StreamExecutionEnvironment sEnv = StreamExecutionEnvironment.getExecutionEnvironment();
		sEnv.setRestartStrategy(RestartStrategies.fixedDelayRestart(
			3,
			Time.of(10, TimeUnit.SECONDS)
		));
		sEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
		sEnv.enableCheckpointing(4000);
		sEnv.getConfig().setAutoWatermarkInterval(1000);

		final StreamTableEnvironment tEnv = StreamTableEnvironment.create(sEnv, settings);

		((TableEnvironmentInternal) tEnv).registerTableSourceInternal("table1", new GeneratorTableSource(10, 100, 60, 0));
		((TableEnvironmentInternal) tEnv).registerTableSourceInternal("table2", new GeneratorTableSource(5, 0.2f, 60, 5));

		int overWindowSizeSeconds = 1;
		int tumbleWindowSizeSeconds = 10;

		String overQuery = String.format(
			"SELECT " +
			"  key, " +
			"  rowtime, " +
			"  COUNT(*) OVER (PARTITION BY key ORDER BY rowtime RANGE BETWEEN INTERVAL '%d' SECOND PRECEDING AND CURRENT ROW) AS cnt " +
			"FROM table1",
			overWindowSizeSeconds);

		String tumbleQuery = String.format(
			"SELECT " +
			"  key, " +
			"  CASE SUM(cnt) / COUNT(*) WHEN 101 THEN 1 ELSE 99 END AS correct, " +
			"  TUMBLE_START(rowtime, INTERVAL '%d' SECOND) AS wStart, " +
			"  TUMBLE_ROWTIME(rowtime, INTERVAL '%d' SECOND) AS rowtime " +
			"FROM (%s) " +
			"WHERE rowtime > TIMESTAMP '1970-01-01 00:00:01' " +
			"GROUP BY key, TUMBLE(rowtime, INTERVAL '%d' SECOND)",
			tumbleWindowSizeSeconds,
			tumbleWindowSizeSeconds,
			overQuery,
			tumbleWindowSizeSeconds);

		String joinQuery = String.format(
			"SELECT " +
			"  t1.key, " +
			"  t2.rowtime AS rowtime, " +
			"  t2.correct," +
			"  t2.wStart " +
			"FROM table2 t1, (%s) t2 " +
			"WHERE " +
			"  t1.key = t2.key AND " +
			"  t1.rowtime BETWEEN t2.rowtime AND t2.rowtime + INTERVAL '%d' SECOND",
			tumbleQuery,
			tumbleWindowSizeSeconds);

		String finalAgg = String.format(
			"SELECT " +
			"  SUM(correct) AS correct, " +
			"  TUMBLE_START(rowtime, INTERVAL '20' SECOND) AS rowtime " +
			"FROM (%s) " +
			"GROUP BY TUMBLE(rowtime, INTERVAL '20' SECOND)",
			joinQuery);

		// get Table for SQL query
		Table result = tEnv.sqlQuery(finalAgg);
		// convert Table into append-only DataStream
		DataStream<Row> resultStream =
			tEnv.toAppendStream(result, Types.ROW(Types.INT, Types.SQL_TIMESTAMP));

		final StreamingFileSink<Row> sink = StreamingFileSink
			.forRowFormat(new Path(outputPath), (Encoder<Row>) (element, stream) -> {
				PrintStream out = new PrintStream(stream);
				out.println(element.toString());
			})
			.withBucketAssigner(new KeyBucketAssigner())
			.withRollingPolicy(OnCheckpointRollingPolicy.build())
			.build();

		resultStream
			// inject a KillMapper that forwards all records but terminates the first execution attempt
			.map(new KillMapper()).setParallelism(1)
			// add sink function
			.addSink(sink).setParallelism(1);

		sEnv.execute();
	}