org.apache.flink.api.common.serialization.BulkWriter Java Examples

The following examples show how to use org.apache.flink.api.common.serialization.BulkWriter. Vote up the examples you find useful and vote down those you do not; follow the links above each example to reach the original project or source file. Related API usage is listed in the sidebar.
Example #1
Source File: TestUtils.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a bulk-format {@link StreamingFileSink} with a caller-chosen bucket ID type and wraps it
 * in a test harness.
 *
 * <p>The sink writes below the URI of {@code outDir} and is configured with the given bucket
 * assigner, the rolling policy returned by the in-scope {@code build()} call, the bucket check
 * interval, the bucket factory and the output file config.
 *
 * @param outDir directory whose URI becomes the sink's base path
 * @param totalParallelism forwarded to the harness constructor after {@code MAX_PARALLELISM}
 * @param taskIdx forwarded to the harness constructor as its last argument
 * @param bucketCheckInterval value handed to {@code withBucketCheckInterval}
 * @param bucketer bucket assigner installed via {@code withNewBucketAssigner}
 * @param writer bulk writer factory handed to {@code forBulkFormat}
 * @param bucketFactory value handed to {@code withBucketFactory}
 * @param outputFileConfig value handed to {@code withOutputFileConfig}
 * @return a harness around a {@link StreamSink} that wraps the configured sink
 * @throws Exception declared by the signature; presumably raised by harness construction
 */
static <ID> OneInputStreamOperatorTestHarness<Tuple2<String, Integer>, Object> createTestSinkWithCustomizedBulkEncoder(
		final File outDir,
		final int totalParallelism,
		final int taskIdx,
		final long bucketCheckInterval,
		final BucketAssigner<Tuple2<String, Integer>, ID> bucketer,
		final BulkWriter.Factory<Tuple2<String, Integer>> writer,
		final BucketFactory<Tuple2<String, Integer>, ID> bucketFactory,
		final OutputFileConfig outputFileConfig) throws Exception {

	final Path basePath = new Path(outDir.toURI());

	final StreamingFileSink<Tuple2<String, Integer>> bulkSink = StreamingFileSink
			.forBulkFormat(basePath, writer)
			.withNewBucketAssigner(bucketer)
			.withRollingPolicy(build())
			.withBucketCheckInterval(bucketCheckInterval)
			.withBucketFactory(bucketFactory)
			.withOutputFileConfig(outputFileConfig)
			.build();

	final StreamSink<Tuple2<String, Integer>> sinkOperator = new StreamSink<>(bulkSink);
	return new OneInputStreamOperatorTestHarness<>(sinkOperator, MAX_PARALLELISM, totalParallelism, taskIdx);
}
 
Example #2
Source File: TestUtils.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a bulk-format {@link StreamingFileSink} with String bucket IDs and an explicit part-file
 * prefix and suffix, then wraps it in a test harness.
 *
 * @param outDir directory whose URI becomes the sink's base path
 * @param totalParallelism forwarded to the harness constructor after {@code MAX_PARALLELISM}
 * @param taskIdx forwarded to the harness constructor as its last argument
 * @param bucketCheckInterval value handed to {@code withBucketCheckInterval}
 * @param bucketer value handed to {@code withBucketAssigner}
 * @param writer bulk writer factory handed to {@code forBulkFormat}
 * @param bucketFactory value handed to {@code withBucketFactory}
 * @param partFilePrefix value handed to {@code withPartFilePrefix}
 * @param partFileSuffix value handed to {@code withPartFileSuffix}
 * @return a harness around a {@link StreamSink} that wraps the configured sink
 * @throws Exception declared by the signature; presumably raised by harness construction
 */
static OneInputStreamOperatorTestHarness<Tuple2<String, Integer>, Object> createTestSinkWithBulkEncoder(
		final File outDir,
		final int totalParallelism,
		final int taskIdx,
		final long bucketCheckInterval,
		final BucketAssigner<Tuple2<String, Integer>, String> bucketer,
		final BulkWriter.Factory<Tuple2<String, Integer>> writer,
		final BucketFactory<Tuple2<String, Integer>, String> bucketFactory,
		final String partFilePrefix,
		final String partFileSuffix) throws Exception {

	final Path basePath = new Path(outDir.toURI());

	final StreamingFileSink<Tuple2<String, Integer>> bulkSink = StreamingFileSink
		.forBulkFormat(basePath, writer)
		.withBucketAssigner(bucketer)
		.withBucketCheckInterval(bucketCheckInterval)
		.withBucketFactory(bucketFactory)
		.withPartFilePrefix(partFilePrefix)
		.withPartFileSuffix(partFileSuffix)
		.build();

	final StreamSink<Tuple2<String, Integer>> sinkOperator = new StreamSink<>(bulkSink);
	return new OneInputStreamOperatorTestHarness<>(sinkOperator, MAX_PARALLELISM, totalParallelism, taskIdx);
}
 
Example #3
Source File: TestUtils.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Convenience overload that delegates to the eight-argument
 * {@code createTestSinkWithCustomizedBulkEncoder}, supplying the configuration produced by
 * {@code OutputFileConfig.builder().build()} as the output file config.
 *
 * <p>All other arguments are passed through unchanged.
 *
 * @return whatever the delegated call returns
 * @throws Exception propagated from the delegated call
 */
static <ID> OneInputStreamOperatorTestHarness<Tuple2<String, Integer>, Object> createTestSinkWithCustomizedBulkEncoder(
		final File outDir,
		final int totalParallelism,
		final int taskIdx,
		final long bucketCheckInterval,
		final BucketAssigner<Tuple2<String, Integer>, ID> bucketer,
		final BulkWriter.Factory<Tuple2<String, Integer>> writer,
		final BucketFactory<Tuple2<String, Integer>, ID> bucketFactory) throws Exception {

	final OutputFileConfig builderDefaults = OutputFileConfig.builder().build();

	return createTestSinkWithCustomizedBulkEncoder(
			outDir, totalParallelism, taskIdx, bucketCheckInterval,
			bucketer, writer, bucketFactory,
			builderDefaults);
}
 
Example #4
Source File: FileSystemTableSink.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a {@link BulkWriter} on {@code out} via the wrapped {@code factory} and decorates it so
 * that every element is first passed through {@code computer.projectColumnsToWrite} before being
 * handed to the underlying writer. {@code flush} and {@code finish} are forwarded as-is.
 *
 * @param out stream handed to the wrapped factory
 * @return the decorating writer
 * @throws IOException if the wrapped factory fails to create its writer
 */
@Override
public BulkWriter<RowData> create(FSDataOutputStream out) throws IOException {
	final BulkWriter<RowData> delegate = factory.create(out);

	return new BulkWriter<RowData>() {

		@Override
		public void addElement(RowData element) throws IOException {
			final RowData projected = computer.projectColumnsToWrite(element);
			delegate.addElement(projected);
		}

		@Override
		public void flush() throws IOException {
			delegate.flush();
		}

		@Override
		public void finish() throws IOException {
			delegate.finish();
		}
	};
}
 
Example #5
Source File: TestUtils.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Convenience overload that delegates to the nine-argument {@code createTestSinkWithBulkEncoder},
 * supplying {@code PartFileConfig.DEFAULT_PART_PREFIX} and
 * {@code PartFileConfig.DEFAULT_PART_SUFFIX} as the part-file prefix and suffix.
 *
 * <p>All other arguments are passed through unchanged.
 *
 * @return whatever the delegated call returns
 * @throws Exception propagated from the delegated call
 */
static OneInputStreamOperatorTestHarness<Tuple2<String, Integer>, Object> createTestSinkWithBulkEncoder(
		final File outDir,
		final int totalParallelism,
		final int taskIdx,
		final long bucketCheckInterval,
		final BucketAssigner<Tuple2<String, Integer>, String> bucketer,
		final BulkWriter.Factory<Tuple2<String, Integer>> writer,
		final BucketFactory<Tuple2<String, Integer>, String> bucketFactory) throws Exception {

	final String defaultPrefix = PartFileConfig.DEFAULT_PART_PREFIX;
	final String defaultSuffix = PartFileConfig.DEFAULT_PART_SUFFIX;

	return createTestSinkWithBulkEncoder(
			outDir, totalParallelism, taskIdx, bucketCheckInterval,
			bucketer, writer, bucketFactory,
			defaultPrefix, defaultSuffix);
}
 
Example #6
Source File: TestUtils.java    From Flink-CEPplus with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a bulk-format {@link StreamingFileSink} with String bucket IDs and wraps it in a test
 * harness.
 *
 * @param outDir directory whose URI becomes the sink's base path
 * @param totalParallelism forwarded to the harness constructor after {@code MAX_PARALLELISM}
 * @param taskIdx forwarded to the harness constructor as its last argument
 * @param bucketCheckInterval value handed to {@code withBucketCheckInterval}
 * @param bucketer value handed to {@code withBucketAssigner}
 * @param writer bulk writer factory handed to {@code forBulkFormat}
 * @param bucketFactory value handed to {@code withBucketFactory}
 * @return a harness around a {@link StreamSink} that wraps the configured sink
 * @throws Exception declared by the signature; presumably raised by harness construction
 */
static OneInputStreamOperatorTestHarness<Tuple2<String, Integer>, Object> createTestSinkWithBulkEncoder(
		final File outDir,
		final int totalParallelism,
		final int taskIdx,
		final long bucketCheckInterval,
		final BucketAssigner<Tuple2<String, Integer>, String> bucketer,
		final BulkWriter.Factory<Tuple2<String, Integer>> writer,
		final BucketFactory<Tuple2<String, Integer>, String> bucketFactory) throws Exception {

	final Path basePath = new Path(outDir.toURI());

	final StreamingFileSink<Tuple2<String, Integer>> bulkSink = StreamingFileSink
			.forBulkFormat(basePath, writer)
			.withBucketAssigner(bucketer)
			.withBucketCheckInterval(bucketCheckInterval)
			.withBucketFactory(bucketFactory)
			.build();

	final StreamSink<Tuple2<String, Integer>> sinkOperator = new StreamSink<>(bulkSink);
	return new OneInputStreamOperatorTestHarness<>(sinkOperator, MAX_PARALLELISM, totalParallelism, taskIdx);
}
 
Example #7
Source File: TestUtils.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Builds a bulk-format {@link StreamingFileSink} with String bucket IDs and an explicit
 * {@code OutputFileConfig}, then wraps it in a test harness.
 *
 * <p>The rolling policy is whatever the in-scope {@code build()} call returns.
 *
 * @param outDir directory whose URI becomes the sink's base path
 * @param totalParallelism forwarded to the harness constructor after {@code MAX_PARALLELISM}
 * @param taskIdx forwarded to the harness constructor as its last argument
 * @param bucketCheckInterval value handed to {@code withBucketCheckInterval}
 * @param bucketer value handed to {@code withBucketAssigner}
 * @param writer bulk writer factory handed to {@code forBulkFormat}
 * @param bucketFactory value handed to {@code withBucketFactory}
 * @param outputFileConfig value handed to {@code withOutputFileConfig}
 * @return a harness around a {@link StreamSink} that wraps the configured sink
 * @throws Exception declared by the signature; presumably raised by harness construction
 */
static OneInputStreamOperatorTestHarness<Tuple2<String, Integer>, Object> createTestSinkWithBulkEncoder(
		final File outDir,
		final int totalParallelism,
		final int taskIdx,
		final long bucketCheckInterval,
		final BucketAssigner<Tuple2<String, Integer>, String> bucketer,
		final BulkWriter.Factory<Tuple2<String, Integer>> writer,
		final BucketFactory<Tuple2<String, Integer>, String> bucketFactory,
		final OutputFileConfig outputFileConfig) throws Exception {

	final Path basePath = new Path(outDir.toURI());

	final StreamingFileSink<Tuple2<String, Integer>> bulkSink = StreamingFileSink
		.forBulkFormat(basePath, writer)
		.withBucketAssigner(bucketer)
		.withBucketCheckInterval(bucketCheckInterval)
		.withRollingPolicy(build())
		.withBucketFactory(bucketFactory)
		.withOutputFileConfig(outputFileConfig)
		.build();

	final StreamSink<Tuple2<String, Integer>> sinkOperator = new StreamSink<>(bulkSink);
	return new OneInputStreamOperatorTestHarness<>(sinkOperator, MAX_PARALLELISM, totalParallelism, taskIdx);
}
 
Example #8
Source File: TestUtils.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Convenience overload that delegates to the eight-argument {@code createTestSinkWithBulkEncoder},
 * supplying the configuration produced by {@code OutputFileConfig.builder().build()} as the
 * output file config.
 *
 * <p>All other arguments are passed through unchanged.
 *
 * @return whatever the delegated call returns
 * @throws Exception propagated from the delegated call
 */
static OneInputStreamOperatorTestHarness<Tuple2<String, Integer>, Object> createTestSinkWithBulkEncoder(
		final File outDir,
		final int totalParallelism,
		final int taskIdx,
		final long bucketCheckInterval,
		final BucketAssigner<Tuple2<String, Integer>, String> bucketer,
		final BulkWriter.Factory<Tuple2<String, Integer>> writer,
		final BucketFactory<Tuple2<String, Integer>, String> bucketFactory) throws Exception {

	final OutputFileConfig builderDefaults = OutputFileConfig.builder().build();

	return createTestSinkWithBulkEncoder(
			outDir, totalParallelism, taskIdx, bucketCheckInterval,
			bucketer, writer, bucketFactory,
			builderDefaults);
}
 
Example #9
Source File: HiveTableSink.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Selects a bulk writer factory based on the serialization library name of the storage
 * descriptor.
 *
 * <p>The format row type is built from the first
 * {@code tableSchema.getFieldCount() - partitionColumns.length} fields of {@code tableSchema}.
 * The format configuration is a copy of {@code jobConf} with all SerDe parameters applied on top.
 *
 * @param partitionColumns partition column names; only the array length is used here
 * @param sd storage descriptor providing the SerDe library name and parameters
 * @return a Parquet factory if the lower-cased library name contains {@code "parquet"}, an ORC
 *         factory if it contains {@code "orc"}, otherwise {@link Optional#empty()}
 */
private Optional<BulkWriter.Factory<RowData>> createBulkWriterFactory(String[] partitionColumns,
		StorageDescriptor sd) {
	// Locale.ROOT keeps the lower-casing independent of the JVM's default locale, so the
	// substring checks below behave the same on every machine.
	String serLib = sd.getSerdeInfo().getSerializationLib().toLowerCase(java.util.Locale.ROOT);
	int formatFieldCount = tableSchema.getFieldCount() - partitionColumns.length;
	String[] formatNames = new String[formatFieldCount];
	LogicalType[] formatTypes = new LogicalType[formatFieldCount];
	for (int i = 0; i < formatFieldCount; i++) {
		formatNames[i] = tableSchema.getFieldName(i).get();
		formatTypes[i] = tableSchema.getFieldDataType(i).get().getLogicalType();
	}
	RowType formatType = RowType.of(formatTypes, formatNames);
	Configuration formatConf = new Configuration(jobConf);
	sd.getSerdeInfo().getParameters().forEach(formatConf::set);
	if (serLib.contains("parquet")) {
		// NOTE(review): the third argument is true for Hive versions starting with "3." —
		// its exact meaning is defined by ParquetRowDataBuilder; confirm there.
		return Optional.of(ParquetRowDataBuilder.createWriterFactory(
				formatType, formatConf, hiveVersion.startsWith("3.")));
	} else if (serLib.contains("orc")) {
		TypeDescription typeDescription = OrcSplitReaderUtil.logicalTypeToOrcType(formatType);
		return Optional.of(hiveShim.createOrcBulkWriterFactory(
				formatConf, typeDescription.toString(), formatTypes));
	} else {
		return Optional.empty();
	}
}
 
Example #10
Source File: AvroFileSystemFormatFactory.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a {@link BulkWriter} for {@code RowData} that converts each row to an Avro
 * {@code GenericRecord} and forwards it to a record writer obtained from {@code factory}.
 *
 * <p>The row converter and the Avro schema are both derived from {@code rowType} once per call
 * and reused for every element. {@code flush} and {@code finish} are forwarded as-is.
 *
 * @param out stream handed to the wrapped factory
 * @return the converting writer
 * @throws IOException if the wrapped factory fails to create its writer
 */
@Override
public BulkWriter<RowData> create(FSDataOutputStream out) throws IOException {
	final BulkWriter<GenericRecord> recordWriter = factory.create(out);
	final AvroRowDataSerializationSchema.SerializationRuntimeConverter rowConverter =
			AvroRowDataSerializationSchema.createRowConverter(rowType);
	final Schema avroSchema = AvroSchemaConverter.convertToSchema(rowType);

	return new BulkWriter<RowData>() {

		@Override
		public void addElement(RowData element) throws IOException {
			recordWriter.addElement((GenericRecord) rowConverter.convert(avroSchema, element));
		}

		@Override
		public void flush() throws IOException {
			recordWriter.flush();
		}

		@Override
		public void finish() throws IOException {
			recordWriter.finish();
		}
	};
}
 
Example #11
Source File: StreamingFileSink.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a builder holding the given settings.
 *
 * <p>Every reference argument is null-checked up front, so a misconfigured builder fails at
 * construction time rather than when the value is first used.
 *
 * @param basePath base path of the sink; must not be null
 * @param writerFactory factory for the bulk writers; must not be null
 * @param assigner bucket assigner; must not be null
 * @param bucketCheckInterval bucket check interval, stored as given
 * @param bucketFactory bucket factory; must not be null
 * @param partFilePrefix part-file prefix; must not be null
 * @param partFileSuffix part-file suffix; must not be null
 * @throws NullPointerException if any reference argument is null
 */
private BulkFormatBuilder(
		Path basePath,
		BulkWriter.Factory<IN> writerFactory,
		BucketAssigner<IN, BucketID> assigner,
		long bucketCheckInterval,
		BucketFactory<IN, BucketID> bucketFactory,
		String partFilePrefix,
		String partFileSuffix) {
	this.basePath = Preconditions.checkNotNull(basePath);
	// Checked like the other reference arguments; it was previously stored unchecked.
	this.writerFactory = Preconditions.checkNotNull(writerFactory);
	this.bucketAssigner = Preconditions.checkNotNull(assigner);
	this.bucketCheckInterval = bucketCheckInterval;
	this.bucketFactory = Preconditions.checkNotNull(bucketFactory);
	this.partFilePrefix = Preconditions.checkNotNull(partFilePrefix);
	this.partFileSuffix = Preconditions.checkNotNull(partFileSuffix);
}
 
Example #12
Source File: StreamingFileSink.java    From flink with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a builder holding the given settings.
 *
 * <p>Every reference argument is null-checked up front, so a misconfigured builder fails at
 * construction time rather than when the value is first used.
 *
 * @param basePath base path of the sink; must not be null
 * @param writerFactory factory for the bulk writers; must not be null
 * @param assigner bucket assigner; must not be null
 * @param policy rolling policy; must not be null
 * @param bucketCheckInterval bucket check interval, stored as given
 * @param bucketFactory bucket factory; must not be null
 * @param outputFileConfig output file configuration; must not be null
 * @throws NullPointerException if any reference argument is null
 */
protected BulkFormatBuilder(
		Path basePath,
		BulkWriter.Factory<IN> writerFactory,
		BucketAssigner<IN, BucketID> assigner,
		CheckpointRollingPolicy<IN, BucketID> policy,
		long bucketCheckInterval,
		BucketFactory<IN, BucketID> bucketFactory,
		OutputFileConfig outputFileConfig) {
	this.basePath = Preconditions.checkNotNull(basePath);
	// Checked like the other reference arguments; it was previously stored unchecked.
	this.writerFactory = Preconditions.checkNotNull(writerFactory);
	this.bucketAssigner = Preconditions.checkNotNull(assigner);
	this.rollingPolicy = Preconditions.checkNotNull(policy);
	this.bucketCheckInterval = bucketCheckInterval;
	this.bucketFactory = Preconditions.checkNotNull(bucketFactory);
	this.outputFileConfig = Preconditions.checkNotNull(outputFileConfig);
}
 
Example #13
Source File: FileSystemTableSink.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an {@link OutputFormat} that writes all records to the single file at {@code path}
 * through a {@link BulkWriter} obtained from {@code factory}.
 *
 * <p>{@code open} creates the file in {@code OVERWRITE} mode and builds the writer on top of the
 * resulting stream. {@code close} flushes and finishes the writer and then closes the stream.
 * The stream is closed here because {@code BulkWriter#finish()} is specified not to close it.
 *
 * @param factory factory used to create the bulk writer on the opened stream
 * @param path file to write to
 * @return the output format
 */
private static OutputFormat<RowData> createBulkWriterOutputFormat(
		BulkWriter.Factory<RowData> factory,
		Path path) {
	return new OutputFormat<RowData>() {

		private static final long serialVersionUID = 1L;

		private transient BulkWriter<RowData> writer;

		// Kept so that close() can release the underlying file handle.
		private transient FSDataOutputStream stream;

		@Override
		public void configure(Configuration parameters) {
		}

		@Override
		public void open(int taskNumber, int numTasks) throws IOException {
			// Assign the stream first so it can still be closed if writer creation fails.
			this.stream = path.getFileSystem().create(path, FileSystem.WriteMode.OVERWRITE);
			this.writer = factory.create(stream);
		}

		@Override
		public void writeRecord(RowData record) throws IOException {
			writer.addElement(record);
		}

		@Override
		public void close() throws IOException {
			try {
				// writer is null if open() was never called or failed before creating it.
				if (writer != null) {
					writer.flush();
					writer.finish();
				}
			} finally {
				// Always release the stream, even when flush/finish throws.
				if (stream != null) {
					stream.close();
				}
			}
		}
	};
}
 
Example #14
Source File: CompressWriterFactory.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a writer on {@code out}, compressed only when a codec name is configured.
 *
 * <p>If {@code hadoopCodecName} is null or blank after trimming, a
 * {@code NoCompressionBulkWriter} is returned. Otherwise {@code initializeCompressionCodec()} is
 * invoked and the stream is wrapped with {@code hadoopCodec.createOutputStream} before being
 * handed to a {@code HadoopCompressionBulkWriter}.
 *
 * @param out stream to write to
 * @return the plain or compressing writer
 * @throws IOException if the codec fails to create its output stream
 */
@Override
public BulkWriter<IN> create(FSDataOutputStream out) throws IOException {
	final boolean codecConfigured = hadoopCodecName != null && !hadoopCodecName.trim().isEmpty();

	if (codecConfigured) {
		initializeCompressionCodec();
		return new HadoopCompressionBulkWriter<>(hadoopCodec.createOutputStream(out), extractor);
	}

	return new NoCompressionBulkWriter<>(out, extractor);
}
 
Example #15
Source File: HiveShimV200.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an {@code OrcBulkWriterFactory} for {@code RowData}, using a {@code RowDataVectorizer}
 * built from the schema string and field types, an empty {@link Properties} object and the given
 * configuration.
 *
 * @param conf configuration handed to the factory
 * @param schema schema string handed to the vectorizer
 * @param fieldTypes logical field types handed to the vectorizer
 * @return the ORC bulk writer factory
 */
@Override
public BulkWriter.Factory<RowData> createOrcBulkWriterFactory(
		Configuration conf, String schema, LogicalType[] fieldTypes) {
	final RowDataVectorizer vectorizer = new RowDataVectorizer(schema, fieldTypes);
	final Properties emptyWriterProperties = new Properties();
	return new OrcBulkWriterFactory<>(vectorizer, emptyWriterProperties, conf);
}
 
Example #16
Source File: FileSystemTableSink.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Returns an output format factory backed by whatever {@code createWriter()} produced.
 *
 * <p>If the writer is an {@code Encoder}, the returned factory builds encoder output formats;
 * otherwise the writer is treated as a {@code BulkWriter.Factory}. The casts happen inside the
 * returned lambdas, i.e. when a format is requested for a path.
 *
 * @return factory mapping a path to an output format
 */
@SuppressWarnings("unchecked")
private OutputFormatFactory<RowData> createOutputFormatFactory() {
	final Object writer = createWriter();
	if (writer instanceof Encoder) {
		return path -> createEncoderOutputFormat((Encoder<RowData>) writer, path);
	}
	return path -> createBulkWriterOutputFormat((BulkWriter.Factory<RowData>) writer, path);
}
 
Example #17
Source File: FileSystemTableSink.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Asks the format factory for an encoder and for a bulk writer factory and returns the first one
 * that is present, preferring the encoder.
 *
 * <p>The writer context exposes {@code schema}, {@code partitionKeys} and the format options.
 * The options are a view over a configuration filled from {@code properties}, scoped by the
 * prefix {@code factoryIdentifier() + "."}.
 *
 * @return the {@code Encoder} if available, otherwise the {@code BulkWriter.Factory}
 * @throws TableException if the format factory provides neither
 */
private Object createWriter() {
	final FileSystemFormatFactory formatFactory = createFormatFactory(properties);
	final Configuration conf = new Configuration();
	properties.forEach(conf::setString);

	final FileSystemFormatFactory.WriterContext writerContext = new FileSystemFormatFactory.WriterContext() {

		@Override
		public TableSchema getSchema() {
			return schema;
		}

		@Override
		public ReadableConfig getFormatOptions() {
			final String optionPrefix = formatFactory.factoryIdentifier() + ".";
			return new DelegatingConfiguration(conf, optionPrefix);
		}

		@Override
		public List<String> getPartitionKeys() {
			return partitionKeys;
		}
	};

	// Both lookups are performed up front, before deciding which result to use.
	final Optional<Encoder<RowData>> maybeEncoder = formatFactory.createEncoder(writerContext);
	final Optional<BulkWriter.Factory<RowData>> maybeBulkFactory = formatFactory.createBulkWriterFactory(writerContext);

	if (maybeEncoder.isPresent()) {
		return maybeEncoder.get();
	}
	if (maybeBulkFactory.isPresent()) {
		return maybeBulkFactory.get();
	}
	throw new TableException(
			formatFactory + " format should implement at least one Encoder or BulkWriter");
}
 
Example #18
Source File: BulkPartWriter.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Opens a part-file writer on the given stream by creating a fresh {@link BulkWriter} through
 * {@code writerFactory}.
 *
 * @param bucketId bucket the part file belongs to
 * @param stream stream to write to; must not be null
 * @param path only null-checked here, not otherwise used
 * @param creationTime creation time passed on to the part writer
 * @return a new {@code BulkPartWriter}
 * @throws IOException if the writer factory fails
 */
@Override
public PartFileWriter<IN, BucketID> openNew(
		final BucketID bucketId,
		final RecoverableFsDataOutputStream stream,
		final Path path,
		final long creationTime) throws IOException {

	Preconditions.checkNotNull(stream);
	Preconditions.checkNotNull(path);

	return new BulkPartWriter<>(bucketId, stream, writerFactory.create(stream), creationTime);
}
 
Example #19
Source File: OrcFileSystemFormatFactory.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an ORC bulk writer factory for the format fields described by {@code context}.
 *
 * <p>The logical types of the format fields, together with the format field names, define the
 * ORC type description. Writer properties come from {@code getOrcProperties} applied to the
 * format options, and a new {@code Configuration} is passed alongside.
 *
 * @param context writer context supplying field types, field names and format options
 * @return an {@link Optional} that always contains the factory
 */
@Override
public Optional<BulkWriter.Factory<RowData>> createBulkWriterFactory(WriterContext context) {
	final DataType[] fieldDataTypes = context.getFormatFieldTypes();
	final LogicalType[] orcTypes = new LogicalType[fieldDataTypes.length];
	for (int i = 0; i < fieldDataTypes.length; i++) {
		orcTypes[i] = fieldDataTypes[i].getLogicalType();
	}

	final RowType rowType = RowType.of(orcTypes, context.getFormatFieldNames());
	final TypeDescription typeDescription = OrcSplitReaderUtil.logicalTypeToOrcType(rowType);

	final RowDataVectorizer vectorizer = new RowDataVectorizer(typeDescription.toString(), orcTypes);
	final OrcBulkWriterFactory<RowData> orcFactory = new OrcBulkWriterFactory<>(
			vectorizer,
			getOrcProperties(context.getFormatOptions()),
			new Configuration());
	return Optional.of(orcFactory);
}
 
Example #20
Source File: OrcBulkWriterFactory.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates an {@code OrcBulkWriter} that writes to {@code out}.
 *
 * <p>The writer options come from {@code getWriterOptions()} and get a
 * {@code PhysicalWriterImpl} bound to {@code out} installed as their physical writer. The ORC
 * {@code WriterImpl} is then constructed with a null first argument and {@code FIXED_PATH}.
 *
 * @param out stream the physical writer writes to
 * @return the ORC bulk writer
 * @throws IOException if setting up the ORC writer fails
 */
@Override
public BulkWriter<T> create(FSDataOutputStream out) throws IOException {
	final OrcFile.WriterOptions writerOptions = getWriterOptions();
	final PhysicalWriterImpl physicalWriter = new PhysicalWriterImpl(out, writerOptions);
	writerOptions.physicalWriter(physicalWriter);

	final WriterImpl orcWriter = new WriterImpl(null, FIXED_PATH, writerOptions);
	return new OrcBulkWriter<>(vectorizer, orcWriter);
}
 
Example #21
Source File: TestCsvFileSystemFormatFactory.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a CSV bulk writer factory when {@code useBulkWriter(context)} is true, and an empty
 * {@link Optional} otherwise.
 *
 * @param context writer context supplying the format field types
 * @return a factory creating {@code CsvBulkWriter} instances for the format field types, or empty
 */
@Override
public Optional<BulkWriter.Factory<RowData>> createBulkWriterFactory(WriterContext context) {
	if (useBulkWriter(context)) {
		final DataType[] fieldTypes = context.getFormatFieldTypes();
		return Optional.of(out -> new CsvBulkWriter(fieldTypes, out));
	}

	return Optional.empty();
}
 
Example #22
Source File: OrcNoHiveBulkWriterFactory.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a {@link BulkWriter} that buffers rows in a {@code VectorizedRowBatch} and writes the
 * batches to {@code out} through an ORC {@code WriterImpl}.
 *
 * <p>The writer options are built from an empty {@link Properties} object and {@code conf}, with
 * the schema parsed from {@code schema} and a {@code PhysicalWriterImpl} bound to {@code out}.
 * A batch is handed to the ORC writer when it reaches its maximum size, on {@code flush()} and
 * on {@code finish()}; {@code finish()} additionally closes the ORC writer.
 *
 * @param out stream the physical writer writes to
 * @return the batching writer
 * @throws IOException if setting up the ORC writer fails
 */
@Override
public BulkWriter<RowData> create(FSDataOutputStream out) throws IOException {
	final TypeDescription orcSchema;
	final OrcFile.WriterOptions options = OrcFile.writerOptions(new Properties(), conf);
	orcSchema = TypeDescription.fromString(schema);
	options.setSchema(orcSchema);
	options.physicalWriter(new PhysicalWriterImpl(out, options));
	final WriterImpl orcWriter = new WriterImpl(null, new Path("."), options);

	final VectorizedRowBatch batch = orcSchema.createRowBatch();
	return new BulkWriter<RowData>() {
		@Override
		public void addElement(RowData row) throws IOException {
			// Claim the next free slot in the batch before filling its columns.
			final int slot = batch.size++;
			for (int col = 0; col < row.getArity(); ++col) {
				setColumn(slot, batch.cols[col], fieldTypes[col], row, col);
			}
			// A full batch is never empty, so flush() always writes it out here.
			if (batch.size == batch.getMaxSize()) {
				flush();
			}
		}

		@Override
		public void flush() throws IOException {
			if (batch.size == 0) {
				return;
			}
			orcWriter.addRowBatch(batch);
			batch.reset();
		}

		@Override
		public void finish() throws IOException {
			flush();
			orcWriter.close();
		}
	};
}
 
Example #23
Source File: BulkBucketWriter.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Returns an in-progress file writer on the given stream, backed by a fresh {@link BulkWriter}
 * created through {@code writerFactory}.
 *
 * @param bucketId bucket the file belongs to
 * @param stream stream to write to; must not be null
 * @param resumable only null-checked here, not otherwise used
 * @param creationTime creation time passed on to the part writer
 * @return a new {@code BulkPartWriter}
 * @throws IOException if the writer factory fails
 */
@Override
public InProgressFileWriter<IN, BucketID> resumeFrom(
		final BucketID bucketId,
		final RecoverableFsDataOutputStream stream,
		final RecoverableWriter.ResumeRecoverable resumable,
		final long creationTime) throws IOException {

	Preconditions.checkNotNull(stream);
	Preconditions.checkNotNull(resumable);

	return new BulkPartWriter<>(bucketId, stream, writerFactory.create(stream), creationTime);
}
 
Example #24
Source File: BulkBucketWriter.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Opens an in-progress file writer on the given stream by creating a fresh {@link BulkWriter}
 * through {@code writerFactory}.
 *
 * @param bucketId bucket the file belongs to
 * @param stream stream to write to; must not be null
 * @param path only null-checked here, not otherwise used
 * @param creationTime creation time passed on to the part writer
 * @return a new {@code BulkPartWriter}
 * @throws IOException if the writer factory fails
 */
@Override
public InProgressFileWriter<IN, BucketID> openNew(
		final BucketID bucketId,
		final RecoverableFsDataOutputStream stream,
		final Path path,
		final long creationTime) throws IOException {

	Preconditions.checkNotNull(stream);
	Preconditions.checkNotNull(path);

	return new BulkPartWriter<>(bucketId, stream, writerFactory.create(stream), creationTime);
}
 
Example #25
Source File: ParquetFileSystemFormatFactory.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a Parquet bulk writer factory for the format fields described by {@code context}.
 *
 * <p>The row type is built from the logical types and names of the format fields. The Parquet
 * configuration comes from {@code getParquetConfiguration} applied to the format options, and
 * the last argument is the value of the {@code UTC_TIMEZONE} option.
 *
 * @param context writer context supplying field types, field names and format options
 * @return an {@link Optional} that always contains the factory
 */
@Override
public Optional<BulkWriter.Factory<RowData>> createBulkWriterFactory(WriterContext context) {
	final LogicalType[] logicalTypes = Arrays.stream(context.getFormatFieldTypes())
			.map(DataType::getLogicalType)
			.toArray(LogicalType[]::new);
	final RowType rowType = RowType.of(logicalTypes, context.getFormatFieldNames());

	return Optional.of(ParquetRowDataBuilder.createWriterFactory(
			rowType,
			getParquetConfiguration(context.getFormatOptions()),
			context.getFormatOptions().get(UTC_TIMEZONE)));
}
 
Example #26
Source File: BulkPartWriter.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a part-file writer on the given stream, backed by a fresh {@link BulkWriter} created
 * through {@code writerFactory}.
 *
 * @param bucketId bucket the part file belongs to
 * @param stream stream to write to; must not be null
 * @param resumable only null-checked here, not otherwise used
 * @param creationTime creation time passed on to the part writer
 * @return a new {@code BulkPartWriter}
 * @throws IOException if the writer factory fails
 */
@Override
public PartFileWriter<IN, BucketID> resumeFrom(
		final BucketID bucketId,
		final RecoverableFsDataOutputStream stream,
		final RecoverableWriter.ResumeRecoverable resumable,
		final long creationTime) throws IOException {

	Preconditions.checkNotNull(stream);
	Preconditions.checkNotNull(resumable);

	return new BulkPartWriter<>(bucketId, stream, writerFactory.create(stream), creationTime);
}
 
Example #27
Source File: BulkPartWriter.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a part writer that passes the bucket ID, stream and creation time to the superclass
 * and keeps the given bulk writer.
 *
 * @param bucketId bucket the part file belongs to
 * @param currentPartStream stream of the current part file
 * @param writer bulk writer to store; must not be null
 * @param creationTime creation time of the part file
 * @throws NullPointerException if {@code writer} is null
 */
private BulkPartWriter(
		final BucketID bucketId,
		final RecoverableFsDataOutputStream currentPartStream,
		final BulkWriter<IN> writer,
		final long creationTime) {
	super(bucketId, currentPartStream, creationTime);

	Preconditions.checkNotNull(writer);
	this.writer = writer;
}
 
Example #28
Source File: BulkPartWriter.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a part writer that passes the bucket ID, stream and creation time to the superclass
 * and keeps the given bulk writer.
 *
 * @param bucketId bucket the part file belongs to
 * @param currentPartStream stream of the current part file
 * @param writer bulk writer to store; must not be null
 * @param creationTime creation time of the part file
 * @throws NullPointerException if {@code writer} is null
 */
BulkPartWriter(
		final BucketID bucketId,
		final RecoverableFsDataOutputStream currentPartStream,
		final BulkWriter<IN> writer,
		final long creationTime) {
	super(bucketId, currentPartStream, creationTime);

	Preconditions.checkNotNull(writer);
	this.writer = writer;
}
 
Example #29
Source File: BulkPartWriter.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Opens a part-file writer on the given stream by creating a fresh {@link BulkWriter} through
 * {@code writerFactory}.
 *
 * @param bucketId bucket the part file belongs to
 * @param stream stream to write to; must not be null
 * @param path only null-checked here, not otherwise used
 * @param creationTime creation time passed on to the part writer
 * @return a new {@code BulkPartWriter}
 * @throws IOException if the writer factory fails
 */
@Override
public PartFileWriter<IN, BucketID> openNew(
		final BucketID bucketId,
		final RecoverableFsDataOutputStream stream,
		final Path path,
		final long creationTime) throws IOException {

	Preconditions.checkNotNull(stream);
	Preconditions.checkNotNull(path);

	return new BulkPartWriter<>(bucketId, stream, writerFactory.create(stream), creationTime);
}
 
Example #30
Source File: BulkPartWriter.java    From Flink-CEPplus with Apache License 2.0 5 votes vote down vote up
/**
 * Returns a part-file writer on the given stream, backed by a fresh {@link BulkWriter} created
 * through {@code writerFactory}.
 *
 * @param bucketId bucket the part file belongs to
 * @param stream stream to write to; must not be null
 * @param resumable only null-checked here, not otherwise used
 * @param creationTime creation time passed on to the part writer
 * @return a new {@code BulkPartWriter}
 * @throws IOException if the writer factory fails
 */
@Override
public PartFileWriter<IN, BucketID> resumeFrom(
		final BucketID bucketId,
		final RecoverableFsDataOutputStream stream,
		final RecoverableWriter.ResumeRecoverable resumable,
		final long creationTime) throws IOException {

	Preconditions.checkNotNull(stream);
	Preconditions.checkNotNull(resumable);

	return new BulkPartWriter<>(bucketId, stream, writerFactory.create(stream), creationTime);
}