org.apache.flink.api.java.io.RowCsvInputFormat Java Examples

The following examples show how to use org.apache.flink.api.java.io.RowCsvInputFormat. Each snippet is taken from an open-source project; the source file and its license are noted above each example.
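
Before the project-specific examples, here is a minimal, self-contained sketch of the basic pattern: construct the format with a file path and one TypeInformation per column, optionally skip a header line, and read the file as a DataSet<Row>. It assumes the legacy Flink 1.x DataSet API; the path, column types, and header setting are illustrative placeholders, not taken from any project below.

import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.RowCsvInputFormat;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.core.fs.Path;
import org.apache.flink.types.Row;

public class RowCsvQuickstart {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // one TypeInformation per CSV column (placeholder types)
        TypeInformation<?>[] fieldTypes = new TypeInformation<?>[] {
            Types.STRING, Types.INT, Types.DOUBLE
        };

        RowCsvInputFormat format = new RowCsvInputFormat(
            new Path("/tmp/input.csv"), // placeholder path
            fieldTypes,
            "\n",  // line delimiter
            ",");  // field delimiter
        format.setSkipFirstLineAsHeader(true);

        // pass the row type explicitly so the result is typed as Row
        DataSet<Row> rows = env.createInput(format, new RowTypeInfo(fieldTypes));
        rows.print();
    }
}

The same format can also feed a streaming job through StreamExecutionEnvironment.readFile() or createInput(), as the TaxiRides and Alink examples below show.
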
Example #1
Source File: CsvTableSource.java    From flink with Apache License 2.0
@Override
public void open(FunctionContext context) throws Exception {
	super.open(context);
	TypeInformation<Row> rowType = getResultType();

	// eagerly read the whole CSV file and index its rows by the lookup key
	RowCsvInputFormat inputFormat = config.createInputFormat();
	FileInputSplit[] inputSplits = inputFormat.createInputSplits(1);
	for (FileInputSplit split : inputSplits) {
		inputFormat.open(split);
		Row row = new Row(rowType.getArity());
		while (true) {
			// nextRecord() reuses the passed-in Row and returns null at the end of the split
			Row r = inputFormat.nextRecord(row);
			if (r == null) {
				break;
			} else {
				Object key = getTargetKey(r);
				List<Row> rows = dataMap.computeIfAbsent(key, k -> new ArrayList<>());
				// copy the row before caching it, since the instance is reused
				rows.add(Row.copy(r));
			}
		}
		inputFormat.close();
	}
}
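
Here the input format is driven by hand (createInputSplits, open, nextRecord, close) rather than through an execution environment, which makes it easy to preload the whole file into an in-memory lookup map inside open().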
 
Example #2
Source File: CsvTableSource.java    From flink with Apache License 2.0
RowCsvInputFormat createInputFormat() {
	RowCsvInputFormat inputFormat = new RowCsvInputFormat(
		new Path(path),
		getSelectedFieldTypes(),
		lineDelim,
		fieldDelim,
		selectedFields,
		emptyColumnAsNull);
	// apply the optional settings collected by the enclosing builder
	inputFormat.setSkipFirstLineAsHeader(ignoreFirstLine);
	inputFormat.setCommentPrefix(ignoreComments);
	inputFormat.setLenient(lenient);
	if (quoteCharacter != null) {
		inputFormat.enableQuotedStringParsing(quoteCharacter);
	}
	return inputFormat;
}
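
The null check on quoteCharacter matters: it is a boxed Character, and unboxing null into enableQuotedStringParsing(char) would throw a NullPointerException.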
 
Example #3
Source File: CsvTableSource.java    From flink with Apache License 2.0
RowCsvInputFormat createInputFormat() {
	// same as the previous example, but uses the constructor without the
	// emptyColumnAsNull argument, so empty columns are not mapped to null
	RowCsvInputFormat inputFormat = new RowCsvInputFormat(
		new Path(path),
		getSelectedFieldTypes(),
		lineDelim,
		fieldDelim,
		selectedFields);
	inputFormat.setSkipFirstLineAsHeader(ignoreFirstLine);
	inputFormat.setCommentPrefix(ignoreComments);
	inputFormat.setLenient(lenient);
	if (quoteCharacter != null) {
		inputFormat.enableQuotedStringParsing(quoteCharacter);
	}
	return inputFormat;
}
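
This variant leaves emptyColumnAsNull at its default, so empty columns are parsed as ordinary (empty) field values rather than being converted to null.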
 
Example #4
Source File: TaxiRides.java    From infoworld-post with Apache License 2.0
/**
 * Returns a DataStream of TaxiRide events from a CSV file.
 *
 * @param env The execution environment.
 * @param csvFile The path of the CSV file to read.
 * @return A DataStream of TaxiRide events.
 */
public static DataStream<TaxiRide> getRides(StreamExecutionEnvironment env, String csvFile) {

    // create input format to read the CSV file
    RowCsvInputFormat inputFormat = new RowCsvInputFormat(
            null, // input path is configured later
            inputFieldTypes,
            "\n",
            ",");

    // read file sequentially (with a parallelism of 1)
    DataStream<Row> parsedRows = env
            .readFile(inputFormat, csvFile)
            .returns(Types.ROW(inputFieldTypes))
            .setParallelism(1);

    // convert parsed CSV rows into TaxiRides, extract timestamps, and assign watermarks
    return parsedRows
            // map to TaxiRide POJOs
            .map(new RideMapper())
            // define drop-off time as event-time timestamps and generate ascending watermarks.
            .assignTimestampsAndWatermarks(new AscendingTimestampExtractor<TaxiRide>() {
                @Override
                public long extractAscendingTimestamp(TaxiRide ride) {
                    return ride.dropOffTime;
                }
            });
}
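
The parallelism of 1 keeps the rows in file order, which is what lets the AscendingTimestampExtractor assume that drop-off timestamps never decrease.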
 
Example #5
Source File: TestRowDataCsvInputFormat.java    From flink with Apache License 2.0
public TestRowDataCsvInputFormat(
		Path[] paths,
		TableSchema schema,
		List<String> partitionKeys,
		String defaultPartValue,
		int[] selectFields,
		long limit) {
	this.partitionKeys = partitionKeys;
	this.defaultPartValue = defaultPartValue;
	this.selectFields = selectFields;
	this.limit = limit;
	RowTypeInfo rowType = (RowTypeInfo) schema.toRowType();
	this.fieldTypes = Arrays.asList(rowType.getFieldTypes());
	this.fieldNames = Arrays.asList(rowType.getFieldNames());

	// the physical CSV columns are the schema fields minus the partition keys
	List<String> csvFieldNames = fieldNames.stream()
			.filter(name -> !partitionKeys.contains(name)).collect(Collectors.toList());

	// resolve the projected field names, then drop the partition keys again for the CSV side
	List<String> selectFieldNames = Arrays.stream(selectFields)
			.mapToObj(fieldNames::get)
			.collect(Collectors.toList());
	List<String> csvSelectFieldNames = selectFieldNames.stream()
			.filter(name -> !partitionKeys.contains(name)).collect(Collectors.toList());
	List<TypeInformation> csvSelectTypes = csvSelectFieldNames.stream()
			.map(name -> fieldTypes.get(fieldNames.indexOf(name))).collect(Collectors.toList());
	// converters from the legacy TypeInformation types to the internal data format
	this.csvSelectConverters = csvSelectTypes.stream()
			.map(TypeConversions::fromLegacyInfoToDataType)
			.map(DataFormatConverters::getConverterForDataType)
			.collect(Collectors.toList());
	int[] csvSelectFields = csvSelectFieldNames.stream().mapToInt(csvFieldNames::indexOf).toArray();
	this.inputFormat = new RowCsvInputFormat(
			null, csvSelectTypes.toArray(new TypeInformation[0]), csvSelectFields);
	this.inputFormat.setFilePaths(paths);

	// maps each parsed CSV field back to its position in the projected output row
	this.csvFieldMapping = csvSelectFieldNames.stream().mapToInt(selectFieldNames::indexOf).toArray();
	this.emitted = 0;
}
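
Partition columns are not stored in the CSV files themselves, so the constructor strips them from both the full and the projected field lists before building the RowCsvInputFormat; csvFieldMapping then routes each parsed CSV field to its slot in the projected output row.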
 
Example #6
Source File: CsvSourceStreamOp.java    From Alink with Apache License 2.0
@Override
public Table initializeDataSource() {
    final String filePath = getFilePath();
    final String schemaStr = getSchemaStr();
    final String fieldDelim = getFieldDelimiter();
    final String rowDelim = getRowDelimiter();
    final Character quoteChar = getQuoteChar();
    final boolean skipBlankLine = getSkipBlankLine();

    final String[] colNames = CsvUtil.getColNames(schemaStr);
    final TypeInformation[] colTypes = CsvUtil.getColTypes(schemaStr);

    boolean ignoreFirstLine = getIgnoreFirstLine();
    String protocol = "";

    try {
        URL url = new URL(filePath);
        protocol = url.getProtocol();
    } catch (MalformedURLException ignored) {
    }

    DataStream<Row> rows;
    StreamExecutionEnvironment execEnv =
        MLEnvironmentFactory.get(getMLEnvironmentId()).getStreamExecutionEnvironment();
    // dummy one-column schema: every line is first read as a single string field
    TableSchema dummySchema = new TableSchema(new String[]{"f1"}, new TypeInformation[]{Types.STRING});

    if (protocol.equalsIgnoreCase("http") || protocol.equalsIgnoreCase("https")) {
        HttpFileSplitReader reader = new HttpFileSplitReader(filePath);
        rows = execEnv
            .createInput(new GenericCsvInputFormat(reader, dummySchema.getFieldTypes(), rowDelim, rowDelim, ignoreFirstLine),
                new RowTypeInfo(dummySchema.getFieldTypes(), dummySchema.getFieldNames()))
            .name("http_csv_source");
    } else {
        // the row delimiter doubles as the field delimiter, so each line is read
        // as a single string field and split into real columns later
        RowCsvInputFormat inputFormat = new RowCsvInputFormat(
            new Path(filePath), dummySchema.getFieldTypes(), rowDelim, rowDelim, new int[]{0}, true);
        inputFormat.setSkipFirstLineAsHeader(ignoreFirstLine);
        rows = execEnv.createInput(inputFormat).name("csv_source");
    }

    // now parse each raw line into typed columns, honoring quotes and blank lines
    rows = rows.flatMap(new CsvUtil.ParseCsvFunc(colTypes, fieldDelim, quoteChar, skipBlankLine));

    return DataStreamConversionUtil.toTable(getMLEnvironmentId(), rows, colNames, colTypes);
}
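
Note that RowCsvInputFormat is only used to split the input into lines here: with the one-column dummy schema and the row delimiter doubling as the field delimiter, every line arrives as a single string, and the real CSV parsing (field splitting, quoting, blank-line handling) happens in CsvUtil.ParseCsvFunc.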
 
Example #7
Source File: CsvSourceBatchOp.java    From Alink with Apache License 2.0
@Override
public Table initializeDataSource() {
    final String filePath = getFilePath();
    final String schemaStr = getSchemaStr();
    final String fieldDelim = getFieldDelimiter();
    final String rowDelim = getRowDelimiter();
    final Character quoteChar = getQuoteChar();
    final boolean skipBlankLine = getSkipBlankLine();

    final String[] colNames = CsvUtil.getColNames(schemaStr);
    final TypeInformation[] colTypes = CsvUtil.getColTypes(schemaStr);

    boolean ignoreFirstLine = getIgnoreFirstLine();
    String protocol = "";

    try {
        URL url = new URL(filePath);
        protocol = url.getProtocol();
    } catch (MalformedURLException ignored) {
    }

    DataSet<Row> rows;
    ExecutionEnvironment execEnv = MLEnvironmentFactory.get(getMLEnvironmentId()).getExecutionEnvironment();
    // dummy one-column schema: every line is first read as a single string field
    TableSchema dummySchema = new TableSchema(new String[]{"f1"}, new TypeInformation[]{Types.STRING});

    if (protocol.equalsIgnoreCase("http") || protocol.equalsIgnoreCase("https")) {
        HttpFileSplitReader reader = new HttpFileSplitReader(filePath);
        rows = execEnv
            .createInput(new GenericCsvInputFormat(reader, dummySchema.getFieldTypes(), rowDelim, rowDelim, ignoreFirstLine),
                new RowTypeInfo(dummySchema.getFieldTypes(), dummySchema.getFieldNames()))
            .name("http_csv_source");
    } else {
        // the row delimiter doubles as the field delimiter, so each line is read
        // as a single string field and split into real columns later
        RowCsvInputFormat inputFormat = new RowCsvInputFormat(
            new Path(filePath), dummySchema.getFieldTypes(), rowDelim, rowDelim, new int[]{0}, true);
        inputFormat.setSkipFirstLineAsHeader(ignoreFirstLine);
        rows = execEnv.createInput(inputFormat).name("csv_source");
    }

    // now parse each raw line into typed columns, honoring quotes and blank lines
    rows = rows.flatMap(new CsvUtil.ParseCsvFunc(colTypes, fieldDelim, quoteChar, skipBlankLine));

    return DataSetConversionUtil.toTable(getMLEnvironmentId(), rows, colNames, colTypes);
}
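
This is the batch counterpart of the previous streaming source: the logic is identical, but it runs on an ExecutionEnvironment and produces a DataSet instead of a DataStream.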