org.apache.flink.hadoopcompatibility.HadoopInputs Java Examples

The following examples show how to use org.apache.flink.hadoopcompatibility.HadoopInputs, the utility class in the flink-hadoop-compatibility module that exposes Hadoop input formats as Flink input formats. Both examples come from FlinkUtil.java in the Apache Kylin project.
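HadoopInputs provides static factory methods (readSequenceFile, readHadoopFile, createHadoopInput) that wrap Hadoop InputFormats for use with the DataSet API. Before the Kylin examples, here is a minimal, self-contained sketch of reading a SequenceFile; the class name, HDFS path, and key/value types are assumptions for illustration only.

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.hadoopcompatibility.HadoopInputs;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class ReadSequenceFileExample {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // Records are emitted as Tuple2<key, value> using the Hadoop Writable types.
        // The path is a placeholder; the key/value classes must match the SequenceFile's metadata.
        DataSet<Tuple2<LongWritable, Text>> input = env.createInput(
                HadoopInputs.readSequenceFile(LongWritable.class, Text.class, "hdfs:///tmp/example.seq"));

        input.first(10).print();
    }
}

The examples below follow the same pattern, additionally routing the wrapped input through a Hadoop Job when extra configuration such as multiple input paths is needed.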
Example #1
Source File: FlinkUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
public static DataSet parseInputPath(String inputPath, FileSystem fs, ExecutionEnvironment env, Class keyClass,
        Class valueClass) throws IOException {
    List<String> inputFolders = Lists.newArrayList();
    Path inputHDFSPath = new Path(inputPath);
    FileStatus[] fileStatuses = fs.listStatus(inputHDFSPath);
    boolean hasDir = false;
    // Collect sub-directories of the input path, skipping hidden entries whose name starts with "_".
    for (FileStatus stat : fileStatuses) {
        if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
            hasDir = true;
            inputFolders.add(stat.getPath().toString());
        }
    }

    // No sub-directories: read the SequenceFiles under the input path directly.
    if (!hasDir) {
        return env.createInput(HadoopInputs.readSequenceFile(keyClass, valueClass, inputHDFSPath.toString()));
    }

    // Sub-directories found: register them as input paths of a Hadoop Job and wrap the
    // mapreduce SequenceFileInputFormat as a Flink input format.
    Job job = Job.getInstance();
    FileInputFormat.setInputPaths(job, StringUtil.join(inputFolders, ","));
    return env.createInput(HadoopInputs.createHadoopInput(new SequenceFileInputFormat(), keyClass, valueClass, job));
}
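A hypothetical call to this helper might look like the following; the HDFS path and the Text key/value classes are placeholders, and imports are omitted as in the example above.

// Hypothetical usage of parseInputPath; the path and Writable classes are placeholders.
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
FileSystem fs = FileSystem.get(new Configuration());
DataSet cuboidInput = parseInputPath("hdfs:///example/cuboid", fs, env, Text.class, Text.class);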
 
Example #2
Source File: FlinkUtil.java    From kylin-on-parquet-v2 with Apache License 2.0
public static DataSet<String[]> readHiveRecords(boolean isSequenceFile, ExecutionEnvironment env, String inputPath, String hiveTable, Job job) throws IOException {
    DataSet<String[]> recordDataSet;

    if (isSequenceFile) {
        // Wrap the Hive table's storage files with the mapreduce SequenceFileInputFormat; the explicit
        // TypeInformation declares the Tuple2<BytesWritable, Text> records produced by the Hadoop input.
        recordDataSet = env
                .createInput(HadoopInputs.readHadoopFile(new SequenceFileInputFormat(), BytesWritable.class, Text.class, inputPath, job),
                        TypeInformation.of(new TypeHint<Tuple2<BytesWritable, Text>>() {}))
                .map(new MapFunction<Tuple2<BytesWritable, Text>, String[]>() {
                    @Override
                    public String[] map(Tuple2<BytesWritable, Text> tuple2) throws Exception {
                        // Decode the record text from the value and split it into columns.
                        String s = Bytes.toString(tuple2.f1.getBytes(), 0, tuple2.f1.getLength());
                        return s.split(BatchConstants.SEQUENCE_FILE_DEFAULT_DELIMITER);
                    }
                });
    } else {
        throw new UnsupportedOperationException("Currently, Flink does not support reading Hive tables directly.");
    }

    return recordDataSet;
}
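readHadoopFile passes the given input path to the supplied Job and wraps the mapreduce SequenceFileInputFormat, so configuration already present on the Job is carried into the Flink source. A hypothetical invocation is sketched below; the path and table name are placeholders, and imports are omitted as above.

// Hypothetical usage of readHiveRecords with a SequenceFile-backed Hive table.
Job job = Job.getInstance();
DataSet<String[]> hiveRecords = readHiveRecords(true, env, "hdfs:///example/hive-table-dir", "default.example_table", job);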
 