package org.talend.components.simplefileio.runtime.hadoop.csv; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.BlockLocation; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.io.compress.SplittableCompressionCodec; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.util.StopWatch; /** * A CSV Input format which can set the header line to skip some line. the * header line number should be a small number, if a big one even near by the * end of the file, it mean we process the whole CSV in one task, no meaning for * map reduce way. * */ public class CSVFileInputFormat extends org.apache.hadoop.mapreduce.lib.input.FileInputFormat<LongWritable, BytesWritable> { public static String TALEND_ENCODING = "talend_encoding"; public static String TALEND_TEXT_ENCLOSURE = "talend_text_enclosure"; public static String TALEND_ESCAPE = "talend_escape"; // not in the design public static String TALEND_ROW_DELIMITED = "talend_row_delimited"; public static String TALEND_HEADER = "talend_header"; private static final Log LOG = LogFactory.getLog(CSVFileInputFormat.class); private static final double SPLIT_SLOP = 1.1; @Override public CSVFileRecordReader createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException { String delimiter = context.getConfiguration().get(TALEND_ROW_DELIMITED); String encoding = context.getConfiguration().get(TALEND_ENCODING); String textEnclosure = context.getConfiguration().get(TALEND_TEXT_ENCLOSURE); String escapeChar = context.getConfiguration().get(TALEND_ESCAPE); Character te = null; Character ec = null; if (textEnclosure != null && !textEnclosure.isEmpty()) { te = textEnclosure.charAt(0); } if (escapeChar != null && !escapeChar.isEmpty()) { ec = escapeChar.charAt(0); } return createRecordReader(delimiter, encoding, te, ec); } private CSVFileRecordReader createRecordReader(final String rowDelimiter, final String encoding, final Character textEnclosure, final Character escapeChar) throws IOException { return new CSVFileRecordReader(rowDelimiter, encoding, textEnclosure, escapeChar); } private long caculateSkipLength(FileStatus file, JobContext job) throws IOException { long header = job.getConfiguration().getLong(TALEND_HEADER, 0l); String rowDelimiter = job.getConfiguration().get(TALEND_ROW_DELIMITED); String encoding = job.getConfiguration().get(TALEND_ENCODING); if (header < 1) { return 0l; } try (CSVFileRecordReader reader = this.createRecordReader(rowDelimiter, encoding, null, null)) { // TODO check if right for compress especially return reader.skipHeader(file, header, job); } } @Override public List<InputSplit> getSplits(JobContext job) throws IOException { StopWatch sw = new StopWatch().start(); long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); long maxSize = getMaxSplitSize(job); // generate splits List<InputSplit> splits = new ArrayList<InputSplit>(); List<FileStatus> files = listStatus(job); for (FileStatus file : files) { Path path = file.getPath(); long length = file.getLen(); if (length != 0) { long skipLength = caculateSkipLength(file, job); BlockLocation[] blkLocations; if (file instanceof LocatedFileStatus) { blkLocations = ((LocatedFileStatus) file).getBlockLocations(); } else { FileSystem fs = path.getFileSystem(job.getConfiguration()); blkLocations = fs.getFileBlockLocations(file, 0, length); } int splitIndex = 0; if (isSplitable(job, path)) { long blockSize = file.getBlockSize(); long splitSize = computeSplitSize(blockSize, minSize, maxSize); long bytesRemaining = length - skipLength; while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) { long offset = length - bytesRemaining; int blkIndex = getBlockIndex(blkLocations, offset); splits.add(makeSplit(path, offset, splitSize, splitIndex++, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts())); bytesRemaining -= splitSize; } if (bytesRemaining != 0) { long offset = length - bytesRemaining; int blkIndex = getBlockIndex(blkLocations, offset); splits.add(makeSplit(path, offset, bytesRemaining, splitIndex++, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts())); } } else { // not splitable splits.add(makeSplit(path, skipLength, length - skipLength, splitIndex++, blkLocations[0].getHosts(), blkLocations[0].getCachedHosts())); } } else { // Create empty hosts array for zero length files splits.add(makeSplit(path, 0, length, 0, new String[0])); } } // Save the number of input files for metrics/loadgen job.getConfiguration().setLong(NUM_INPUT_FILES, files.size()); sw.stop(); if (LOG.isDebugEnabled()) { LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS)); } return splits; } @Override protected boolean isSplitable(JobContext context, Path filename) { String text_enclosure = context.getConfiguration().get(TALEND_TEXT_ENCLOSURE); String talend_escape = context.getConfiguration().get(TALEND_ESCAPE); if ((text_enclosure != null && !text_enclosure.isEmpty()) || (talend_escape != null && !talend_escape.isEmpty())) { return false; } final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(filename); if (null == codec) { return true; } return codec instanceof SplittableCompressionCodec; } @Override protected List<FileStatus> listStatus(JobContext job) throws IOException { // TODO consider if filter the sub dir return super.listStatus(job); } protected InputSplit makeSplit(Path file, long start, long length, int splitIndex, String[] hosts) { return new CSVFileSplit(file, start, length, splitIndex, hosts); } protected InputSplit makeSplit(Path file, long start, long length, int splitIndex, String[] hosts, String[] inMemoryHosts) { return new CSVFileSplit(file, start, length, splitIndex, hosts, inMemoryHosts); } }