Java Code Examples for org.apache.hadoop.mapreduce.lib.input.FileSplit#getLength()
The following examples show how to use org.apache.hadoop.mapreduce.lib.input.FileSplit#getLength(). Each example is taken from an open-source project; the source file and its license are noted above each snippet.
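Nearly every snippet below follows the same idiom: cast the generic InputSplit to a FileSplit, then combine getStart() and getLength() to compute the half-open byte range [start, end) that the reader owns. The following is a minimal sketch of that pattern, not taken from any of the projects below; the class name MinimalSplitReader and its fields are placeholders for illustration:

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Minimal sketch of the recurring pattern; "MinimalSplitReader" is a
// hypothetical class, not part of Hadoop or of the projects below.
public abstract class MinimalSplitReader extends RecordReader<LongWritable, Text> {

    private long start; // first byte of this split within the file
    private long end;   // first byte past the end of this split
    private FSDataInputStream in;

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        FileSplit split = (FileSplit) genericSplit;
        start = split.getStart();        // byte offset of the split within the file
        end = start + split.getLength(); // getLength() is the split size in bytes
        Path file = split.getPath();
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        in = fs.open(file);
        in.seek(start);                  // position the stream at the split boundary
    }

    @Override
    public float getProgress() throws IOException {
        // Report the fraction of the split's byte range consumed so far.
        if (end == start) {
            return 0.0f;
        }
        return Math.min(1.0f, (in.getPos() - start) / (float) (end - start));
    }

    @Override
    public void close() throws IOException {
        if (in != null) {
            in.close();
        }
    }
}

Note that for non-splittable formats (for example the RFile and SSTable readers below), the single split covers the whole file, so getLength() is simply the file size; readers of compressed files often widen end to Long.MAX_VALUE instead, since the compressed stream cannot be seeked to a split boundary.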
Example 1
Source File: RFileRecordReader.java From datawave with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    // Note that the RFileInputFormat returns false for "isSplittable", so this should ALWAYS be 0
    start = fileSplit.getStart();
    if (start != 0) {
        throw new IOException("Cannot start reading an RFile in the middle: start=" + start);
    }
    end = fileSplit.getLength() - start;
    pos = start;

    FileOperations ops = RFileOperations.getInstance();
    String file = fileSplit.getPath().toString();
    FileSystem fs = fileSplit.getPath().getFileSystem(context.getConfiguration());
    fileIterator = ops.newReaderBuilder().forFile(file, fs, context.getConfiguration())
                    .withTableConfiguration(AccumuloConfiguration.getDefaultConfiguration())
                    .seekToBeginning().build();
}
Example 2
Source File: MsgpackRecordReader.java From laser with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    String host = fileSplit.getPath().toString();
    Configuration conf = context.getConfiguration();
    String collection = conf.get("com.b5m.laser.collection");
    int port = conf.getInt("com.b5m.laser.msgpack.port", 0);
    method = conf.get("com.b5m.laser.msgpack.input.method");
    client = new MsgpackClient(host, port, collection);
    start = fileSplit.getStart();
    splitLenth = fileSplit.getLength();
    readLength = 0;
    vClass = conf.getClass("com.b5m.laser.msgpack.input.value.class", null);
    try {
        Object[] req = new Object[1];
        req[0] = start;
        client.writeIgnoreRetValue(req, method + "|start");
    } catch (Exception e) {
        throw new IOException(e.getLocalizedMessage());
    }
}
Example 3
Source File: InterRecordReader.java From spork with Apache License 2.0
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (start != 0) {
        fileIn.seek(start);
    }
    in = new BufferedPositionedInputStream(fileIn, start);
    inData = new DataInputStream(in);
}
Example 4
Source File: SSTableSplitRecordReader.java From hadoop-sstable with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext taskAttemptContext) throws IOException {
    this.context = taskAttemptContext;
    FileSplit fileSplit = (FileSplit) genericSplit;
    this.sstableFile = fileSplit.getPath();
    // The SSTableSplitInputFormat is not splittable, so the split length is the whole file.
    this.totalFileSize = fileSplit.getLength();

    Configuration conf = context.getConfiguration();
    FileSystem fs = sstableFile.getFileSystem(conf);
    this.rawInputStream = fs.open(sstableFile);

    this.splitSize = conf.getLong(HadoopSSTableConstants.HADOOP_SSTABLE_SPLIT_MB,
            HadoopSSTableConstants.DEFAULT_SPLIT_MB) * 1024 * 1024;

    this.index = new IndexOffsetScanner(sstableFile, fs);
}
Example 5
Source File: IndexedStorage.java From spork with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MRConfiguration.LINERECORDREADER_MAXLENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (start != 0) {
        skipFirstLine = true;
        --start;
        fileIn.seek(start);
    }
    in = new IndexedStorageLineReader(fileIn, job);
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
Example 6
Source File: JsonDataValidationMapper.java From jumbune with GNU Lesser General Public License v3.0
protected void setup(Mapper.Context context) {
    String jsonString = context.getConfiguration().get(JsonDataVaildationConstants.JSON_ARGUMENT);
    String regexString = context.getConfiguration().get(JsonDataVaildationConstants.REGEX_ARGUMENT);
    String nullString = context.getConfiguration().get(JsonDataVaildationConstants.NULL_ARGUMENT);
    tupleCounter = 0L;
    cleanTupleCounter = 0L;
    recordsEmittByMap = 0L;
    // Populating JsonKey and Data type
    schema = getDatatypeExpression(jsonString);
    // Adding JsonKey given by user
    keylist = getKeyList(jsonString);
    if (!(regexString == null)) {
        // Populating JsonKey and Regex
        regex = getExpression(regexString);
    }
    if (!(nullString == null)) {
        // Populating JsonKey and NULLCONDITION
        nullMap = getExpression(nullString);
    }
    FileSplit fileSplit = (FileSplit) context.getInputSplit();
    splitStartOffset = fileSplit.getStart();
    // calculating end offset of current split
    splitEndOffset = splitStartOffset + fileSplit.getLength() - 1;
    filename = fileSplit.getPath().toUri().getPath();
    filename = filename.replaceAll(JsonDataVaildationConstants.FORWARD_SLASH,
            JsonDataVaildationConstants.JSON_DOT).substring(1, filename.length());
}
Example 7
Source File: FastaInputFormat.java From Hadoop-BAM with MIT License
public FastaRecordReader(Configuration conf, FileSplit split) throws IOException {
    setConf(conf);

    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();
    current_split_pos = 1;

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    if (codec == null) { // no codec. Uncompressed file.
        positionAtFirstRecord(fileIn);
        inputStream = fileIn;
    } else { // compressed file
        if (start != 0)
            throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
    }

    lineReader = new LineReader(inputStream);
}
Example 8
Source File: TFileRecordReader.java From spork with Apache License 2.0
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(split.getPath());
    reader = new Reader(fileIn, fs.getFileStatus(file).getLen(), job);
    scanner = reader.createScannerByByteRange(start, split.getLength());
}
Example 9
Source File: EnclosedBaseJsonRecordReader.java From spatial-framework-for-hadoop with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext taskContext) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    splitLen = fileSplit.getLength(); // using MRv2
    commonInit(fileSplit.getPath(), taskContext.getConfiguration());
}
Example 10
Source File: XmlInputFormat.java From hiped2 with Apache License 2.0
public XmlRecordReader(FileSplit split, Configuration conf) throws IOException {
    startTag = conf.get(START_TAG_KEY).getBytes("UTF-8");
    endTag = conf.get(END_TAG_KEY).getBytes("UTF-8");

    // open the file and seek to the start of the split
    start = split.getStart();
    end = start + split.getLength();
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);
    fsin = fs.open(split.getPath());
    fsin.seek(start);
}
Example 11
Source File: MultiLineInputFormat.java From dkpro-c4corpus with Apache License 2.0
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    numberOfLinesToProcess = getNumLinesPerSplit(context);
    FileSplit split = (FileSplit) genericSplit;
    final Path file = split.getPath();
    Configuration conf = context.getConfiguration();
    this.maxLineLength = conf.getInt("mapreduce.input.linerecordreader.line.maxlength", Integer.MAX_VALUE);
    FileSystem fs = file.getFileSystem(conf);
    start = split.getStart();
    end = start + split.getLength();
    boolean skipFirstLine = false;
    FSDataInputStream filein = fs.open(split.getPath());

    if (start != 0) {
        skipFirstLine = true;
        --start;
        filein.seek(start);
    }
    in = new LineReader(filein, conf);
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
Example 12
Source File: ResourceRecordReader.java From webarchive-commons with Apache License 2.0
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    if (inputSplit instanceof FileSplit) {
        FileSplit fs = (FileSplit) inputSplit;
        Path fsPath = fs.getPath();
        FileSystem fSys = fsPath.getFileSystem(context.getConfiguration());
        FSDataInputStream fsdis = fSys.open(fsPath);
        String path = fsPath.getName();
        name = fsPath.getName();
        stream = new HDFSStream(fsdis);
        startOffset = fs.getStart();
        length = fs.getLength();
        long endOffset = startOffset + length;
        stream.setOffset(startOffset);
        series = new GZIPMemberSeries(stream, name, startOffset);
        GZIPResourceContainer prod = new GZIPResourceContainer(series, endOffset);
        ResourceProducer envelope;
        if (path.endsWith(".warc.gz") || path.endsWith(".wat.gz")) {
            envelope = new TransformingResourceProducer(prod, wf);
        } else if (path.endsWith(".arc.gz")) {
            envelope = new TransformingResourceProducer(prod, af);
        } else {
            throw new IOException("arguments must be arc.gz or warc.gz");
        }
        ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
        producer = new ExtractingResourceProducer(envelope, mapper);
    } else {
        throw new IOException("Need FileSplit input...");
    }
}
Example 13
Source File: JsonFileRecordReader.java From jumbune with GNU Lesser General Public License v3.0
public JsonFileRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();
    Path file = fileSplit.getPath();
    FileSystem fileSystem = file.getFileSystem(context.getConfiguration());
    fSDataInputStream = fileSystem.open(fileSplit.getPath());
    fSDataInputStream.seek(start);
}
Example 14
Source File: TFileRecordReader.java From tez with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    LOG.info("Initializing TFileRecordReader : " + fileSplit.getPath().toString());
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();

    FileSystem fs = fileSplit.getPath().getFileSystem(context.getConfiguration());
    splitPath = fileSplit.getPath();
    fin = fs.open(splitPath);
    reader = new TFile.Reader(fin, fs.getFileStatus(splitPath).getLen(), context.getConfiguration());
    scanner = reader.createScannerByByteRange(start, fileSplit.getLength());
}
Example 15
Source File: AvroArrayReader.java From spork with Apache License 2.0
@Override
public void initialize(final InputSplit isplit, final TaskAttemptContext tc) throws IOException, InterruptedException {
    FileSplit fsplit = (FileSplit) isplit;
    start = fsplit.getStart();
    end = fsplit.getStart() + fsplit.getLength();
    DatumReader<GenericData.Array<Object>> datumReader =
            new GenericDatumReader<GenericData.Array<Object>>(schema);
    reader = DataFileReader.openReader(new FsInput(fsplit.getPath(), tc.getConfiguration()), datumReader);
    reader.sync(start);
}
Example 16
Source File: QseqInputFormat.java From Hadoop-BAM with MIT License
public QseqRecordReader(Configuration conf, FileSplit split) throws IOException {
    setConf(conf);
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    if (codec == null) { // no codec. Uncompressed file.
        positionAtFirstRecord(fileIn);
        inputStream = fileIn;
    } else { // compressed file
        if (start != 0)
            throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
    }

    lineReader = new LineReader(inputStream);
}
Example 17
Source File: FastqInputFormat.java From Hadoop-BAM with MIT License
public FastqRecordReader(Configuration conf, FileSplit split) throws IOException {
    setConf(conf);
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    if (codec == null) { // no codec. Uncompressed file.
        positionAtFirstRecord(fileIn);
        inputStream = fileIn;
    } else { // compressed file
        if (start != 0)
            throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
    }

    lineReader = new LineReader(inputStream);
}
Example 18
Source File: JsonRecordReader.java From datawave with Apache License 2.0
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
    super.initialize(split, context);
    if (!(split instanceof FileSplit)) {
        throw new IOException("Cannot handle split type " + split.getClass().getName());
    }
    FileSplit fsplit = (FileSplit) split;
    Path file = fsplit.getPath();
    rawFileName = file.getName();
    fileURI = file.toUri();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    InputStream is = fs.open(file);
    start = fsplit.getStart();
    end = start + fsplit.getLength();
    pos = start;
    String normURI = fileURI.getScheme() + "://" + fileURI.getPath();
    setupReader(is);
    if (logger.isInfoEnabled()) {
        logger.info("Reading Json records from " + normURI + " via " + is.getClass().getName());
    }
    jsonHelper = (JsonDataTypeHelper) createHelper(context.getConfiguration());
    this.parseHeaderOnly = !jsonHelper.processExtraFields();
    jsonFlattener = jsonHelper.newFlattener();
    if (logger.isInfoEnabled()) {
        logger.info("Json flattener mode: " + jsonFlattener.getFlattenMode().name());
    }
}
Example 19
Source File: DataValidationMapper.java From jumbune with GNU Lesser General Public License v3.0
@SuppressWarnings("rawtypes") protected void setup(Mapper.Context context) throws IOException, InterruptedException { lineNumber = 0; recordsEmittByMap = 0l ; noOfToupleProcessd = 0l; cleanTupleCounter=0l; lineVWCounterNOF = 0; lineVWCounterNC = 0; lineVWCounterDT = 0; lineVWCounterRX = 0; dataValidatoinDiscripancies = new DataDiscrepanciesArrayWritable(); dataViolationWBNOF = new DataViolationWB(); dataViolationWBNC = new DataViolationWB(); dataViolationWBDT = new DataViolationWB(); dataViolationWBRX = new DataViolationWB(); dataViolationWBArr = new DataViolationWB[1]; // populating data validation parameters String dvBeanString = context.getConfiguration().get(DataValidationConstants.DATA_VALIDATION_BEAN_STRING); String validateMatrix = context.getConfiguration().get(DataValidationConstants.VALIDATE_MATRIX); Gson gson = new Gson(); Type type = new TypeToken<DataValidationBean>() { }.getType(); DataValidationBean dataValidationBean = gson.fromJson(dvBeanString, type); fieldSeparator = dataValidationBean.getFieldSeparator(); fieldSeparator = fieldSeparator.replaceAll(Constants.SPACE_SEPARATOR, Constants.SPACE); fieldValidationList = dataValidationBean.getFieldValidationList(); expectedNumOfFields = dataValidationBean.getNumOfFields(); validateArray = gson.fromJson(validateMatrix, boolean[][].class); keyPattern = new LRUCache<String, Pattern>(expectedNumOfFields) { /** * */ private static final long serialVersionUID = 8594885637377460020L; @Override protected boolean removeEldestEntry(java.util.Map.Entry<String, Pattern> eldest) { if (size() > super.getCapacity()) { return true; } return false; } }; FileSplit split = ((FileSplit) context.getInputSplit()); splitStartOffset = split.getStart(); //calculating end offset of current split splitEndOffset = splitStartOffset + split.getLength() - 1; fileName = split.getPath().toUri().getPath(); fileName = fileName.replaceAll("/", ".").substring(1, fileName.length()); }
Example 20
Source File: LongLineEventRecordReader.java From datawave with Apache License 2.0
/**
 * @param genericSplit
 * @param context
 * @throws IOException
 */
public void initializeLineReader(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LfLineReader(codec.createInputStream(fileIn), job);
        in.setNewLineIncluded(newLineIncluded);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        // Hadoop CodecFactory only checks the file suffix, let's double check for gzip since some data producers
        // may not append .gz to their files.
        InputStream iStream = GzipDetectionUtil.decompressTream(fileIn);
        Class streamClass = iStream.getClass();
        if (GZIPInputStream.class == streamClass) {
            end = Long.MAX_VALUE;
        }
        in = new LfLineReader(iStream, job);
        in.setNewLineIncluded(newLineIncluded);
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}