Java Code Examples for org.apache.hadoop.mapreduce.InputSplit#getLength()

The following examples show how to use org.apache.hadoop.mapreduce.InputSplit#getLength(). Each example is taken from an open source project; the source file, project, and license are noted above the code.
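Before the project-specific examples, here is a minimal sketch of the common pattern: summing split lengths to estimate a job's total input size. The class name and the hard-coded TextInputFormat are illustrative assumptions for this sketch, not taken from any project below.

import java.io.IOException;

import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SplitSizeSketch {
    // Sum the lengths (in bytes) of all input splits for a job.
    // In the org.apache.hadoop.mapreduce API, getLength() declares
    // InterruptedException in addition to IOException.
    static long totalInputBytes(Job job) throws IOException, InterruptedException {
        InputFormat<?, ?> format = new TextInputFormat(); // illustrative input format
        long total = 0;
        for (InputSplit split : format.getSplits(job)) {
            total += split.getLength(); // size of this split in bytes
        }
        return total;
    }
}

Examples 1, 3, and 4 below follow this same pattern, except they instantiate the job's configured input format via ReflectionUtils rather than hard-coding one.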
Example 1
Source File: AbstractHadoopJob.java    From kylin-on-parquet-v2 with Apache License 2.0
public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    
    // 0 input bytes is possible when the segment range hits no partition on a partitioned hive table (KYLIN-2470) 
    if (mapInputBytes == 0) {
        logger.warn("Map input splits are 0 bytes, something is wrong?");
    }
    
    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}
 
Example 2
Source File: MapRedUtil.java    From spork with Apache License 2.0
public String inputSplitToString(InputSplit[] splits) throws IOException, InterruptedException {
    // debugging purpose only
    StringBuilder st = new StringBuilder();
    st.append("Number of splits :" + splits.length + "\n");
    long len = 0;
    for (InputSplit split : splits)
        len += split.getLength();
    st.append("Total Length = " + len + "\n");
    for (int i = 0; i < splits.length; i++) {
        st.append("Input split[" + i + "]:\n   Length = " + splits[i].getLength() + "\n  Locations:\n");
        for (String location : splits[i].getLocations())
            st.append("    " + location + "\n");
        st.append("\n-----------------------\n");
    }
    }
    return st.toString();
}
 
Example 3
Source File: AbstractHadoopJob.java    From kylin with Apache License 2.0
public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    
    // 0 input bytes is possible when the segment range hits no partition on a partitioned hive table (KYLIN-2470) 
    if (mapInputBytes == 0) {
        logger.warn("Map input splits are 0 bytes, something is wrong?");
    }
    
    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}
 
Example 4
Source File: AbstractHadoopJob.java    From Kylin with Apache License 2.0
protected double getTotalMapInputMB() throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    if (mapInputBytes == 0) {
        throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!");
    }
    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}
 
Example 5
Source File: AggregateXMLReader.java    From marklogic-contentpump with Apache License 2.0
protected void initStreamReader(InputSplit inSplit) throws IOException,
    InterruptedException {
    start = 0;
    end = inSplit.getLength();
    overflow = false;
    fInputStream = openFile(inSplit, true);
    if (fInputStream == null) {
        return;
    }

    try {
        xmlSR = f.createXMLStreamReader(fInputStream, encoding);
    } catch (XMLStreamException e) {
        LOG.error(e.getMessage(), e);
    }

    if (useAutomaticId) {
        idGen = new IdGenerator(file.toUri().getPath() + "-"
            + ((FileSplit) inSplit).getStart());
    }
}
 
Example 6
Source File: BinaryLoader.java    From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    bytesTotal = inSplit.getLength();
    Path file = ((FileSplit)inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);
    key.set(file.toString());
    byte[] buf = new byte[(int)inSplit.getLength()];
    System.out.println("split length: " + inSplit.getLength());
    try {
        fileIn.readFully(buf);
        value.set(buf, 0, (int) inSplit.getLength());
        System.out.println("value length: " + value.getBytes().length);
        
        hasNext = true;    
    } catch (Exception e) {
        hasNext = false;
    } finally {
        fileIn.close();
    }
}
 
Example 7
Source File: DelimitedTextReader.java    From marklogic-contentpump with Apache License 2.0
protected void initParser(InputSplit inSplit) throws IOException,
    InterruptedException {
    fileIn = openFile(inSplit, true);
    if (fileIn == null) {
        return;
    }
    instream = new InputStreamReader(fileIn, encoding);

    bytesRead = 0;
    fileLen = inSplit.getLength();
    if (uriName == null) {
        generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
        if (generateId) {
            idGen = new IdGenerator(file.toUri().getPath() + "-"
                + ((FileSplit) inSplit).getStart());
        } else {
            uriId = 0;
        }
    }
    parser = new CSVParser(instream,
            CSVParserFormatter.getFormat(delimiter, encapsulator, true, true));
    parserIterator = parser.iterator();
}
 
Example 8
Source File: HadoopFormatIO.java    From beam with Apache License 2.0
/**
 * This is a helper function to compute splits. This method will also calculate the size of the
 * data being read. Note: this method is executed exactly once and the splits are retrieved and
 * cached in this source. These splits are further used by split() and getEstimatedSizeBytes().
 */
@VisibleForTesting
void computeSplitsIfNecessary() throws IOException, InterruptedException {
  if (inputSplits != null) {
    return;
  }
  createInputFormatInstance();
  List<InputSplit> splits = inputFormatObj.getSplits(Job.getInstance(conf.get()));
  if (splits == null) {
    throw new IOException("Error in computing splits, getSplits() returns null.");
  }
  if (splits.isEmpty()) {
    throw new IOException("Error in computing splits, getSplits() returns a empty list");
  }
  boundedSourceEstimatedSize = 0;
  inputSplits = new ArrayList<>();
  for (InputSplit inputSplit : splits) {
    if (inputSplit == null) {
      throw new IOException(
          "Error in computing splits, split is null in InputSplits list "
              + "populated by getSplits() : ");
    }
    boundedSourceEstimatedSize += inputSplit.getLength();
    inputSplits.add(new SerializableSplit(inputSplit));
  }
}
 
Example 9
Source File: CSVReaderBase.java    From datawave with Apache License 2.0
public void initializeTotalSize(final InputSplit genericSplit) throws IOException {
    try {
        totalSize = genericSplit.getLength() * 4L;
    } catch (InterruptedException ex) {
        throw new IOException("Interrupted Exception thrown while attempting to get split length", ex);
    }
}
 
Example 10
Source File: JobSplit.java    From big-c with Apache License 2.0
public SplitMetaInfo(InputSplit split, long startOffset) throws IOException {
  try {
    this.locations = split.getLocations();
    this.inputDataLength = split.getLength();
    this.startOffset = startOffset;
  } catch (InterruptedException ie) {
    throw new IOException(ie);
  }
}
 
Example 11
Source File: SequenceFileAsBinaryInputFormat.java    From big-c with Apache License 2.0
public void initialize(InputSplit split, TaskAttemptContext context) 
    throws IOException, InterruptedException {
  Path path = ((FileSplit)split).getPath();
  Configuration conf = context.getConfiguration();
  FileSystem fs = path.getFileSystem(conf);
  this.in = new SequenceFile.Reader(fs, path, conf);
  this.end = ((FileSplit)split).getStart() + split.getLength();
  if (((FileSplit)split).getStart() > in.getPosition()) {
    in.sync(((FileSplit)split).getStart());    // sync to start
  }
  this.start = in.getPosition();
  vbytes = in.createValueBytes();
  done = start >= end;
}
 
Example 12
Source File: CompositeInputSplit.java    From big-c with Apache License 2.0
/**
 * Add an InputSplit to this collection.
 * @throws IOException If capacity was not specified during construction
 *                     or if capacity has been reached.
 */
public void add(InputSplit s) throws IOException, InterruptedException {
  if (null == splits) {
    throw new IOException("Uninitialized InputSplit");
  }
  if (fill == splits.length) {
    throw new IOException("Too many splits");
  }
  splits[fill++] = s;
  totsize += s.getLength();
}
 
Example 13
Source File: DelimitedJSONReader.java    From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    /* Initialization in super class */
    initConfig(context);  
    /*  Get file(s) in input split */
    setFile(((FileSplit) inSplit).getPath());
    // Initialize reader properties
    generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI,false);
    if (generateId){
        idGen = new IdGenerator(file.toUri().getPath() + "-"
                + ((FileSplit) inSplit).getStart()); 
    } else {
        uriName = conf.get(CONF_INPUT_URI_ID, null);
        mapper = new ObjectMapper();
    }
    bytesRead = 0;
    totalBytes = inSplit.getLength();
    /* Check file status */
    fs = file.getFileSystem(context.getConfiguration());
    FileStatus status = fs.getFileStatus(file);
    if (status.isDirectory()) {
        iterator = new FileIterator((FileSplit)inSplit, context);
        inSplit = iterator.next();
    }
    /* Initialize buffered reader */
    initFileStream(inSplit);
}
 
Example 14
Source File: RDFReader.java    From marklogic-contentpump with Apache License 2.0
protected void initStream(InputSplit inSplit)
        throws IOException, InterruptedException {
    FSDataInputStream in = openFile(inSplit, false);
    if (in == null) {
        return;
    }
    long size = inSplit.getLength();
    initParser(file.toUri().toASCIIString(), size);
    parse(file.getName(), in);
}
 
Example 15
Source File: TezGroupedSplit.java    From incubator-tez with Apache License 2.0
public void addSplit(InputSplit split) {
  wrappedSplits.add(split);
  try {
    length += split.getLength();
  } catch (Exception e) {
    throw new TezUncheckedException(e);
  }
}
 
Example 16
Source File: SequenceFileAsBinaryInputFormat.java    From hadoop with Apache License 2.0
public void initialize(InputSplit split, TaskAttemptContext context) 
    throws IOException, InterruptedException {
  Path path = ((FileSplit)split).getPath();
  Configuration conf = context.getConfiguration();
  FileSystem fs = path.getFileSystem(conf);
  this.in = new SequenceFile.Reader(fs, path, conf);
  this.end = ((FileSplit)split).getStart() + split.getLength();
  if (((FileSplit)split).getStart() > in.getPosition()) {
    in.sync(((FileSplit)split).getStart());    // sync to start
  }
  this.start = in.getPosition();
  vbytes = in.createValueBytes();
  done = start >= end;
}
 
Example 17
Source File: CompositeInputSplit.java    From hadoop with Apache License 2.0
/**
 * Add an InputSplit to this collection.
 * @throws IOException If capacity was not specified during construction
 *                     or if capacity has been reached.
 */
public void add(InputSplit s) throws IOException, InterruptedException {
  if (null == splits) {
    throw new IOException("Uninitialized InputSplit");
  }
  if (fill == splits.length) {
    throw new IOException("Too many splits");
  }
  splits[fill++] = s;
  totsize += s.getLength();
}
 
Example 18
Source File: TabletSplitSplit.java    From datawave with Apache License 2.0
/**
 * Add an InputSplit to this collection.
 * 
 * @throws IOException
 *             If capacity was not specified during construction or if capacity has been reached.
 * @throws InterruptedException
 *             If the split length cannot be read because the thread was interrupted.
 */
public void add(InputSplit s) throws IOException, InterruptedException {
    if (null == splits) {
        throw new IOException("Uninitialized InputSplit");
    }
    if (fill == splits.length) {
        throw new IOException("Too many splits");
    }
    
    splits[fill++] = s;
    totsize += s.getLength();
}
 
Example 19
Source File: WikiLoader.java    From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path file = ((FileSplit)inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);
    byte[] buf = new byte[BUFFER_SIZE];
    long bytesTotal = inSplit.getLength();
    long start = ((FileSplit)inSplit).getStart();
    fileIn.seek(start);
    long bytesRead = 0;
    StringBuilder pages = new StringBuilder();
    int sindex = -1;
    while (true) {
        int length = (int)Math.min(bytesTotal - bytesRead, buf.length);
        int read = fileIn.read(buf, 0, length);
        if (read == -1) {
            System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal +
                    "bytesRead=" + bytesRead);
            break;
        }
        bytesRead += read;  
        String temp = new String(buf, 0, read);
        if (sindex == -1) { // haven't found the start yet    
            sindex = temp.indexOf(BEGIN_PAGE_TAG);
            if (sindex > -1) {
                pages.append(temp.substring(sindex));
            }
        } else if (bytesRead < bytesTotal) { // haven't completed the split
            pages.append(temp);
        } else { // reached the end of this split
            // look for end
            int eindex = 0;
            if (temp.contains(END_DOC_TAG) || // reached the end of doc
                temp.endsWith(END_PAGE_TAG)) {
                eindex = temp.lastIndexOf(END_PAGE_TAG);
                pages.append(temp.substring(0, 
                    eindex + END_PAGE_TAG.length()));   
                System.out.println("Found end of doc.");
            } else { // need to read ahead to look for end of page
                while (true) {
                    read = fileIn.read(buf, 0, READ_AHEAD_SIZE);
                    if (read == -1) { // no more to read
                        System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal +
                                "bytesRead=" + bytesRead);
                        System.out.println(temp);
                        break;
                    }
                    bytesRead += read;
                    // look for end
                    temp = new String(buf, 0, read);
                    eindex = temp.indexOf(END_PAGE_TAG);
                    if (eindex > -1) {
                        pages.append(temp.substring(0, 
                                eindex + END_PAGE_TAG.length()));
                        break;
                    } else {
                        pages.append(temp);
                    }
                }
            }
            break;
        }
    }
    fileIn.close();
    articles = WikiModelProcessor.process(pages);
}
 
Example 20
Source File: JobSplit.java    From big-c with Apache License 2.0
public TaskSplitMetaInfo(InputSplit split, long startOffset)
        throws InterruptedException, IOException {
  this(new TaskSplitIndex("", startOffset), split.getLocations(),
      split.getLength());
}