Java Code Examples for org.apache.hadoop.mapreduce.InputSplit#getLength()

The following examples show how to use org.apache.hadoop.mapreduce.InputSplit#getLength(). Each example is taken from an open source project; the source file, project, and license are noted above the code.
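Before the project-specific examples, here is a minimal sketch of the common pattern: summing split lengths to estimate a job's total input size. The class name and the hard-coded TextInputFormat are illustrative assumptions for this sketch, not taken from any project below.

import java.io.IOException;

import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SplitSizeSketch {
    // Sum the lengths (in bytes) of all input splits for a job.
    // In the org.apache.hadoop.mapreduce API, getLength() declares
    // InterruptedException in addition to IOException.
    static long totalInputBytes(Job job) throws IOException, InterruptedException {
        InputFormat<?, ?> format = new TextInputFormat(); // illustrative input format
        long total = 0;
        for (InputSplit split : format.getSplits(job)) {
            total += split.getLength(); // size of this split in bytes
        }
        return total;
    }
}

Examples 1, 3, and 4 below follow this same pattern, except they instantiate the job's configured input format via ReflectionUtils rather than hard-coding one.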
Example 1
Source File: AbstractHadoopJob.java    From kylin-on-parquet-v2 with Apache License 2.0
public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    
    // 0 input bytes is possible when the segment range hits no partition on a partitioned hive table (KYLIN-2470) 
    if (mapInputBytes == 0) {
        logger.warn("Map input splits are 0 bytes, something is wrong?");
    }
    
    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}
 
Example 2
Source File: MapRedUtil.java    From spork with Apache License 2.0
public String inputSplitToString(InputSplit[] splits) throws IOException, InterruptedException {
    // debugging purpose only
    StringBuilder st = new StringBuilder();
    st.append("Number of splits :" + splits.length + "\n");
    long len = 0;
    for (InputSplit split : splits)
        len += split.getLength();
    st.append("Total Length = " + len + "\n");
    for (int i = 0; i < splits.length; i++) {
        st.append("Input split[" + i + "]:\n   Length = " + splits[i].getLength() + "\n  Locations:\n");
        for (String location : splits[i].getLocations())
            st.append("    " + location + "\n");
        st.append("\n-----------------------\n");
    }
    }
    return st.toString();
}
 
Example 3
Source File: AbstractHadoopJob.java    From kylin with Apache License 2.0
public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    
    // 0 input bytes is possible when the segment range hits no partition on a partitioned hive table (KYLIN-2470) 
    if (mapInputBytes == 0) {
        logger.warn("Map input splits are 0 bytes, something is wrong?");
    }
    
    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}
 
Example 4
Source File: AbstractHadoopJob.java    From Kylin with Apache License 2.0
protected double getTotalMapInputMB() throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    if (mapInputBytes == 0) {
        throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!");
    }
    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}
 
Example 5
Source File: AggregateXMLReader.java    From marklogic-contentpump with Apache License 2.0
protected void initStreamReader(InputSplit inSplit) throws IOException,
    InterruptedException {
    start = 0;
    end = inSplit.getLength();
    overflow = false;
    fInputStream = openFile(inSplit, true);
    if (fInputStream == null) {
        return;
    }

    try {
        xmlSR = f.createXMLStreamReader(fInputStream, encoding);
    } catch (XMLStreamException e) {
        LOG.error(e.getMessage(), e);
    }

    if (useAutomaticId) {
        idGen = new IdGenerator(file.toUri().getPath() + "-"
            + ((FileSplit) inSplit).getStart());
    }
}
 
Example 6
Source File: BinaryLoader.java    From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    bytesTotal = inSplit.getLength();
    Path file = ((FileSplit)inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);
    key.set(file.toString());
    byte[] buf = new byte[(int)inSplit.getLength()];
    System.out.println("split length: " + inSplit.getLength());
    try {
        fileIn.readFully(buf);
        value.set(buf, 0, (int) inSplit.getLength());
        System.out.println("value length: " + value.getBytes().length);
        
        hasNext = true;    
    } catch (Exception e) {
        hasNext = false;
    } finally {
        fileIn.close();
    }
}
 
Example 7
Source File: DelimitedTextReader.java    From marklogic-contentpump with Apache License 2.0
protected void initParser(InputSplit inSplit) throws IOException,
    InterruptedException {
    fileIn = openFile(inSplit, true);
    if (fileIn == null) {
        return;
    }
    instream = new InputStreamReader(fileIn, encoding);

    bytesRead = 0;
    fileLen = inSplit.getLength();
    if (uriName == null) {
        generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
        if (generateId) {
            idGen = new IdGenerator(file.toUri().getPath() + "-"
                + ((FileSplit) inSplit).getStart());
        } else {
            uriId = 0;
        }
    }
    parser = new CSVParser(instream,
            CSVParserFormatter.getFormat(delimiter, encapsulator, true, true));
    parserIterator = parser.iterator();
}
 
Example 8
Source File: HadoopFormatIO.java    From beam with Apache License 2.0
/**
 * This is a helper function to compute splits. This method will also calculate the size of the
 * data being read. Note: this method is executed exactly once and the splits are retrieved and
 * cached in this source. These splits are further used by split() and getEstimatedSizeBytes().
 */
@VisibleForTesting
void computeSplitsIfNecessary() throws IOException, InterruptedException {
  if (inputSplits != null) {
    return;
  }
  createInputFormatInstance();
  List<InputSplit> splits = inputFormatObj.getSplits(Job.getInstance(conf.get()));
  if (splits == null) {
    throw new IOException("Error in computing splits, getSplits() returns null.");
  }
  if (splits.isEmpty()) {
    throw new IOException("Error in computing splits, getSplits() returns a empty list");
  }
  boundedSourceEstimatedSize = 0;
  inputSplits = new ArrayList<>();
  for (InputSplit inputSplit : splits) {
    if (inputSplit == null) {
      throw new IOException(
          "Error in computing splits, split is null in InputSplits list "
              + "populated by getSplits() : ");
    }
    boundedSourceEstimatedSize += inputSplit.getLength();
    inputSplits.add(new SerializableSplit(inputSplit));
  }
}
 
Example 9
Source File: CSVReaderBase.java    From datawave with Apache License 2.0
public void initializeTotalSize(final InputSplit genericSplit) throws IOException {
    try {
        totalSize = genericSplit.getLength() * 4L;
    } catch (InterruptedException ex) {
        throw new IOException("Interrupted Exception thrown while attempting to get split length", ex);
    }
}
 
Example 10
Source File: JobSplit.java    From big-c with Apache License 2.0
public SplitMetaInfo(InputSplit split, long startOffset) throws IOException {
  try {
    this.locations = split.getLocations();
    this.inputDataLength = split.getLength();
    this.startOffset = startOffset;
  } catch (InterruptedException ie) {
    throw new IOException(ie);
  }
}
 
Example 11
Source File: SequenceFileAsBinaryInputFormat.java    From big-c with Apache License 2.0
public void initialize(InputSplit split, TaskAttemptContext context) 
    throws IOException, InterruptedException {
  Path path = ((FileSplit)split).getPath();
  Configuration conf = context.getConfiguration();
  FileSystem fs = path.getFileSystem(conf);
  this.in = new SequenceFile.Reader(fs, path, conf);
  this.end = ((FileSplit)split).getStart() + split.getLength();
  if (((FileSplit)split).getStart() > in.getPosition()) {
    in.sync(((FileSplit)split).getStart());    // sync to start
  }
  this.start = in.getPosition();
  vbytes = in.createValueBytes();
  done = start >= end;
}
 
Example 12
Source File: CompositeInputSplit.java    From big-c with Apache License 2.0
/**
 * Add an InputSplit to this collection.
 * @throws IOException If capacity was not specified during construction
 *                     or if capacity has been reached.
 */
public void add(InputSplit s) throws IOException, InterruptedException {
  if (null == splits) {
    throw new IOException("Uninitialized InputSplit");
  }
  if (fill == splits.length) {
    throw new IOException("Too many splits");
  }
  splits[fill++] = s;
  totsize += s.getLength();
}
 
Example 13
Source File: DelimitedJSONReader.java    From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    /* Initialization in super class */
    initConfig(context);  
    /*  Get file(s) in input split */
    setFile(((FileSplit) inSplit).getPath());
    // Initialize reader properties
    generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI,false);
    if (generateId){
        idGen = new IdGenerator(file.toUri().getPath() + "-"
                + ((FileSplit) inSplit).getStart()); 
    } else {
        uriName = conf.get(CONF_INPUT_URI_ID, null);
        mapper = new ObjectMapper();
    }
    bytesRead = 0;
    totalBytes = inSplit.getLength();
    /* Check file status */
    fs = file.getFileSystem(context.getConfiguration());
    FileStatus status = fs.getFileStatus(file);
    if (status.isDirectory()) {
        iterator = new FileIterator((FileSplit)inSplit, context);
        inSplit = iterator.next();
    }
    /* Initialize buffered reader */
    initFileStream(inSplit);
}
 
Example 14
Source File: RDFReader.java    From marklogic-contentpump with Apache License 2.0
protected void initStream(InputSplit inSplit)
        throws IOException, InterruptedException {
    FSDataInputStream in = openFile(inSplit, false);
    if (in == null) {
        return;
    }
    long size = inSplit.getLength();
    initParser(file.toUri().toASCIIString(), size);
    parse(file.getName(), in);
}
 
Example 15
Source File: TezGroupedSplit.java    From incubator-tez with Apache License 2.0
public void addSplit(InputSplit split) {
  wrappedSplits.add(split);
  try {
    length += split.getLength();
  } catch (Exception e) {
    throw new TezUncheckedException(e);
  }
}
 
Example 16
Source File: SequenceFileAsBinaryInputFormat.java    From hadoop with Apache License 2.0
public void initialize(InputSplit split, TaskAttemptContext context) 
    throws IOException, InterruptedException {
  Path path = ((FileSplit)split).getPath();
  Configuration conf = context.getConfiguration();
  FileSystem fs = path.getFileSystem(conf);
  this.in = new SequenceFile.Reader(fs, path, conf);
  this.end = ((FileSplit)split).getStart() + split.getLength();
  if (((FileSplit)split).getStart() > in.getPosition()) {
    in.sync(((FileSplit)split).getStart());    // sync to start
  }
  this.start = in.getPosition();
  vbytes = in.createValueBytes();
  done = start >= end;
}
 
Example 17
Source File: CompositeInputSplit.java    From hadoop with Apache License 2.0
/**
 * Add an InputSplit to this collection.
 * @throws IOException If capacity was not specified during construction
 *                     or if capacity has been reached.
 */
public void add(InputSplit s) throws IOException, InterruptedException {
  if (null == splits) {
    throw new IOException("Uninitialized InputSplit");
  }
  if (fill == splits.length) {
    throw new IOException("Too many splits");
  }
  splits[fill++] = s;
  totsize += s.getLength();
}
 
Example 18
Source File: TabletSplitSplit.java    From datawave with Apache License 2.0
/**
 * Add an InputSplit to this collection.
 * 
 * @throws IOException
 *             If capacity was not specified during construction or if capacity has been reached.
 * @throws InterruptedException
 *             If the split length cannot be read because the thread was interrupted.
 */
public void add(InputSplit s) throws IOException, InterruptedException {
    if (null == splits) {
        throw new IOException("Uninitialized InputSplit");
    }
    if (fill == splits.length) {
        throw new IOException("Too many splits");
    }
    
    splits[fill++] = s;
    totsize += s.getLength();
}
 
Example 19
Source File: WikiLoader.java    From marklogic-contentpump with Apache License 2.0
@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path file = ((FileSplit)inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);
    byte[] buf = new byte[BUFFER_SIZE];
    long bytesTotal = inSplit.getLength();
    long start = ((FileSplit)inSplit).getStart();
    fileIn.seek(start);
    long bytesRead = 0;
    StringBuilder pages = new StringBuilder();
    int sindex = -1;
    while (true) {
        int length = (int)Math.min(bytesTotal - bytesRead, buf.length);
        int read = fileIn.read(buf, 0, length);
        if (read == -1) {
            System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal +
                    "bytesRead=" + bytesRead);
            break;
        }
        bytesRead += read;  
        String temp = new String(buf, 0, read);
        if (sindex == -1) { // haven't found the start yet    
            sindex = temp.indexOf(BEGIN_PAGE_TAG);
            if (sindex > -1) {
                pages.append(temp.substring(sindex));
            }
        } else if (bytesRead < bytesTotal) { // haven't completed the split
            pages.append(temp);
        } else { // reached the end of this split
            // look for end
            int eindex = 0;
            if (temp.contains(END_DOC_TAG) || // reached the end of doc
                temp.endsWith(END_PAGE_TAG)) {
                eindex = temp.lastIndexOf(END_PAGE_TAG);
                pages.append(temp.substring(0, 
                    eindex + END_PAGE_TAG.length()));   
                System.out.println("Found end of doc.");
            } else { // need to read ahead to look for end of page
                while (true) {
                    read = fileIn.read(buf, 0, READ_AHEAD_SIZE);
                    if (read == -1) { // no more to read
                        System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal +
                                "bytesRead=" + bytesRead);
                        System.out.println(temp);
                        break;
                    }
                    bytesRead += read;
                    // look for end
                    temp = new String(buf, 0, read);
                    eindex = temp.indexOf(END_PAGE_TAG);
                    if (eindex > -1) {
                        pages.append(temp.substring(0, 
                                eindex + END_PAGE_TAG.length()));
                        break;
                    } else {
                        pages.append(temp);
                    }
                }
            }
            break;
        }
    }
    fileIn.close();
    articles = WikiModelProcessor.process(pages);
}
 
Example 20
Source File: JobSplit.java    From big-c with Apache License 2.0
public TaskSplitMetaInfo(InputSplit split, long startOffset)
        throws InterruptedException, IOException {
  this(new TaskSplitIndex("", startOffset), split.getLocations(),
      split.getLength());
}