// Copyright (c) 2010 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

// File created: 2010-08-03 11:50:19

package org.seqdoop.hadoop_bam;

import htsjdk.samtools.AbstractBAMFileIndex;
import htsjdk.samtools.BAMFileReader;
import htsjdk.samtools.BAMFileSpan;
import htsjdk.samtools.BAMIndex;
import htsjdk.samtools.Chunk;
import htsjdk.samtools.LinearBAMIndex;
import htsjdk.samtools.LinearIndex;
import htsjdk.samtools.QueryInterval;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileSpan;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.Locatable;

import java.io.IOException;
import java.nio.file.ProviderNotFoundException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.seqdoop.hadoop_bam.util.IntervalUtil;
import org.seqdoop.hadoop_bam.util.NIOFileUtil;
import org.seqdoop.hadoop_bam.util.SAMHeaderReader;
import org.seqdoop.hadoop_bam.util.WrapSeekable;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** An {@link org.apache.hadoop.mapreduce.InputFormat} for BAM files. Values
 * are the individual records; see {@link BAMRecordReader} for the meaning of
 * the key.
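 *
 * <p>A minimal usage sketch (the job name and input path are placeholders):
 * <pre>{@code
 * Configuration conf = new Configuration();
 * Job job = Job.getInstance(conf, "bam-example");
 * job.setInputFormatClass(BAMInputFormat.class);
 * FileInputFormat.addInputPath(job, new Path("hdfs:///data/sample.bam"));
 * }</pre>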
 */
public class BAMInputFormat
	extends FileInputFormat<LongWritable,SAMRecordWritable>
{
	private static final Logger logger = LoggerFactory.getLogger(BAMInputFormat.class);

	/**
	 * If set to true, only include reads that overlap the given intervals (if specified),
	 * and unplaced unmapped reads (if specified). For programmatic use
	 * {@link #setTraversalParameters(Configuration, List, boolean)} should be preferred.
	 */
	public static final String BOUNDED_TRAVERSAL_PROPERTY = "hadoopbam.bam.bounded-traversal";

	/**
	 * If set to true, enables the use of BAM indices to calculate splits.
	 * For programmatic use
	 * {@link #setEnableBAISplitCalculator(Configuration, boolean)} should be preferred.
	 * By default, this split calculator is disabled in favor of the splitting-bai calculator.
	 */
	public static final String ENABLE_BAI_SPLIT_CALCULATOR = "hadoopbam.bam.enable-bai-splitter";
    
	/**
	 * Filter by region, like <code>-L</code> in SAMtools. Takes a comma-separated
	 * list of intervals, e.g. <code>chr1:1-20000,chr2:12000-20000</code>. For
	 * programmatic use {@link #setIntervals(Configuration, List)} should be preferred.
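	 *
	 * <p>For example (the interval values below are placeholders):
	 * <pre>{@code
	 * conf.set(BAMInputFormat.INTERVALS_PROPERTY, "chr1:1-20000,chr2:12000-20000");
	 * }</pre>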
	 */
	public static final String INTERVALS_PROPERTY = "hadoopbam.bam.intervals";

	/**
	 * If set to true, include unplaced unmapped reads (that is, unmapped reads with no
	 * position). For programmatic use
	 * {@link #setTraversalParameters(Configuration, List, boolean)} should be preferred.
	 */
	public static final String TRAVERSE_UNPLACED_UNMAPPED_PROPERTY = "hadoopbam.bam.traverse-unplaced-unmapped";

	/**
	 * If set to true, use the Intel inflater for decompressing DEFLATE compressed streams.
	 * If set, the <a href="https://github.com/Intel-HLS/GKL">GKL library</a> must be
	 * provided on the classpath.
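	 *
	 * <p>For example:
	 * <pre>{@code
	 * conf.setBoolean(BAMInputFormat.USE_INTEL_INFLATER_PROPERTY, true);
	 * }</pre>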
	 */
	public static final String USE_INTEL_INFLATER_PROPERTY = "hadoopbam.bam.use-intel-inflater";

	/**
	 * Only include reads that overlap the given intervals. Unplaced unmapped reads are not
	 * included.
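	 *
	 * <p>A minimal sketch (the interval values are placeholders):
	 * <pre>{@code
	 * BAMInputFormat.setIntervals(conf,
	 *     Collections.singletonList(new Interval("chr1", 1, 20000)));
	 * }</pre>
	 *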
	 * @param conf the Hadoop configuration to set properties on
	 * @param intervals the intervals to filter by
	 * @param <T> the {@link Locatable} type
	 */
	public static <T extends Locatable> void setIntervals(Configuration conf,
			List<T> intervals) {
		setTraversalParameters(conf, intervals, false);
	}

	/**
	 * Enables or disables the split calculator that uses the BAM index (.bai) to
	 * calculate splits.
	 * @param conf the Hadoop configuration to set properties on
	 * @param setEnabled whether to enable the BAI split calculator
	 */
	public static void setEnableBAISplitCalculator(Configuration conf,
			boolean setEnabled) {
		conf.setBoolean(ENABLE_BAI_SPLIT_CALCULATOR, setEnabled);
	}

	/**
	 * Only include reads that overlap the given intervals (if specified) and unplaced
	 * unmapped reads (if <code>true</code>).
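	 *
	 * <p>A minimal sketch (the interval values are placeholders):
	 * <pre>{@code
	 * BAMInputFormat.setTraversalParameters(conf,
	 *     Collections.singletonList(new Interval("chr2", 12000, 20000)), true);
	 * }</pre>
	 *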
	 * @param conf the Hadoop configuration to set properties on
	 * @param intervals the intervals to filter by, or <code>null</code> if all reads
	 *   are to be included (in which case <code>traverseUnplacedUnmapped</code> must be
	 *   <code>true</code>)
	 * @param traverseUnplacedUnmapped whether to include unplaced unmapped reads
	 * @param <T> the {@link Locatable} type
	 */
	public static <T extends Locatable> void setTraversalParameters(Configuration conf,
			List<T> intervals, boolean traverseUnplacedUnmapped) {
		if (intervals == null && !traverseUnplacedUnmapped) {
			throw new IllegalArgumentException("Traversing mapped reads only is not supported.");
		}
		conf.setBoolean(BOUNDED_TRAVERSAL_PROPERTY, true);
		if (intervals != null) {
			StringBuilder sb = new StringBuilder();
			for (Iterator<T> it = intervals.iterator(); it.hasNext(); ) {
				Locatable l = it.next();
				sb.append(String.format("%s:%d-%d", l.getContig(), l.getStart(), l.getEnd()));
				if (it.hasNext()) {
					sb.append(",");
				}
			}
			conf.set(INTERVALS_PROPERTY, sb.toString());
		}
		conf.setBoolean(TRAVERSE_UNPLACED_UNMAPPED_PROPERTY, traverseUnplacedUnmapped);
	}

	/**
	 * Reset traversal parameters so that all reads are included.
	 * @param conf the Hadoop configuration to set properties on
	 */
	public static void unsetTraversalParameters(Configuration conf) {
		conf.unset(BOUNDED_TRAVERSAL_PROPERTY);
		conf.unset(INTERVALS_PROPERTY);
		conf.unset(TRAVERSE_UNPLACED_UNMAPPED_PROPERTY);
	}

	static boolean isBoundedTraversal(Configuration conf) {
		return conf.getBoolean(BOUNDED_TRAVERSAL_PROPERTY, false) ||
				conf.get(INTERVALS_PROPERTY) != null; // backwards compatibility
	}

	static boolean traverseUnplacedUnmapped(Configuration conf) {
		return conf.getBoolean(TRAVERSE_UNPLACED_UNMAPPED_PROPERTY, false);
	}

	static List<Interval> getIntervals(Configuration conf) {
		return IntervalUtil.getIntervals(conf, INTERVALS_PROPERTY);
	}

	static boolean useIntelInflater(Configuration conf) {
		return conf.getBoolean(USE_INTEL_INFLATER_PROPERTY, false);
	}

	static Path getIdxPath(Path path) {
		return path.suffix(SplittingBAMIndexer.OUTPUT_FILE_EXTENSION);
	}

	static List<InputSplit> removeIndexFiles(List<InputSplit> splits) {
		// Remove any splitting bai files and any regular BAM index (.bai) files
		return splits.stream()
				.filter(split -> !((FileSplit) split).getPath().getName().endsWith(
						SplittingBAMIndexer.OUTPUT_FILE_EXTENSION))
				.filter(split -> !((FileSplit) split).getPath().getName().endsWith(
						BAMIndex.BAMIndexSuffix))
				.collect(Collectors.toList());
	}

	static Path getBAIPath(Path path) {
		return path.suffix(BAMIndex.BAMIndexSuffix);
	}

	/** Returns a {@link BAMRecordReader} initialized with the parameters. */
	@Override public RecordReader<LongWritable,SAMRecordWritable>
		createRecordReader(InputSplit split, TaskAttemptContext ctx)
		throws InterruptedException, IOException
	{
		final RecordReader<LongWritable,SAMRecordWritable> rr =
			new BAMRecordReader();
		rr.initialize(split, ctx);
		return rr;
	}

	/** The splits returned are {@link FileVirtualSplit FileVirtualSplits}. */
	@Override public List<InputSplit> getSplits(JobContext job)
		throws IOException
	{
		return getSplits(super.getSplits(job), job.getConfiguration());
	}

	public List<InputSplit> getSplits(
			List<InputSplit> splits, Configuration cfg)
		throws IOException
	{

		final List<InputSplit> origSplits = removeIndexFiles(splits);

		// Align the splits so that they don't cross blocks.

		// addIndexedSplits() requires the given splits to be sorted by file
		// path, so do so. Although FileInputFormat.getSplits() does, at the time
		// of writing this, generate them in that order, we shouldn't rely on it.
		Collections.sort(origSplits, new Comparator<InputSplit>() {
			public int compare(InputSplit a, InputSplit b) {
				FileSplit fa = (FileSplit)a, fb = (FileSplit)b;
				return fa.getPath().compareTo(fb.getPath());
			}
		});

		final List<InputSplit> newSplits =
			new ArrayList<InputSplit>(origSplits.size());

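		// Try the precomputed splitting-bai index first; if it is missing or
		// unreadable, fall back to the .bai-based split calculator (when
		// enabled), and finally to heuristically guessing record boundaries.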
		for (int i = 0; i < origSplits.size();) {
			try {
				i = addIndexedSplits(origSplits, i, newSplits, cfg);
			} catch (IOException | ProviderNotFoundException e) {
				if (cfg.getBoolean(ENABLE_BAI_SPLIT_CALCULATOR, false)) {
					try {
						i = addBAISplits(origSplits, i, newSplits, cfg);
					} catch (IOException | ProviderNotFoundException e2) {
						i = addProbabilisticSplits(origSplits, i, newSplits, cfg);
					}
				} else {
					i = addProbabilisticSplits(origSplits, i, newSplits, cfg);
				}
			}
		}
		return filterByInterval(newSplits, cfg);
	}

	// Handles all the splits that share the Path of the one at index i,
	// returning the next index to be used.
	private int addIndexedSplits(
			List<InputSplit> splits, int i, List<InputSplit> newSplits,
			Configuration cfg)
		throws IOException
	{
		final Path file = ((FileSplit)splits.get(i)).getPath();
		List<InputSplit> potentialSplits = new ArrayList<InputSplit>();

		final SplittingBAMIndex idx = new SplittingBAMIndex(
			file.getFileSystem(cfg).open(getIdxPath(file)));

		int splitsEnd = splits.size();
		for (int j = i; j < splitsEnd; ++j)
			if (!file.equals(((FileSplit)splits.get(j)).getPath()))
				splitsEnd = j;

		if (idx.size() == 1) { // no alignments, only the file size, so no splits to add
			return splitsEnd;
		}

		for (int j = i; j < splitsEnd; ++j) {
			final FileSplit fileSplit = (FileSplit)splits.get(j);

			final long start =         fileSplit.getStart();
			final long end   = start + fileSplit.getLength();

			final Long blockStart = idx.nextAlignment(start);

			// The last split needs to end where the last alignment ends, but the
			// index doesn't store that data (whoops); we only know where the last
			// alignment begins. Fortunately there's no need to change the index
			// format for this: we can just set the end to the maximal length of
			// the final BGZF block (0xffff), and then read until BAMRecordCodec
			// hits EOF.
			Long blockEnd;
			if (j == splitsEnd - 1) {
				blockEnd = idx.prevAlignment(end) | 0xffff;
			} else {
				blockEnd = idx.nextAlignment(end);
			}

			if (blockStart == null || blockEnd == null) {
				logger.warn("Index for {} was not usable; generating probabilistic splits.", file);
				return addProbabilisticSplits(splits, i, newSplits, cfg);
			}

			potentialSplits.add(new FileVirtualSplit(
						file, blockStart, blockEnd, fileSplit.getLocations()));
		}

		for (InputSplit s : potentialSplits) {
			newSplits.add(s);
		}
		return splitsEnd;
	}

	// Handles all the splits that share the Path of the one at index i,
	// returning the next index to be used.
	private int addBAISplits(List<InputSplit> splits,
			int i,
			List<InputSplit> newSplits,
			Configuration conf) throws IOException {
		int splitsEnd = i;

		final Path path = ((FileSplit)splits.get(i)).getPath();
		final Path baiPath = getBAIPath(path);
		final FileSystem fs = path.getFileSystem(conf);
		final Path sinPath;
		if (fs.exists(baiPath)) {
			sinPath = baiPath;
		} else {
			// fall back to the naming scheme where .bam is replaced by .bai
			sinPath = new Path(path.toString().replaceFirst("\\.bam$", BAMIndex.BAMIndexSuffix));
		}
		try (final FSDataInputStream in = fs.open(path);
				final SeekableStream guesserSin = WrapSeekable.openPath(fs, path);
				final SeekableStream sin = WrapSeekable.openPath(fs, sinPath)) {

			SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(in, conf);
			SAMSequenceDictionary dict = header.getSequenceDictionary();

			final BAMSplitGuesser guesser = new BAMSplitGuesser(guesserSin, conf);

			final LinearBAMIndex idx = new LinearBAMIndex(sin, dict);

			// Search for the first contig that contains linear bins. A contig
			// has no linear bins if no reads are mapped to it (e.g., reads were
			// aligned to a whole genome, and then reads from only a single
			// contig were selected).
			int ctgIdx = -1;
			int bin = 0;
			LinearIndex linIdx;
			int ctgBins;
			long lastStart = 0;
			do {
				ctgIdx++;
				linIdx = idx.getLinearIndex(ctgIdx);
				ctgBins = linIdx.size();
			} while (ctgBins == 0);
			long nextStart = linIdx.get(bin);

			FileVirtualSplit newSplit = null;
			boolean lastWasGuessed = false;

			// Loop over and process all of the splits that share a single .bai.
			while (splitsEnd < splits.size() &&
					((FileSplit)splits.get(splitsEnd)).getPath().equals(path)) {
				FileSplit fSplit = (FileSplit)splits.get(splitsEnd);
				splitsEnd++;

				if (splitsEnd >= splits.size()) {
					break;
				}

				// BGZF virtual offsets pack the compressed block offset into the
				// upper 48 bits, hence the shift by 16.
				long fSplitEnd = (fSplit.getStart() + fSplit.getLength()) << 16;
				lastStart = nextStart;

				// We need to advance and find the first linear index bin that
				// starts after the current split ends; this is the end of our
				// split.
				while (nextStart < fSplitEnd && ctgIdx < dict.size()) {

					// Are we going off of the end of this contig? If so,
					// advance to the next contig with a linear bin.
					if (bin + 1 >= ctgBins) {
						do {
							ctgIdx += 1;
							bin = 0;
							if (ctgIdx >= dict.size()) {
								break;
							}
							linIdx = idx.getLinearIndex(ctgIdx);
							ctgBins = linIdx.size();
						} while (ctgBins == 0);
					}
					if (ctgIdx < dict.size() && linIdx.size() > bin) {
						nextStart = linIdx.get(bin);
						bin++;
					}
				}

				// Is this the first split? If so, it ranges from where the
				// reads start to the identified end.
				if (fSplit.getStart() == 0) {
					try (final SeekableStream inFile =
							WrapSeekable.openPath(path.getFileSystem(conf), path)) {
						SamReader open = SamReaderFactory.makeDefault().setUseAsyncIo(false)
								.open(SamInputResource.of(inFile));
						SAMFileSpan span = open.indexing().getFilePointerSpanningReads();
						long bamStart = ((BAMFileSpan) span).getFirstOffset();
						newSplit = new FileVirtualSplit(fSplit.getPath(),
								bamStart,
								nextStart - 1,
								fSplit.getLocations());
						newSplits.add(newSplit);
					}
				} else {

					// Did we find any blocks that started in the last split?
					// If yes, then we're fine. If no, then we need to guess a
					// split start (in the else clause).
					if (lastStart != nextStart) {
						if (lastWasGuessed) {
							newSplit.setEndVirtualOffset(lastStart - 1);
							lastWasGuessed = false;
						}
						newSplit = new FileVirtualSplit(fSplit.getPath(),
								lastStart,
								nextStart - 1,
								fSplit.getLocations());
						newSplits.add(newSplit);
					} else {
						// Guess the start.
						long alignedBeg = guesser.guessNextBAMRecordStart(fSplit.getStart(),
								fSplit.getStart() + fSplit.getLength());
						newSplit.setEndVirtualOffset(alignedBeg - 1);
						lastStart = alignedBeg;
						nextStart = alignedBeg;
						newSplit = new FileVirtualSplit(fSplit.getPath(),
								alignedBeg,
								alignedBeg + 1,
								fSplit.getLocations());
						lastWasGuessed = true;
						newSplits.add(newSplit);
					}
				}
				lastStart = nextStart;
			}
			// Clean up the last split.
			if (splitsEnd == splits.size()) {
				if (lastWasGuessed) {
					newSplit.setEndVirtualOffset(lastStart - 1);
					lastWasGuessed = false;
				}
				FileSplit fSplit = (FileSplit)splits.get(splitsEnd - 1);
				long fSplitEnd = (fSplit.getStart() + fSplit.getLength()) << 16;
				newSplit = new FileVirtualSplit(fSplit.getPath(),
						lastStart,
						fSplitEnd,
						fSplit.getLocations());
				newSplits.add(newSplit);
			}
		}
		// splitsEnd is now the index of the first split that does not belong
		// to this file (or splits.size() if this file was the last one).
		return splitsEnd;
	}
        
	// Handles all the splits that share the Path of the one at index i, just
	// as addIndexedSplits does, so that the file need not be reopened (nor
	// checked for an index) once per split. Returns the next index to be used.
	private int addProbabilisticSplits(
			List<InputSplit> splits, int i, List<InputSplit> newSplits,
			Configuration cfg)
		throws IOException
	{
		final Path path = ((FileSplit)splits.get(i)).getPath();
		try (final SeekableStream sin = WrapSeekable.openPath(path.getFileSystem(cfg), path)) {

			final BAMSplitGuesser guesser = new BAMSplitGuesser(sin, cfg);

			FileVirtualSplit previousSplit = null;

			for (; i < splits.size(); ++i) {
				FileSplit fspl = (FileSplit)splits.get(i);
				if (!fspl.getPath().equals(path))
					break;

				long beg =       fspl.getStart();
				long end = beg + fspl.getLength();

				long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);

				// As the guesser goes to the next BGZF block before looking for BAM
				// records, the ending BGZF blocks have to always be traversed fully.
				// Hence force the length to be 0xffff, the maximum possible.
				long alignedEnd = end << 16 | 0xffff;

				if (alignedBeg == end) {
					// No records detected in this split: merge it into the previous
					// one. This could legitimately happen e.g. if we have a split
					// that is so small that it only contains the middle part of a
					// BGZF block.
					//
					// Of course, if it's the first split, then this is simply not a
					// valid BAM file.
					//
					// FIXME: In theory, any number of splits could contain only
					// parts of the BAM header before we start to see splits that
					// contain BAM records. For now, we require that the split size
					// is at least as big as the header and don't handle that case.
					if (previousSplit == null)
						throw new IOException("'" + path + "': " +
							"no reads in first split: bad BAM file or tiny split size?");

					previousSplit.setEndVirtualOffset(alignedEnd);
				} else {
					previousSplit = new FileVirtualSplit(
							path, alignedBeg, alignedEnd, fspl.getLocations());
					if (logger.isDebugEnabled()) {
						final long byteOffset   = alignedBeg >>> 16;
						final long recordOffset = alignedBeg &  0xffff;
						logger.debug(
							"Split {}: byte offset: {} record offset: {}, virtual offset: {}",
							i, byteOffset, recordOffset, alignedBeg);
					}
					newSplits.add(previousSplit);
				}
			}
		}
		return i;
	}

	private List<InputSplit> filterByInterval(List<InputSplit> splits, Configuration conf)
			throws IOException {
		if (!isBoundedTraversal(conf)) {
			return splits;
		}

		// Get the chunk lists (BAMFileSpans) in the intervals we want (chunks give start
		// and end file pointers into a BAM file) by looking in all the indexes for the BAM
		// files
		Set<Path> bamFiles = new LinkedHashSet<>();
		for (InputSplit split : splits) {
			bamFiles.add(((FileVirtualSplit) split).getPath());
		}
		Map<Path, BAMFileSpan> fileToSpan = new LinkedHashMap<>();
		SamReaderFactory readerFactory = SamReaderFactory.makeDefault()
				.setOption(SamReaderFactory.Option.CACHE_FILE_BASED_INDEXES, true)
				.setOption(SamReaderFactory.Option.EAGERLY_DECODE, false)
				.setUseAsyncIo(false);

		List<Interval> intervals = getIntervals(conf);

		Map<Path, Long> fileToUnmapped = new LinkedHashMap<>();
		boolean traverseUnplacedUnmapped = traverseUnplacedUnmapped(conf);

		for (Path bamFile : bamFiles) {
			FileSystem fs = bamFile.getFileSystem(conf);

			try (SamReader samReader =
							 readerFactory.open(NIOFileUtil.asPath(fs.makeQualified(bamFile).toUri()))) {
				if (!samReader.hasIndex()) {
					throw new IllegalArgumentException("Intervals set but no BAM index file found for " + bamFile);

				}

				try (FSDataInputStream in = fs.open(bamFile)) {
					SAMFileHeader header = SAMHeaderReader.readSAMHeaderFrom(in, conf);
					SAMSequenceDictionary dict = header.getSequenceDictionary();
					BAMIndex idx = samReader.indexing().getIndex();

					if (intervals != null && !intervals.isEmpty()) {
						QueryInterval[] queryIntervals = prepareQueryIntervals(intervals, dict);
						fileToSpan.put(bamFile, BAMFileReader.getFileSpan(queryIntervals, idx));
					}

					if (traverseUnplacedUnmapped) {
						long startOfLastLinearBin = idx.getStartOfLastLinearBin();
						long noCoordinateCount = ((AbstractBAMFileIndex) idx).getNoCoordinateCount();
						if (startOfLastLinearBin != -1 && noCoordinateCount > 0) {
							// add FileVirtualSplit (with no intervals) from startOfLastLinearBin to
							// end of file
							fileToUnmapped.put(bamFile, startOfLastLinearBin);
						}
					}
				}

			}
		}

		// Use the chunks to filter the splits
		List<InputSplit> filteredSplits = new ArrayList<>();
		for (InputSplit split : splits) {
			FileVirtualSplit virtualSplit = (FileVirtualSplit) split;
			long splitStart = virtualSplit.getStartVirtualOffset();
			long splitEnd = virtualSplit.getEndVirtualOffset();
			BAMFileSpan splitSpan = new BAMFileSpan(new Chunk(splitStart, splitEnd));
			BAMFileSpan span = fileToSpan.get(virtualSplit.getPath());
			if (span == null) {
				continue;
			}
			span = (BAMFileSpan) span.removeContentsBefore(splitSpan);
			span = (BAMFileSpan) span.removeContentsAfter(splitSpan);
			if (!span.getChunks().isEmpty()) {
				filteredSplits.add(new FileVirtualSplit(virtualSplit.getPath(), splitStart, splitEnd,
						virtualSplit.getLocations(), span.toCoordinateArray()));
			}
		}

		if (traverseUnplacedUnmapped) {
			// add extra splits that contain only unmapped reads
			for (Map.Entry<Path, Long> e : fileToUnmapped.entrySet()) {
				Path file = e.getKey();
				long unmappedStart = e.getValue();
				boolean foundFirstSplit = false;
				for (InputSplit split : splits) { // TODO: are splits in order of start position?
					FileVirtualSplit virtualSplit = (FileVirtualSplit) split;
					if (virtualSplit.getPath().equals(file)) {
						long splitStart = virtualSplit.getStartVirtualOffset();
						long splitEnd = virtualSplit.getEndVirtualOffset();
						if (foundFirstSplit) {
							filteredSplits.add(new FileVirtualSplit(virtualSplit.getPath(), splitStart, splitEnd,
									virtualSplit.getLocations()));
						} else if (splitStart <= unmappedStart && unmappedStart <= splitEnd) {
							filteredSplits.add(new FileVirtualSplit(virtualSplit.getPath(), unmappedStart, splitEnd,
									virtualSplit.getLocations()));
							foundFirstSplit = true;
						}
					}
				}
			}
		}

		return filteredSplits;
	}

	/**
	 * Converts a list of {@link Interval}s into the format required by the SamReader query API.
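	 *
	 * <p>A minimal sketch (the interval and dictionary values are placeholders):
	 * <pre>{@code
	 * QueryInterval[] query = prepareQueryIntervals(
	 *     Collections.singletonList(new Interval("chr1", 1, 20000)),
	 *     header.getSequenceDictionary());
	 * }</pre>
	 *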
	 * @param rawIntervals the intervals to be converted
	 * @param sequenceDictionary the sequence dictionary used to map contig names to indices
	 * @return a sorted, merged array of QueryIntervals suitable for passing to the SamReader
	 *   query API, or <code>null</code> if no intervals were given
	 */
	static QueryInterval[] prepareQueryIntervals(final List<Interval> rawIntervals,
			final SAMSequenceDictionary sequenceDictionary) {
		if ( rawIntervals == null || rawIntervals.isEmpty() ) {
			return null;
		}

		// Convert each SimpleInterval to a QueryInterval
		final QueryInterval[] convertedIntervals =
				rawIntervals.stream()
						.map(rawInterval -> convertSimpleIntervalToQueryInterval(rawInterval, sequenceDictionary))
						.toArray(QueryInterval[]::new);

		// Intervals must be optimized (sorted and merged) in order to use the htsjdk query API
		return QueryInterval.optimizeIntervals(convertedIntervals);
	}

	/**
	 * Converts an {@link Interval} into an htsjdk QueryInterval.
	 *
	 * In doing so, a header lookup is performed to convert from contig name to index
	 *
	 * @param interval interval to convert
	 * @param sequenceDictionary sequence dictionary used to perform the conversion
	 * @return an equivalent interval in QueryInterval format
	 */
	private static QueryInterval convertSimpleIntervalToQueryInterval(final Interval interval,
			final SAMSequenceDictionary sequenceDictionary) {
		if (interval == null) {
			throw new IllegalArgumentException("interval may not be null");
		}
		if (sequenceDictionary == null) {
			throw new IllegalArgumentException("sequence dictionary may not be null");
		}

		final int contigIndex = sequenceDictionary.getSequenceIndex(interval.getContig());
		if ( contigIndex == -1 ) {
			throw new IllegalArgumentException("Contig " + interval.getContig() + " not present in reads sequence " +
					"dictionary");
		}

		return new QueryInterval(contigIndex, interval.getStart(), interval.getEnd());
	}

	@Override public boolean isSplitable(JobContext job, Path path) {
		return true;
	}
}