htsjdk.samtools.SAMTag Java Examples

The following examples show how to use htsjdk.samtools.SAMTag. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: DownsampleByDuplicateSetIntegrationTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
private int countDuplicateSets(final ReadsDataSource readsDataSource){
    int count = 0;
    String currentMoleculeId = ""; // Note we are duplex aware: 12/A different from 12/B

    final Iterator<GATKRead> iterator = readsDataSource.iterator();
    while (iterator.hasNext()){
        final GATKRead read = iterator.next();
        final String moleculeID = read.getAttributeAsString(SAMTag.MI.name());
        if (!moleculeID.equals(currentMoleculeId)){
            count++;
            currentMoleculeId = moleculeID;
        }
    }

    return count;
}
 
Example #2
Source File: AddOriginalAlignmentTags.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
private static void addOATag(final GATKRead read) {
    String OAValue;
    if(!read.isUnmapped()){
        OAValue = String.format("%s,%s,%s,%s,%s,%s;",
                read.getContig().replace(OA_SEPARATOR, "_"),
                read.getStart(),
                read.isReverseStrand() ? "-" : "+",
                read.getCigar().toString(),
                read.getMappingQuality(),
                read.getAttributeAsString(SAMTag.NM.name()));
    } else {
        OAValue = "*,0,*,*,0,0;";
    }

    read.setAttribute(OA_TAG_NAME, OAValue);
}
 
Example #3
Source File: PSFilter.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
/**
 * Returns input read with alignment-related info cleared
 */
private static GATKRead clearReadAlignment(final GATKRead read, final SAMFileHeader header) {
    final GATKRead newRead = new SAMRecordToGATKReadAdapter(new SAMRecord(header));
    newRead.setName(read.getName());
    newRead.setBases(read.getBases());
    newRead.setBaseQualities(read.getBaseQualities());
    if (read.isReverseStrand()) {
        SequenceUtil.reverseComplement(newRead.getBases());
        SequenceUtil.reverseQualities(newRead.getBaseQualities());
    }
    newRead.setIsUnmapped();
    newRead.setIsPaired(read.isPaired());
    if (read.isPaired()) {
        newRead.setMateIsUnmapped();
        if (read.isFirstOfPair()) {
            newRead.setIsFirstOfPair();
        } else if (read.isSecondOfPair()) {
            newRead.setIsSecondOfPair();
        }
    }
    final String readGroup = read.getReadGroup();
    if (readGroup != null) {
        newRead.setAttribute(SAMTag.RG.name(), readGroup);
    }
    return newRead;
}
 
Example #4
Source File: ReadGroupBlackListReadFilter.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Override
public boolean test( final GATKRead read ) {
    final SAMReadGroupRecord readGroup = ReadUtils.getSAMReadGroupRecord(read, samHeader);
    if ( readGroup == null ) {
        return true;
    }

    for (final String attributeType : blacklistEntries.keySet()) {

        final String attribute;
        if (SAMReadGroupRecord.READ_GROUP_ID_TAG.equals(attributeType) || SAMTag.RG.name().equals(attributeType)) {
            attribute = readGroup.getId();
        } else {
            attribute = readGroup.getAttribute(attributeType);
        }
        if (attribute != null && blacklistEntries.get(attributeType).contains(attribute)) {
            return false;
        }
    }

    return true;
}
 
Example #5
Source File: MeanQualityHistogramGeneratorUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
@Test
public void testUsingOriginalQualities() throws Exception {
    final MeanQualityByCycleSpark.HistogramGenerator hg = new MeanQualityByCycleSpark.HistogramGenerator(true);
    Assert.assertEquals(hg.useOriginalQualities, true);

    GATKRead read1 = ArtificialReadUtils.createArtificialRead("aa".getBytes(), new byte[]{50, 50}, "2M");
    hg.addRead(read1);
    Assert.assertEquals(hg.firstReadCountsByCycle, new long[0]);
    assertEqualsDoubleArray(hg.firstReadTotalsByCycle, new double[0], 1e-05);
    Assert.assertEquals(hg.secondReadCountsByCycle, new long[0]);
    assertEqualsDoubleArray(hg.secondReadTotalsByCycle, new double[0], 1e-05);

    GATKRead read2 = ArtificialReadUtils.createArtificialRead("aa".getBytes(), new byte[]{50, 50}, "2M");
    read2.setAttribute(SAMTag.OQ.name(), SAMUtils.phredToFastq(new byte[]{30, 40}));
    hg.addRead(read2);

    Assert.assertEquals(hg.firstReadCountsByCycle, new long[]{0, 1, 1});
    assertEqualsDoubleArray(hg.firstReadTotalsByCycle, new double[]{0, 30, 40}, 1e-05);
    Assert.assertEquals(hg.secondReadCountsByCycle, new long[]{0, 0, 0});
    assertEqualsDoubleArray(hg.secondReadTotalsByCycle, new double[]{0, 0, 0}, 1e-05);
}
 
Example #6
Source File: PreprocessingTools.java    From halvade with GNU General Public License v3.0 6 votes vote down vote up
public int callElPrep(String input, String output, String rg, int threads, 
        SAMRecordIterator SAMit,
        SAMFileHeader header, String dictFile, boolean updateRG, boolean keepDups, String RGID) throws InterruptedException, QualityException {
    
    SAMRecord sam;
    SAMFileWriterFactory factory = new SAMFileWriterFactory();
    SAMFileWriter Swriter = factory.makeSAMWriter(header, true, new File(input));
    
    int reads = 0;
    while(SAMit.hasNext()) {
        sam = SAMit.next();
        if(updateRG)
            sam.setAttribute(SAMTag.RG.name(), RGID);
        Swriter.addAlignment(sam);
        reads++;
    }
    Swriter.close();
    
    String customArgs = HalvadeConf.getCustomArgs(context.getConfiguration(), "elprep", "");  
    String[] command = CommandGenerator.elPrep(bin, input, output, threads, true, rg, null, !keepDups, customArgs);
    long estimatedTime = runProcessAndWait("elPrep", command);
    if(context != null)
        context.getCounter(HalvadeCounters.TIME_ELPREP).increment(estimatedTime);
    
    return reads;
}
 
Example #7
Source File: CollectJumpingLibraryMetrics.java    From picard with MIT License 5 votes vote down vote up
/**
 * Calculates the mode for outward-facing pairs, using the first SAMPLE_FOR_MODE
 * outward-facing pairs found in INPUT
 */
private double getOutieMode() {

    int samplePerFile = SAMPLE_FOR_MODE / INPUT.size();

    Histogram<Integer> histo = new Histogram<Integer>();

    for (File f : INPUT) {
        SamReader reader = SamReaderFactory.makeDefault().open(f);
        int sampled = 0;
        for (Iterator<SAMRecord> it = reader.iterator(); it.hasNext() && sampled < samplePerFile; ) {
            SAMRecord sam = it.next();
            if (!sam.getFirstOfPairFlag()) {
                continue;
            }
            // If we get here we've hit the end of the aligned reads
            if (sam.getReadUnmappedFlag() && sam.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX) {
                break;
            } else if (sam.getReadUnmappedFlag() || sam.getMateUnmappedFlag()) {
                continue;
            } else if ((sam.getAttribute(SAMTag.MQ.name()) == null ||
                    sam.getIntegerAttribute(SAMTag.MQ.name()) >= MINIMUM_MAPPING_QUALITY) &&
                    sam.getMappingQuality() >= MINIMUM_MAPPING_QUALITY &&
                    sam.getMateNegativeStrandFlag() != sam.getReadNegativeStrandFlag() &&
                    sam.getMateReferenceIndex().equals(sam.getReferenceIndex()) &&
                    SamPairUtil.getPairOrientation(sam) == PairOrientation.RF) {
                histo.increment(Math.abs(sam.getInferredInsertSize()));
                sampled++;
            }
        }
        CloserUtil.close(reader);
    }

    return histo.size() > 0 ? histo.getMode() : 0;
}
 
Example #8
Source File: HitsForInsert.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Override
public int compare(final SAMRecord rec1, final SAMRecord rec2) {
    final Integer hi1 = rec1.getIntegerAttribute(SAMTag.HI.name());
    final Integer hi2 = rec2.getIntegerAttribute(SAMTag.HI.name());
    if (hi1 == null) {
        if (hi2 == null) return 0;
        else return 1;
    } else if (hi2 == null) {
        return -1;
    } else {
        return hi1.compareTo(hi2);
    }
}
 
Example #9
Source File: ReadQueryNameComparator.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Override
public int compare(final GATKRead read1, final GATKRead read2) {
    int cmp = compareReadNames(read1, read2);
    if (cmp != 0) {
        return cmp;
    }

    final boolean r1Paired = read1.isPaired();
    final boolean r2Paired = read2.isPaired();

    if (r1Paired || r2Paired) {
        if (!r1Paired) return 1;
        else if (!r2Paired) return -1;
        else if (read1.isFirstOfPair()  && read2.isSecondOfPair()) return -1;
        else if (read1.isSecondOfPair() && read2.isFirstOfPair()) return 1;
    }

    if (read1.isReverseStrand() != read2.isReverseStrand()) {
        return (read1.isReverseStrand()? 1: -1);
    }
    if (read1.isSecondaryAlignment() != read2.isSecondaryAlignment()) {
        return read2.isSecondaryAlignment()? -1: 1;
    }
    if (read1.isSupplementaryAlignment() != read2.isSupplementaryAlignment()) {
        return read2.isSupplementaryAlignment() ? -1 : 1;
    }
    final Integer hitIndex1 = read1.getAttributeAsInteger(SAMTag.HI.name());
    final Integer hitIndex2 = read2.getAttributeAsInteger(SAMTag.HI.name());
    if (hitIndex1 != null) {
        if (hitIndex2 == null) return 1;
        else {
            cmp = hitIndex1.compareTo(hitIndex2);
            if (cmp != 0) return cmp;
        }
    } else if (hitIndex2 != null) return -1;
    return 0;
}
 
Example #10
Source File: HaplotypeBAMWriter.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/**
 * Write out a representation of this haplotype as a read
 *
 * @param haplotype a haplotype to write out, must not be null
 * @param paddedRefLoc the reference location, must not be null
 * @param isAmongBestHaplotypes true if among the best haplotypes, false if it was just one possible haplotype
 * @param callableRegion the region over which variants are being called
 */
private void writeHaplotype(final Haplotype haplotype,
                            final Locatable paddedRefLoc,
                            final boolean isAmongBestHaplotypes,
                            final Locatable callableRegion) {
    Utils.nonNull(haplotype, "haplotype cannot be null");
    Utils.nonNull(paddedRefLoc, "paddedRefLoc cannot be null");

    final SAMRecord record = new SAMRecord(output.getBAMOutputHeader());
    record.setReadBases(haplotype.getBases());
    record.setAlignmentStart(paddedRefLoc.getStart() + haplotype.getAlignmentStartHapwrtRef());
    // Use a base quality value "!" for it's display value (quality value is not meaningful)
    record.setBaseQualities(Utils.dupBytes((byte) '!', haplotype.getBases().length));
    record.setCigar(haplotype.getCigar());
    record.setMappingQuality(isAmongBestHaplotypes ? bestHaplotypeMQ : otherMQ);
    record.setReadName(output.getHaplotypeSampleTag() + uniqueNameCounter++);
    record.setAttribute(output.getHaplotypeSampleTag(), haplotype.hashCode());
    record.setReadUnmappedFlag(false);
    record.setReferenceIndex(output.getBAMOutputHeader().getSequenceIndex(paddedRefLoc.getContig()));
    record.setAttribute(SAMTag.RG.toString(), output.getHaplotypeReadGroupID());
    record.setFlags(SAMFlag.READ_REVERSE_STRAND.intValue());
    if (callableRegion != null) {
        record.setAttribute(AssemblyBasedCallerUtils.CALLABLE_REGION_TAG, callableRegion.toString());
    }

    output.add(new SAMRecordToGATKReadAdapter(record));
}
 
Example #11
Source File: PlatformUnitReadFilter.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
private String getPlatformUnit( final GATKRead read ) {
    final String pu_attr = read.getAttributeAsString(SAMTag.PU.name());
    if ( pu_attr != null ) {
        return pu_attr;
    }

    return ReadUtils.getPlatformUnit(read, samHeader);
}
 
Example #12
Source File: HitsForInsert.java    From picard with MIT License 5 votes vote down vote up
public int compare(final SAMRecord rec1, final SAMRecord rec2) {
    final Integer hi1 = rec1.getIntegerAttribute(SAMTag.HI.name());
    final Integer hi2 = rec2.getIntegerAttribute(SAMTag.HI.name());
    if (hi1 == null) {
        if (hi2 == null) return 0;
        else return 1;
    } else if (hi2 == null) {
        return -1;
    } else {
        return hi1.compareTo(hi2);
    }
}
 
Example #13
Source File: DuplicateSetWalker.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
/***
 * FGBio GroupByUMI returns reads sorted by molecule ID: For example, the input bam may look like
 * read1: ... MI:Z:0/A ...
 * read2: ... MI:Z:0/A ...
 * read3: ... MI:Z:0/B ...
 * read4: ... MI:Z:0/B ...
 * read5: ... MI:Z:1/A ...
 * read6: ... MI:Z:1/B ...
 * read7: ... MI:Z:1/B ...
 *
 * Thus it's sufficient to go through the reads in order and collect them in a list until
 * we encounter the next molecule ID, at which point we pass the list to the {@code apply} method,
 * process the set based on the child class's implementation of the method, and clear the {@code currentDuplicateSet} variable and start collecting reads again.
 *
 * Notice there are two apply() methods in this class:
 * This apply() inherited from ReadWalker is marked final to discourage subclassing.
 * A subclass must override the other apply() method that takes in the DuplicateSet.
 */
@Override
public final void apply(GATKRead read, ReferenceContext referenceContext, FeatureContext featureContext) {
    if (currentReadsWithSameUMI == null){ // evaluates to true for the very first read
        currentReadsWithSameUMI = new ReadsWithSameUMI(read);
        return;
    }

    final int readMoleculeNumber = MoleculeID.getMoleculeNumberOfRead(read);
    final int duplicateSetMoleculeNumber = currentReadsWithSameUMI.getMoleculeNumber();

    // If the incoming read has the molecule id less than that of the currentDuplicateSet,
    // the input bam is not sorted properly by the MI tag
    if (duplicateSetMoleculeNumber > readMoleculeNumber){
        throw new UserException(String.format("The input bam must be sorted by the molecule ID (%s) tag.", SAMTag.MI.name()));
    }

    if (duplicateSetMoleculeNumber < readMoleculeNumber) {
        // The incoming read's molecule ID is greater than that of the current set, meaning we've reached the end of the current set.
        if (rejectSet(currentReadsWithSameUMI)){
            currentReadsWithSameUMI = new ReadsWithSameUMI(read);
            return;
        }

        // Call the apply() method to process the current set and start a new set.
        apply(currentReadsWithSameUMI,
                new ReferenceContext(reference, currentReadsWithSameUMI.getInterval()), // Will create an empty ReferenceContext if reference or readInterval == null
                new FeatureContext(features, currentReadsWithSameUMI.getInterval()));
        currentReadsWithSameUMI = new ReadsWithSameUMI(read);
        return;
    }

    // The incoming read has the same UMI as the current set; simply add to the current set
    currentReadsWithSameUMI.addRead(read);
}
 
Example #14
Source File: ReadQueryNameComparatorUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@DataProvider
public Iterator<Object[]> getReads(){
    final GATKRead differentName = getRead(NAME+NAME);

    final GATKRead unpaired = getRead(NAME);
    unpaired.setIsPaired(false);

    final GATKRead paired = getRead(NAME);
    paired.setIsPaired(true);

    final GATKRead firstOfPair = getRead(NAME);
    firstOfPair.setIsFirstOfPair();

    final GATKRead secondOfPair = getRead(NAME);
    secondOfPair.setIsSecondOfPair();

    final GATKRead reverseStrand = getRead(NAME);
    reverseStrand.setIsReverseStrand(true);

    final GATKRead supplementary = getRead(NAME);
    supplementary.setIsSupplementaryAlignment(true);

    final GATKRead secondary = getRead(NAME);
    secondary.setIsSecondaryAlignment(true);

    final GATKRead tagHI1 = getRead(NAME);
    tagHI1.setAttribute(SAMTag.HI.name(), 1);

    final GATKRead tagHI2 = getRead(NAME);
    tagHI2.setAttribute(SAMTag.HI.name(), 2);

    List<GATKRead> reads = Arrays.asList(differentName, unpaired, paired, firstOfPair, secondOfPair, reverseStrand, supplementary, secondary, tagHI1, tagHI2);
    List<Object[]> tests = new ArrayList<>();
    for(GATKRead left: reads){
        for(GATKRead right: reads){
            tests.add(new Object[]{left, right});
        }
    };
    return tests.iterator();
}
 
Example #15
Source File: AddOATagTest.java    From picard with MIT License 5 votes vote down vote up
private static void validateOATag(final File testSam, final File truthSam) throws IOException {
    final ArrayList<String> truthOAValues = new ArrayList<>();
    final ArrayList<String> testOAValues = new ArrayList<>();

    try(SamReader readerPre = SamReaderFactory.makeDefault().open(truthSam)) {
        readerPre.forEach(rec -> truthOAValues.add(rec.getStringAttribute(SAMTag.OA.name())));
    }

    try (SamReader readerPost = SamReaderFactory.makeDefault().open(testSam)) {
        readerPost.forEach(rec -> testOAValues.add(rec.getStringAttribute(SAMTag.OA.name())));
    }

    Assert.assertEquals(testOAValues, truthOAValues);
}
 
Example #16
Source File: ReadsWithSameUMIUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
public static List<GATKRead> makeReads(final int numReadPairs, final String contig, final int moleculeNumber,
                                       final String readName){
    final int readLength = 146;
    final List<GATKRead> reads = new ArrayList<>(numReadPairs);
    final List<GATKRead> mates = new ArrayList<>(numReadPairs);

    for (int i = 0; i < numReadPairs; i++){
        // Forward read
        final int readStart = 3000;
        final String readStrand = i % 2 == 0 ? "A" : "B";
        final MoleculeID id = new MoleculeID(moleculeNumber, readStrand);
        final GATKRead read = ArtificialReadUtils.createSamBackedRead(readName + i, contig, readStart, readLength);
        read.setAttribute(SAMTag.MI.name(), id.getSAMField());
        read.setIsReverseStrand(false);

        // Reverse read
        final int insertSize = 1000;
        final int mateStart = readStart + insertSize - readLength;
        final GATKRead mate = ArtificialReadUtils.createSamBackedRead(readName + "_mate" + i, contig, mateStart, readLength);
        final MoleculeID mateId = new MoleculeID(moleculeNumber, readStrand == "A" ? "B" : "A");
        mate.setAttribute(SAMTag.MI.name(), mateId.getSAMField());
        mate.setIsReverseStrand(true);

        if (readStrand.equals("A")){
            read.setIsFirstOfPair(); // F1R2
            mate.setIsSecondOfPair();
        } else {
            read.setIsSecondOfPair(); // F2R1
            mate.setIsFirstOfPair();
        }

        reads.add(read);
        mates.add(mate);
    }

    final List<GATKRead> results = new ArrayList<>(numReadPairs*2);
    results.addAll(reads);
    results.addAll(mates); // Add reads and mates in order so that the resulting list is ordered by coordinate
    return results;
}
 
Example #17
Source File: MoleculeIDUnitTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test
public void testBasicOperations(){
    final int moleculeNumber = 131;
    final String strand = "A";
    final String samField = moleculeNumber + "/" + strand;
    final GATKRead read = ArtificialReadUtils.createSamBackedRead( "read", "3", 1000, 1151);
    read.setAttribute(SAMTag.MI.name(), samField);
    final MoleculeID id = new MoleculeID(read);
    Assert.assertEquals(id.getMoleculeNumber(), moleculeNumber);
    Assert.assertEquals(id.getStrand(), strand);
    Assert.assertEquals(id.getSAMField(), samField);
}
 
Example #18
Source File: MergeBamAlignmentTest.java    From picard with MIT License 5 votes vote down vote up
private SAMRecord makeRead(final SAMFileHeader alignedHeader, final SAMRecord unmappedRec, final HitSpec hitSpec, final int hitIndex) {
    final SAMRecord rec = new SAMRecord(alignedHeader);
    rec.setReadName(unmappedRec.getReadName());
    rec.setReadBases(unmappedRec.getReadBases());
    rec.setBaseQualities(unmappedRec.getBaseQualities());
    rec.setMappingQuality(hitSpec.mapq);
    if (!hitSpec.primary) rec.setNotPrimaryAlignmentFlag(true);
    final Cigar cigar = new Cigar();
    final int readLength = rec.getReadLength();
    if (hitSpec.filtered) {
        // Add two insertions so alignment is filtered.
        cigar.add(new CigarElement(readLength-4, CigarOperator.M));
        cigar.add(new CigarElement(1, CigarOperator.I));
        cigar.add(new CigarElement(1, CigarOperator.M));
        cigar.add(new CigarElement(1, CigarOperator.I));
        cigar.add(new CigarElement(1, CigarOperator.M));
    } else {
        cigar.add(new CigarElement(readLength, CigarOperator.M));
    }
    rec.setCigar(cigar);

    rec.setReferenceName(bigSequenceName);
    rec.setAttribute(SAMTag.HI.name(), hitIndex);
    rec.setAlignmentStart(hitIndex + 1);

    return rec;
}
 
Example #19
Source File: QualityScoreDistributionSparkIntegrationTest.java    From gatk with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
@Test(groups = {"R", "spark"})
public void testAccumulators() throws Exception {
    final long[] qs= new long[128];
    final long[] oqs= new long[128];

    final Counts counts = new Counts(false);
    final long[] initQs = counts.getQualCounts();
    final long[] initOQs = counts.getOrigQualCounts();
    Assert.assertEquals(initQs, qs);
    Assert.assertEquals(initOQs, oqs);

    final GATKRead read1 = ArtificialReadUtils.createArtificialRead("aa".getBytes(), new byte[]{50, 50}, "2M");
    read1.setAttribute(SAMTag.OQ.name(), SAMUtils.phredToFastq(new byte[]{30, 40}));
    counts.addRead(read1);

    qs[50]+=2;//there are two bases in the read with a quality score of 50
    oqs[30]+=1;
    oqs[40]+=1;
    final long[] oneQs = counts.getQualCounts();
    final long[] oneOQs = counts.getOrigQualCounts();
    Assert.assertEquals(oneQs, qs);
    Assert.assertEquals(oneOQs, oqs);

    final Counts counts2 = new Counts(false);

    final GATKRead read2 = ArtificialReadUtils.createArtificialRead("aa".getBytes(), new byte[]{51, 51}, "2M");
    read2.setAttribute(SAMTag.OQ.name(), SAMUtils.phredToFastq(new byte[]{31, 41}));
    counts2.addRead(read2);

    counts.merge(counts2);
    qs[51]+=2;//there are two bases in the read with a quality score of 51
    oqs[31]+=1;  //new read OQ
    oqs[41]+=1;  //new read OQ
    final long[] mergedQs = counts.getQualCounts();
    final long[] mergedOQs = counts.getOrigQualCounts();
    Assert.assertEquals(mergedQs, qs);
    Assert.assertEquals(mergedOQs, oqs);
}
 
Example #20
Source File: BamMergeReducer.java    From halvade with GNU General Public License v3.0 5 votes vote down vote up
@Override
protected void reduce(ChromosomeRegion key, Iterable<SAMRecordWritable> values, Context context) throws IOException, InterruptedException {
    Iterator<SAMRecordWritable> it = values.iterator();
    SAMRecord sam = null;
    while(it.hasNext()) {
        sam = it.next().get();
        sam.setAttribute(SAMTag.RG.name(), RGID);
        samWritable.set(sam);
        recordWriter.write(outKey, samWritable);
                
    }
}
 
Example #21
Source File: PreprocessingTools.java    From halvade with GNU General Public License v3.0 5 votes vote down vote up
public int streamElPrep(Reducer.Context context, String output, String rg, 
            int threads, SAMRecordIterator SAMit, 
            SAMFileHeader header, String dictFile, boolean updateRG, boolean keepDups, String RGID) throws InterruptedException, IOException, QualityException {
        long startTime = System.currentTimeMillis();
        String customArgs = HalvadeConf.getCustomArgs(context.getConfiguration(), "elprep", "");  
        String[] command = CommandGenerator.elPrep(bin, "/dev/stdin", output, threads, true, rg, null, !keepDups, customArgs);
//        runProcessAndWait(command);
        ProcessBuilderWrapper builder = new ProcessBuilderWrapper(command, null);
        builder.startProcess(true);        
        BufferedWriter localWriter = builder.getSTDINWriter();
        
        // write header
        final StringWriter headerTextBuffer = new StringWriter();
        new SAMTextHeaderCodec().encode(headerTextBuffer, header);
        final String headerText = headerTextBuffer.toString();
        localWriter.write(headerText, 0, headerText.length());
        
        
        SAMRecord sam;
        int reads = 0;
        while(SAMit.hasNext()) {
            sam = SAMit.next();
            if(updateRG)
                sam.setAttribute(SAMTag.RG.name(), RGID);
            String samString = sam.getSAMString();
            localWriter.write(samString, 0, samString.length());
            reads++;
        }
        localWriter.flush();
        localWriter.close();
                
        int error = builder.waitForCompletion();
        if(error != 0)
            throw new ProcessException("elPrep", error);
        long estimatedTime = System.currentTimeMillis() - startTime;
        Logger.DEBUG("estimated time: " + estimatedTime / 1000);
        if(context != null)
            context.getCounter(HalvadeCounters.TIME_ELPREP).increment(estimatedTime);
        return reads;
    }
 
Example #22
Source File: AlignmentsTags.java    From cramtools with Apache License 2.0 4 votes vote down vote up
/**
 * Same as {@link htsjdk.samtools.util.SequenceUtil#calculateMdAndNmTags}
 * but allows for reference excerpt instead of a full reference sequence.
 * 
 * @param record
 *            SAMRecord to inject NM and MD tags into
 * @param ref
 *            reference sequence bases, may be a subsequence starting at
 *            refOffest
 * @param refOffset
 *            array offset of the reference bases, 0 is the same as the
 *            whole sequence. This value will be subtracted from the
 *            reference array index in calculations.
 * @param calcMD
 *            calculate MD tag if true
 * @param calcNM
 *            calculate NM tag if true
 */
public static void calculateMdAndNmTags(final SAMRecord record, final byte[] ref, final int refOffset,
		final boolean calcMD, final boolean calcNM) {
	if (!calcMD && !calcNM)
		return;

	final Cigar cigar = record.getCigar();
	final List<CigarElement> cigarElements = cigar.getCigarElements();
	final byte[] seq = record.getReadBases();
	final int start = record.getAlignmentStart() - 1;
	int i, x, y, u = 0;
	int nm = 0;
	final StringBuilder str = new StringBuilder();

	final int size = cigarElements.size();
	for (i = y = 0, x = start; i < size; ++i) {
		final CigarElement ce = cigarElements.get(i);
		int j;
		final int length = ce.getLength();
		final CigarOperator op = ce.getOperator();
		if (op == CigarOperator.MATCH_OR_MISMATCH || op == CigarOperator.EQ || op == CigarOperator.X) {
			for (j = 0; j < length; ++j) {
				final int z = y + j;

				if (refOffset + ref.length <= x + j)
					break; // out of boundary

				int c1 = 0;
				int c2 = 0;
				// try {
				c1 = seq[z];
				c2 = ref[x + j - refOffset];

				if ((c1 == c2) || c1 == 0) {
					// a match
					++u;
				} else {
					str.append(u);
					str.appendCodePoint(ref[x + j - refOffset]);
					u = 0;
					++nm;
				}
			}
			if (j < length)
				break;
			x += length;
			y += length;
		} else if (op == CigarOperator.DELETION) {
			str.append(u);
			str.append('^');
			for (j = 0; j < length; ++j) {
				if (ref[x + j - refOffset] == 0)
					break;
				str.appendCodePoint(ref[x + j - refOffset]);
			}
			u = 0;
			if (j < length)
				break;
			x += length;
			nm += length;
		} else if (op == CigarOperator.INSERTION || op == CigarOperator.SOFT_CLIP) {
			y += length;
			if (op == CigarOperator.INSERTION)
				nm += length;
		} else if (op == CigarOperator.SKIPPED_REGION) {
			x += length;
		}
	}
	str.append(u);

	if (calcMD)
		record.setAttribute(SAMTag.MD.name(), str.toString());
	if (calcNM)
		record.setAttribute(SAMTag.NM.name(), nm);
}
 
Example #23
Source File: Utils.java    From cramtools with Apache License 2.0 4 votes vote down vote up
/**
 * Copied from net.sf.picard.sam.SamPairUtil. This is a more permissive
 * version of the method, which does not reset alignment start and reference
 * for unmapped reads.
 * 
 * @param rec1
 * @param rec2
 * @param header
 */
public static void setLooseMateInfo(final SAMRecord rec1, final SAMRecord rec2, final SAMFileHeader header) {
	if (rec1.getReferenceName() != SAMRecord.NO_ALIGNMENT_REFERENCE_NAME
			&& rec1.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX)
		rec1.setReferenceIndex(header.getSequenceIndex(rec1.getReferenceName()));
	if (rec2.getReferenceName() != SAMRecord.NO_ALIGNMENT_REFERENCE_NAME
			&& rec2.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX)
		rec2.setReferenceIndex(header.getSequenceIndex(rec2.getReferenceName()));

	// If neither read is unmapped just set their mate info
	if (!rec1.getReadUnmappedFlag() && !rec2.getReadUnmappedFlag()) {

		rec1.setMateReferenceIndex(rec2.getReferenceIndex());
		rec1.setMateAlignmentStart(rec2.getAlignmentStart());
		rec1.setMateNegativeStrandFlag(rec2.getReadNegativeStrandFlag());
		rec1.setMateUnmappedFlag(false);
		rec1.setAttribute(SAMTag.MQ.name(), rec2.getMappingQuality());

		rec2.setMateReferenceIndex(rec1.getReferenceIndex());
		rec2.setMateAlignmentStart(rec1.getAlignmentStart());
		rec2.setMateNegativeStrandFlag(rec1.getReadNegativeStrandFlag());
		rec2.setMateUnmappedFlag(false);
		rec2.setAttribute(SAMTag.MQ.name(), rec1.getMappingQuality());
	}
	// Else if they're both unmapped set that straight
	else if (rec1.getReadUnmappedFlag() && rec2.getReadUnmappedFlag()) {
		rec1.setMateNegativeStrandFlag(rec2.getReadNegativeStrandFlag());
		rec1.setMateUnmappedFlag(true);
		rec1.setAttribute(SAMTag.MQ.name(), null);
		rec1.setInferredInsertSize(0);

		rec2.setMateNegativeStrandFlag(rec1.getReadNegativeStrandFlag());
		rec2.setMateUnmappedFlag(true);
		rec2.setAttribute(SAMTag.MQ.name(), null);
		rec2.setInferredInsertSize(0);
	}
	// And if only one is mapped copy it's coordinate information to the
	// mate
	else {
		final SAMRecord mapped = rec1.getReadUnmappedFlag() ? rec2 : rec1;
		final SAMRecord unmapped = rec1.getReadUnmappedFlag() ? rec1 : rec2;

		mapped.setMateReferenceIndex(unmapped.getReferenceIndex());
		mapped.setMateAlignmentStart(unmapped.getAlignmentStart());
		mapped.setMateNegativeStrandFlag(unmapped.getReadNegativeStrandFlag());
		mapped.setMateUnmappedFlag(true);
		mapped.setInferredInsertSize(0);

		unmapped.setMateReferenceIndex(mapped.getReferenceIndex());
		unmapped.setMateAlignmentStart(mapped.getAlignmentStart());
		unmapped.setMateNegativeStrandFlag(mapped.getReadNegativeStrandFlag());
		unmapped.setMateUnmappedFlag(false);
		unmapped.setInferredInsertSize(0);
	}

	boolean firstIsFirst = rec1.getAlignmentStart() < rec2.getAlignmentStart();
	int insertSize = firstIsFirst ? SamPairUtil.computeInsertSize(rec1, rec2) : SamPairUtil.computeInsertSize(rec2,
			rec1);

	rec1.setInferredInsertSize(firstIsFirst ? insertSize : -insertSize);
	rec2.setInferredInsertSize(firstIsFirst ? -insertSize : insertSize);

}
 
Example #24
Source File: Evidence2SAM.java    From cramtools with Apache License 2.0 4 votes vote down vote up
public static void main(String[] args) throws IOException, ParseException {
	EvidenceRecordFileIterator iterator = new EvidenceRecordFileIterator(new File(args[0]));
	Read context = new Read();
	SAMFileHeader header = new SAMFileHeader();
	header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
	SAMSequenceRecord samSequenceRecord = new SAMSequenceRecord("chr10", 135534747);
	samSequenceRecord.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, iterator.assembly_ID);
	String readGroup = String.format("%s-%s", iterator.assembly_ID, iterator.chromosome);
	SAMReadGroupRecord readGroupRecord = new SAMReadGroupRecord(readGroup);
	readGroupRecord.setAttribute(SAMReadGroupRecord.READ_GROUP_SAMPLE_TAG, iterator.sample);
	readGroupRecord.setAttribute(SAMReadGroupRecord.PLATFORM_UNIT_TAG, readGroup);
	readGroupRecord.setAttribute(SAMReadGroupRecord.SEQUENCING_CENTER_TAG, "\"Complete Genomics\"");
	Date date = new SimpleDateFormat("yyyy-MMM-dd hh:mm:ss.S").parse(iterator.generatedAt);
	readGroupRecord.setAttribute(SAMReadGroupRecord.DATE_RUN_PRODUCED_TAG,
			new SimpleDateFormat("yyyy-MM-dd").format(date));
	readGroupRecord.setAttribute(SAMReadGroupRecord.PLATFORM_TAG, "\"Complete Genomics\"");
	header.addReadGroup(readGroupRecord);

	header.addSequence(samSequenceRecord);

	SAMFileWriterFactory f = new SAMFileWriterFactory();
	SAMFileWriter samWriter;
	if (args.length > 1)
		samWriter = f.makeBAMWriter(header, false, new File(args[1]));
	else
		samWriter = f.makeSAMWriter(header, false, System.out);
	int i = 0;
	long time = System.currentTimeMillis();
	DedupIterator dedupIt = new DedupIterator(iterator);

	while (dedupIt.hasNext()) {
		EvidenceRecord evidenceRecord = dedupIt.next();
		if (evidenceRecord == null)
			throw new RuntimeException();
		try {
			context.reset(evidenceRecord);
			context.parse();
		} catch (Exception e) {
			System.err.println("Failed on line:");
			System.err.println(evidenceRecord.line);
			throw new RuntimeException(e);
		}

		SAMRecord[] samRecords = context.toSAMRecord(header);
		for (SAMRecord samRecord : samRecords) {
			samRecord.setAttribute(SAMTag.RG.name(), readGroup);
			samWriter.addAlignment(samRecord);
		}

		i++;
		if (i % 1000 == 0) {
			if (System.currentTimeMillis() - time > 10 * 1000) {
				time = System.currentTimeMillis();
				System.err.println(i);
			}
		}

		if (i > 10000)
			break;
	}
	samWriter.close();
}
 
Example #25
Source File: Utils.java    From cramtools with Apache License 2.0 4 votes vote down vote up
/**
 * A rip off samtools bam_md.c
 * 
 * @param record
 * @param ref
 * @param calcMD
 * @param calcNM
 */
public static void calculateMdAndNmTags(SAMRecord record, byte[] ref, boolean calcMD, boolean calcNM) {
	if (!calcMD && !calcNM)
		return;

	Cigar cigar = record.getCigar();
	List<CigarElement> cigarElements = cigar.getCigarElements();
	byte[] seq = record.getReadBases();
	int start = record.getAlignmentStart() - 1;
	int i, x, y, u = 0;
	int nm = 0;
	StringBuffer str = new StringBuffer();

	int size = cigarElements.size();
	for (i = y = 0, x = start; i < size; ++i) {
		CigarElement ce = cigarElements.get(i);
		int j, l = ce.getLength();
		CigarOperator op = ce.getOperator();
		if (op == CigarOperator.MATCH_OR_MISMATCH || op == CigarOperator.EQ || op == CigarOperator.X) {
			for (j = 0; j < l; ++j) {
				int z = y + j;

				if (ref.length <= x + j)
					break; // out of boundary

				int c1 = 0;
				int c2 = 0;
				// try {
				c1 = seq[z];
				c2 = ref[x + j];

				if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) {
					// a match
					++u;
				} else {
					str.append(u);
					str.appendCodePoint(ref[x + j]);
					u = 0;
					++nm;
				}
			}
			if (j < l)
				break;
			x += l;
			y += l;
		} else if (op == CigarOperator.DELETION) {
			str.append(u);
			str.append('^');
			for (j = 0; j < l; ++j) {
				if (ref[x + j] == 0)
					break;
				str.appendCodePoint(ref[x + j]);
			}
			u = 0;
			if (j < l)
				break;
			x += l;
			nm += l;
		} else if (op == CigarOperator.INSERTION || op == CigarOperator.SOFT_CLIP) {
			y += l;
			if (op == CigarOperator.INSERTION)
				nm += l;
		} else if (op == CigarOperator.SKIPPED_REGION) {
			x += l;
		}
	}
	str.append(u);

	if (calcMD)
		record.setAttribute(SAMTag.MD.name(), str.toString());
	if (calcNM)
		record.setAttribute(SAMTag.NM.name(), nm);
}
 
Example #26
Source File: UMIIterator.java    From Drop-seq with MIT License 4 votes vote down vote up
public CellBarcodeRecorder(Iterator<SAMRecord> underlyingIterator) {
	this.underlyingIterator = underlyingIterator;
	cellBarcodeBinaryTag = SAMTag.makeBinaryTag(cellBarcodeTag);
}
 
Example #27
Source File: MoleculeID.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/** Extracts the strand portion of the {@link SAMTag.MI} field of the read **/
public static String getStrandOfRead(final GATKRead read){
    final String MITag = read.getAttributeAsString(SAMTag.MI.name());
    return MITag.split(ReadsWithSameUMI.FGBIO_MI_TAG_DELIMITER)[1];
}
 
Example #28
Source File: MoleculeID.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/** Extracts the molecule number portion of the {@link SAMTag.MI} field of the read **/
public static int getMoleculeNumberOfRead(final GATKRead read){
    final String MITag = read.getAttributeAsString(SAMTag.MI.name());
    return Integer.parseInt(MITag.split(ReadsWithSameUMI.FGBIO_MI_TAG_DELIMITER)[0]);
}
 
Example #29
Source File: HitsForInsert.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Some alignment strategies expect to receive alignments for ends that are coordinated by
 * hit index (HI) tag.  This method lines up alignments for each end by HI tag value, and if there is
 * no corresponding alignment for an alignment, there is a null in the array at that slot.
 *
 * This method then renumbers the HI values so that they start at zero and have no gaps, because some
 * reads may have been filtered out.
 */
public void coordinateByHitIndex() {
    // Sort by HI value, with reads with no HI going at the end.
    Collections.sort(firstOfPairOrFragment, comparator);
    Collections.sort(secondOfPair, comparator);

    // Insert nulls as necessary in the two lists so that correlated alignments have the same index
    // and uncorrelated alignments have null in the other list at the corresponding index.
    for (int i = 0; i < Math.min(firstOfPairOrFragment.size(), secondOfPair.size()); ++i) {
        final Integer leftHi = firstOfPairOrFragment.get(i).getIntegerAttribute(SAMTag.HI.name());
        final Integer rightHi = secondOfPair.get(i).getIntegerAttribute(SAMTag.HI.name());
        if (leftHi != null) {
            if (rightHi != null) {
                if (leftHi < rightHi) secondOfPair.add(i, null);
                else if (rightHi < leftHi) firstOfPairOrFragment.add(i, null);
                // else the are correlated
            }
        } else if (rightHi != null) {
            firstOfPairOrFragment.add(i, null);
        } else {
            // Both alignments do not have HI, so push down the ones on the right.
            // Right is arbitrarily picked to push down.
            secondOfPair.add(i, null);
        }
    }

    // Now renumber any correlated alignments, and remove hit index if no correlated read.
    int hi = 0;
    for (int i = 0; i < numHits(); ++i) {
        final SAMRecord first = getFirstOfPair(i);
        final SAMRecord second = getSecondOfPair(i);
        if (first != null && second != null) {
            first.setAttribute(SAMTag.HI.name(), i);
            second.setAttribute(SAMTag.HI.name(), i);
            ++hi;
        } else if (first != null) {
            first.setAttribute(SAMTag.HI.name(), null);
        } else {
            second.setAttribute(SAMTag.HI.name(), null);
        }
    }
}
 
Example #30
Source File: BQSRReadTransformer.java    From gatk with BSD 3-Clause "New" or "Revised" License 4 votes vote down vote up
/**
 * Recalibrates the base qualities of a read
 * <p>
 * It updates the base qualities of the read with the new recalibrated qualities (for all event types)
 * <p>
 * Implements a serial recalibration of the reads using the combinational table.
 * First, we perform a positional recalibration, and then a subsequent dinuc correction.
 * <p>
 * Given the full recalibration table, we perform the following preprocessing steps:
 * <p>
 * - calculate the global quality score shift across all data [DeltaQ]
 * - calculate for each of cycle and dinuc the shift of the quality scores relative to the global shift
 * -- i.e., DeltaQ(dinuc) = Sum(pos) Sum(Qual) Qempirical(pos, qual, dinuc) - Qreported(pos, qual, dinuc) / Npos * Nqual
 * - The final shift equation is:
 * <p>
 * Qrecal = Qreported + DeltaQ + DeltaQ(pos) + DeltaQ(dinuc) + DeltaQ( ... any other covariate ... )
 *
 * @param originalRead the read to recalibrate
 */
@Override
public GATKRead apply(final GATKRead originalRead) {
    final GATKRead read = useOriginalBaseQualities ? ReadUtils.resetOriginalBaseQualities(originalRead) : originalRead;

    if (emitOriginalQuals && ! read.hasAttribute(SAMTag.OQ.name())) { // Save the old qualities if the tag isn't already taken in the read
        try {
            read.setAttribute(SAMTag.OQ.name(), SAMUtils.phredToFastq(read.getBaseQualities()));
        } catch (final IllegalArgumentException e) {
            throw new MalformedRead(read, "illegal base quality encountered; " + e.getMessage());
        }
    }

    final ReadCovariates readCovariates = RecalUtils.computeCovariates(read, header, covariates, false, keyCache);

    //clear indel qualities
    read.clearAttribute(ReadUtils.BQSR_BASE_INSERTION_QUALITIES);
    read.clearAttribute(ReadUtils.BQSR_BASE_DELETION_QUALITIES);

    // get the keyset for this base using the error model
    final int[][] fullReadKeySet = readCovariates.getKeySet(EventType.BASE_SUBSTITUTION);

    // the rg key is constant over the whole read, the global deltaQ is too
    final int rgKey = fullReadKeySet[0][0];

    final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get2Keys(rgKey, BASE_SUBSTITUTION_INDEX);

    if (empiricalQualRG == null) {
        return read;
    }
    final byte[] quals = read.getBaseQualities();

    final int readLength = quals.length;
    final double epsilon = globalQScorePrior > 0.0 ? globalQScorePrior : empiricalQualRG.getEstimatedQReported();

    final NestedIntegerArray<RecalDatum> qualityScoreTable = recalibrationTables.getQualityScoreTable();
    final List<Byte> quantizedQuals = quantizationInfo.getQuantizedQuals();

    //Note: this loop is under very heavy use in applyBQSR. Keep it slim.
    for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read

        // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality)
        if (quals[offset] < preserveQLessThan) {
            continue;
        }
        Arrays.fill(empiricalQualCovsArgs, null);  //clear the array
        final int[] keySet = fullReadKeySet[offset];


        final RecalDatum empiricalQualQS = qualityScoreTable.get3Keys(keySet[0], keySet[1], BASE_SUBSTITUTION_INDEX);

        for (int i = specialCovariateCount; i < totalCovariateCount; i++) {
            if (keySet[i] >= 0) {
                empiricalQualCovsArgs[i - specialCovariateCount] = recalibrationTables.getTable(i).get4Keys(keySet[0], keySet[1], keySet[i], BASE_SUBSTITUTION_INDEX);
            }
        }
        final double recalibratedQualDouble = hierarchicalBayesianQualityEstimate(epsilon, empiricalQualRG, empiricalQualQS, empiricalQualCovsArgs);

        final byte recalibratedQualityScore = quantizedQuals.get(getRecalibratedQual(recalibratedQualDouble));

        // Bin to static quals
        quals[offset] = staticQuantizedMapping == null ? recalibratedQualityScore : staticQuantizedMapping[recalibratedQualityScore];
    }
    read.setBaseQualities(quals);
    return read;
}