package org.broadinstitute.hellbender.engine.filters;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.CigarElement;
import htsjdk.samtools.CigarOperator;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions;
import org.broadinstitute.hellbender.utils.help.HelpConstants;
import org.broadinstitute.hellbender.utils.read.GATKRead;

/**
 * Filter out reads where the number of bases without soft-clips (M, I, X, and = CIGAR operators) is lower than a threshold.
 *
 * <p>This filter is intended to filter out reads that are potentially from foreign organisms.
 * From experience with sequencing of human DNA we have found cases of contamination by bacterial
 * organisms; the symptoms of such contamination are a class of reads with only a small number
 * of aligned bases and additionally many soft-clipped bases. This filter is intended
 * to remove such reads.</p>
 *
 * <p>Note: Consecutive soft-clipped blocks are treated as a single block. For example, 1S2S10M1S2S is treated as 3S10M3S</p>
 */
@DocumentedFeature(groupName= HelpConstants.DOC_CAT_READFILTERS, groupSummary=HelpConstants.DOC_CAT_READFILTERS_SUMMARY, summary = "Filter out reads that are over-soft-clipped")
public final class OverclippedReadFilter extends ReadFilter {
    static final long serialVersionUID = 1L;
    private final Logger logger = LogManager.getLogger(this.getClass());

    @Argument(fullName = ReadFilterArgumentDefinitions.FILTER_TOO_SHORT_NAME,
            doc = "Minimum number of aligned bases",
            optional = true)
    private int minAlignedBases = 30;

    @Argument(fullName = ReadFilterArgumentDefinitions.DONT_REQUIRE_SOFT_CLIPS_BOTH_ENDS_NAME,
            doc = "Allow a read to be filtered out based on having only 1 soft-clipped block. By default, both ends must " +
                    "have a soft-clipped block, setting this flag requires only 1 soft-clipped block",
            optional = true)
    private boolean doNotRequireSoftClipsOnBothEnds;

    // Command line parser requires a no-arg constructor
    public OverclippedReadFilter() {}

    @VisibleForTesting
    OverclippedReadFilter(final int minAlignedBases) {
        this.minAlignedBases = minAlignedBases;
    }

    @VisibleForTesting
    OverclippedReadFilter(final int minAlignedBases, final boolean doNotRequireSoftClipsOnBothEnds) {
        this.minAlignedBases = minAlignedBases;
        this.doNotRequireSoftClipsOnBothEnds = doNotRequireSoftClipsOnBothEnds;
    }

    @Override
    public boolean test(final GATKRead read) {
        int alignedLength = 0;
        int softClipBlocks = 0;
        final int minSoftClipBlocks = doNotRequireSoftClipsOnBothEnds ? 1 : 2;
        CigarOperator prevOperator = null;

        for ( final CigarElement element : read.getCigarElements() ) {
            if ( element.getOperator() == CigarOperator.S ) { // count total S blocks
                //Treat consecutive S blocks as a single one
                if(prevOperator != CigarOperator.S){
                    softClipBlocks += 1;
                }

            } else if ( element.getOperator().consumesReadBases() ) {   // M, I, X, and EQ (S was already accounted for above)
                alignedLength += element.getLength();
            }
            prevOperator = element.getOperator();
        }

        return (alignedLength >= minAlignedBases || softClipBlocks < minSoftClipBlocks);
    }
}