python source code of mcannotate

#!/usr/bin/python
import sys
import copy

import mirge.forgi.utilities.debug as fud

def parse_resid(mcann_resid):
    '''
    Read in an MC-Annotate formatted resid and return a tuple
    that can be used as an index into a Bio.PDB.Chain.
    '''
    parts = mcann_resid.split('.')
    if len(parts) == 1:
        return (' ', int(parts[0]), ' ')
    else:
        return (' ', int(parts[0]), parts[1])

def format_resid(pdb_resid):
    '''
    Convert a PDB.Chain.Residue id to an MC-Annotate formatted
    residue identifier.
    '''
    if pdb_resid[2] == ' ':
        return str(pdb_resid[1])
    else:
        return str(pdb_resid[1]) + "." + pdb_resid[2]

def parse_chain_base(chain_base):
    """
    Parse the string identifying a chain and a base in an MC-Annotate generated
    annotation file.

    As an example, the string 'A33' means residue 33 on chain 'A'.

    @param chain_base: The MC-Annotate identification string (i.e. 'A33')
    @return: A pair containing the chain id and residue number (i.e. ('A', 33))
    """
    if (ord(chain_base[0]) >= ord('A') and ord(chain_base[0]) <= ord('Z')) or (ord(chain_base[0]) >= ord('a') and ord(chain_base[0]) <= ord('z')):
        # normal string
        chain = chain_base[0]
        base = chain_base[1:]
    else:
        # quoted string (i.e. ''33'')
        if chain_base[0] == '\'':
            end_quote_idx = chain_base.find('\'', 1)
            chain = chain_base[1:end_quote_idx]
            base = chain_base[end_quote_idx+1:]
        else:
            # no chain identifier
            chain = ''
            base = chain_base

    return (chain, base)

def parse_base_pair_id(base_pair_id):
    """
    Separate the two chain/base identifiers present in the interaction section of
    an MC-Annotate output file.

    @param base_pair_id: The identifier string for the interacting nucleotides (i.e. 'A33-B45')
    @return: 4-tuple containing of the form (chain1, res1, chain2, res2) i.e. ('A', 33, 'B', '45')
    """

    parts = base_pair_id.split('-')
    
    if len(parts) > 2:
        raise Exception("Invalid interaction in the MC-Annotate file: %s" % base_pair_id)

    (from_chain, from_base) = parse_chain_base(parts[0].strip())
    (to_chain, to_base) = parse_chain_base(parts[1].strip())

    return (from_chain, from_base, to_chain, to_base)

def get_interacting_base_pairs(line):
    """
    Return the identification part of an interaction line of an MC-Annotate output file.

    @param line: The entire interaction line
    @return: The first part, containing the identifiers of the interacting entities (i.e. 'A33-B45')
    """
    line_parts = line.split(' ')
    return parse_base_pair_id(line_parts[0])

def iterate_over_residue_list(mcannotate_lines):
    """
    Generator function allowing the iteration over all of the lines in
    the 'Residue conformations' section of an MC-Annotate output file.

    @param mcannotate_lines: All of the lines of an MC-Annotate output file
    @return: Yield only lines in the 'Residue conformations' section
    """
    residue_conf_line = False
    for line in mcannotate_lines:
        if line.find('Residue conformations') == 0:
            residue_conf_line = True
            continue
        if line.find('Adjacent stacking') == 0:
            residue_conf_line = False
            continue
        if residue_conf_line:
            yield line

def iterate_over_interactions(mcannotate_lines):
    """
    Generator function for the iteration over lines in the 'Base-pairs' section of an
    MC-Annotate output file.

    @param mcannotate_lines: All of the lines of an MC-Annotate output file
    @return: Yield only lines in the 'Base-pairs' section
    """
    base_pair_line = False
    for line in mcannotate_lines:
        if line.find("Base-pairs ---") == 0:
            base_pair_line = True
            continue 
        if line.find("Residue conformations") == 0:
            base_pair_line = False
            continue
        if base_pair_line:
            try:
                (from_chain, from_base, to_chain, to_base) =  get_interacting_base_pairs(line)
            except ValueError as ve:
                print >> sys.stderr, "ValueError:", ve
                print >> sys.stderr, "line:", line
                continue

            yield line.strip()

# From:
# http://code.activestate.com/recipes/389639/
class DefaultDict(dict):
    """Dictionary with a default value for unknown keys."""
    def __init__(self, default):
        self.default = default

    def __getitem__(self, key):
        if key in self: 
            return self.get(key)
        else:
            ## Need copy in case self.default is something like []
            return self.setdefault(key, copy.deepcopy(self.default))

    def __copy__(self):
        copy = DefaultDict(self.default)
        copy.update(self)
        return copy

def get_dotplot(lines):
    """docstring for get_dotplot"""
    residues = []
    residue_types = []
    bps = DefaultDict(-1)
    output_str = ""

    for line in iterate_over_residue_list(lines):
        parts = line.split(' ')
        residues += [parts[0]]
        residue_types += [parts[2]]

    paired = set()
    for line in iterate_over_interactions(lines):
        parts = line.split(' ')
        bond_type = parts[3]
        #if bond_type.find('Ww/Ww') >= 0 or bond_type.find('Ww/Ws') >= 0 or bond_type.find('Ws/Ww') >= 0:
        if ((line.find('Ww/Ww') >= 0 and (line.find('A-U') >= 0 or
                                        line.find('U-A') >= 0 or
                                        line.find('C-G') >= 0 or
                                        line.find('G-C') >= 0)) or
           (line.find('Ws/Ww') >= 0 and line.find('U-G') >= 0) or
            (line.find('Ww/Ws') >= 0 and line.find('G-U') >= 0)):
            #if bond_type.find('Ww/Ww') >= 0:
            parts1 = parts[0].split('-')
            #print line

            if parts1[0] in paired or parts1[1] in paired:
                print >>sys.stderr, "paired:", parts1[0], parts1[1]
                continue

            paired.add(parts1[0])
            paired.add(parts1[1])

            bps[parts1[0]] = residues.index(parts1[1])
            bps[parts1[1]] = residues.index(parts1[0])


    for i in range(len(residue_types)):
        output_str += "%d %s %s\n" % ( i+1, residue_types[i], bps[residues[i]]+1)

    return (output_str, residues)