python source code of cats2d

PyBioMed-master
- PyBioMed
  - PyMolecule
    - molproperty.py
    - AtomProperty.py
    - bcut.py
    - Crippen.txt
    - ghosecrippen.py
    - moe.py
    - moran.py
    - topology.py
    - geary.py
    - connectivity.py
    - moreaubroto.py
    - basak.py
    - PubChemFingerprints.py
    - cats2d.py
    - AtomTypes.py
    - constitution.py
    - charge.py
    - __init__.py
    - Scaffolds.py
    - MinimalFeatures.fdef
    - fingerprint.py
    - estate.py
    - kappa.py
  - PyProtein
    - AAIndex.py
    - ConjointTriad.py
    - PyProteinAAComposition.py
    - aaindex3
    - ProCheck.py
    - AAComposition.py
    - __init__.py
    - PyProteinAAIndex.py
    - GetSubSeq.py
    - Autocorrelation.py
    - QuasiSequenceOrder.py
    - PseudoAAC.py
    - CTD.py
    - GetProteinFromUniprot.py
    - PyProtein.py
  - test
    - test_PyMolecule.py
    - test_PyBioMed.py
    - test_PyProtein.py
    - test_data
      - drug.sdf
      - test.mol
      - example.fasta
      - neg.fasta
      - pos.fasta
      - drug.mol
      - drug.inchi
      - target.txt
      - protein.fasta
      - hs.fasta
      - drug.smi
    - test_PyGetMol.py
    - test_PyInteration.py
    - __init__.py
    - test_PyDNA.py
    - test_PyPretreat.py
    - test.py
    - 1atp.ent.gz
  - download
  - PyDNA
    - PyDNAacutil.py
    - PyDNApsenacutil.py
    - mmc4.data
    - PyDNAnacutil.py
    - PyDNAnac.py
    - PyDNAutil.py
    - mmc3.data
    - __init__.py
    - PyDNAac.py
    - PyDNApsenac.py
    - PyDNA.py
  - Pydna.py
  - example
    - dna
    - dpi
      - DPI_SMIs.xlsx
    - subcell
    - solubility
      - Solubility-total.xlsx
    - caco2
      - caco2.xlsx
  - Pyinteraction.py
  - __init__.py
  - Pymolecule.py
  - PyGetMol
    - GetProtein.py
    - 1f88.ent.gz
    - Getmol.py
    - GetDNA.py
    - __init__.py
    - 1efz.ent.gz
    - 1atp.ent.gz
  - PyInteraction
    - PyInteraction.py
    - __init__.py
  - Pyprotein.py
  - doc
    - download.rst
    - Makefile
    - make.bat
    - 1f88.ent.gz
    - overview.rst
    - _build
      - doctrees
        modules.doctree
        User_guide.doctree
        overview.doctree
        test.doctree
        download.doctree
        reference
        PyPretreatMolutil.doctree
        PyDNAacutil.doctree
        PyInteraction.doctree
        Autocorrelation.doctree
        test_PyProtein.doctree
        constitution.doctree
        moran.doctree
        PyPretreatMol.doctree
        GetProteinFromUniprot.doctree
        basak.doctree
        test2.doctree
        GetDNA.doctree
        Getmol.doctree
        test_PyBioMed.doctree
        AAIndex.doctree
        AAComposition.doctree
        estate.doctree
        PyPreTools.doctree
        PyPretreatPro.doctree
        PyDNAac.doctree
        AtomTypes.doctree
        ProCheck.doctree
        PyDNAnacutil.doctree
        cats2d.doctree
        PyInteraction_module.doctree
        PyPretreat.doctree
        kappa.doctree
        test_PyGetMol.doctree
        PyProteinAAIndex.doctree
        ghosecrippen.doctree
        test_PyMolecule.doctree
        PyDNApsenacutil.doctree
        PyDNApsenac.doctree
        Scaffolds.doctree
        PyDNA.doctree
        ConjointTriad.doctree
        PyProteinclass.doctree
        PyDNAnac.doctree
        test.doctree
        PyProteinAAComposition.doctree
        PyProtein.doctree
        fingerprint.doctree
        PseudoAAC.doctree
        moreaubroto.doctree
        PyPretreatDNA.doctree
        AtomProperty.doctree
        charge.doctree
        PubChemFingerprints.doctree
        moe.doctree
        PyMolecule.doctree
        test_PyPretreat.doctree
        molproperty.doctree
        test_PyInteration.doctree
        GetSubSeq.doctree
        bcut.doctree
        PyGetMol.doctree
        test_PyDNA.doctree
        GetProtein.doctree
        topology.doctree
        PyDNAutil.doctree
        geary.doctree
        CTD.doctree
        application.doctree
        index.doctree
      - html
        _sources
        application.txt
        index.txt
        download.txt
        test.txt
        modules.txt
        reference
        PseudoAAC.txt
        PyPretreatPro.txt
        PyProtein.txt
        PyDNAacutil.txt
        AAComposition.txt
        PyPreTools.txt
        GetSubSeq.txt
        test_PyDNA.txt
        test_PyProtein.txt
        PyGetMol.txt
        PyInteraction.txt
        AAIndex.txt
        Getmol.txt
        PyMolecule.txt
        PubChemFingerprints.txt
        basak.txt
        geary.txt
        estate.txt
        cats2d.txt
        GetProtein.txt
        GetProteinFromUniprot.txt
        test_PyGetMol.txt
        connectivity.txt
        PyPretreat.txt
        constitution.txt
        ProCheck.txt
        PyDNAutil.txt
        PyDNApsenac.txt
        test.txt
        PyProteinclass.txt
        kappa.txt
        PyDNA.txt
        moe.txt
        PyDNAac.txt
        Scaffolds.txt
        charge.txt
        AtomTypes.txt
        test_PyInteration.txt
        fingerprint.txt
        PyProteinAAComposition.txt
        test2.txt
        PyDNAnacutil.txt
        test_PyMolecule.txt
        PyInteraction_module.txt
        moran.txt
        topology.txt
        test_PyPretreat.txt
        AtomProperty.txt
        PyDNApsenacutil.txt
        PyProteinAAIndex.txt
        ghosecrippen.txt
        QuasiSequenceOrder.txt
        moreaubroto.txt
        bcut.txt
        PyPretreatDNA.txt
        ConjointTriad.txt
        PyPretreatMol.txt
        Autocorrelation.txt
        GetDNA.txt
        PyDNAnac.txt
        molproperty.txt
        test_PyBioMed.txt
        CTD.txt
        PyPretreatMolutil.txt
        overview.txt
        User_guide.txt
        searchindex.js
        _modules
        ghosecrippen.html
        PyProteinAAIndex.html
        AtomProperty.html
        ConjointTriad.html
        PyDNAacutil.html
        fingerprint.html
        moreaubroto.html
        PyDNAnac.html
        constitution.html
        moe.html
        GetProtein.html
        test_PyMolecule.html
        PyDNAnacutil.html
        geary.html
        AtomTypes.html
        bcut.html
        Scaffolds.html
        basak.html
        topology.html
        PyPretreatMol.html
        CTD.html
        GetProteinFromUniprot.html
        PyDNAac.html
        connectivity.html
        PyInteraction.html
        ProCheck.html
        estate.html
        test_PyPretreat.html
        Autocorrelation.html
        PyPretreatPro.html
        PyProtein.html
        cats2d.html
        moran.html
        GetDNA.html
        PyDNApsenacutil.html
        charge.html
        GetSubSeq.html
        test_PyGetMol.html
        PyPretreatDNA.html
        test_PyDNA.html
        molproperty.html
        test_PyProtein.html
        index.html
        PyProteinAAComposition.html
        QuasiSequenceOrder.html
        PyDNAutil.html
        test_PyInteration.html
        kappa.html
        test_PyBioMed.html
        Getmol.html
        AAComposition.html
        PyPreTools.html
        PyDNApsenac.html
        AAIndex.html
        PseudoAAC.html
        _images
        application.html
        test.html
        .buildinfo
        overview.html
        objects.inv
        User_guide.html
        genindex.html
        reference
        ghosecrippen.html
        PyProteinAAIndex.html
        PyGetMol.html
        AtomProperty.html
        ConjointTriad.html
        PyDNAacutil.html
        PyMolecule.html
        fingerprint.html
        PyInteraction_module.html
        moreaubroto.html
        PyDNAnac.html
        constitution.html
        moe.html
        GetProtein.html
        test_PyMolecule.html
        PyDNAnacutil.html
        geary.html
        AtomTypes.html
        bcut.html
        Scaffolds.html
        basak.html
        topology.html
        PyPretreatMol.html
        PubChemFingerprints.html
        PyPretreat.html
        CTD.html
        GetProteinFromUniprot.html
        test.html
        PyDNAac.html
        connectivity.html
        PyInteraction.html
        ProCheck.html
        estate.html
        test_PyPretreat.html
        Autocorrelation.html
        test2.html
        PyPretreatPro.html
        PyProtein.html
        cats2d.html
        PyProteinclass.html
        moran.html
        GetDNA.html
        PyDNApsenacutil.html
        PyPretreatMolutil.html
        charge.html
        GetSubSeq.html
        test_PyGetMol.html
        PyPretreatDNA.html
        PyDNA.html
        test_PyDNA.html
        molproperty.html
        test_PyProtein.html
        PyProteinAAComposition.html
        QuasiSequenceOrder.html
        PyDNAutil.html
        test_PyInteration.html
        kappa.html
        test_PyBioMed.html
        Getmol.html
        AAComposition.html
        PyPreTools.html
        PyDNApsenac.html
        AAIndex.html
        PseudoAAC.html
        py-modindex.html
        index.html
        search.html
        modules.html
        _static
        ajax-loader.gif
        jquery.js
        basic.css
        pygments.css
        searchtools.js
        sphinxdoc.css
        doctools.js
        websupport.js
        underscore-1.3.1.js
        underscore.js
        download.html
    - application.rst
    - reference
      - moreaubroto.rst
      - PubChemFingerprints.rst
      - PyDNAac.rst
      - ProCheck.rst
      - kappa.rst
      - ConjointTriad.rst
      - PyProteinAAComposition.rst
      - PyDNAacutil.rst
      - geary.rst
      - moe.rst
      - PyDNA.rst
      - AAIndex.rst
      - PyPretreatDNA.rst
      - PyDNAnacutil.rst
      - PyDNAutil.rst
      - moran.rst
      - PyMolecule.rst
      - ghosecrippen.rst
      - test2.rst
      - QuasiSequenceOrder.rst
      - AAComposition.rst
      - PyGetMol.rst
      - GetDNA.rst
      - bcut.rst
      - GetSubSeq.rst
      - molproperty.rst
      - PyProteinAAIndex.rst
      - PyDNAnac.rst
      - PyInteraction_module.rst
      - GetProtein.rst
      - PyProtein.rst
      - PyPretreat.rst
      - test_PyMolecule.rst
      - PyPretreatPro.rst
      - AtomProperty.rst
      - test_PyInteration.rst
      - PseudoAAC.rst
      - CTD.rst
      - constitution.rst
      - GetProteinFromUniprot.rst
      - test_PyPretreat.rst
      - Getmol.rst
      - PyDNApsenacutil.rst
      - fingerprint.rst
      - test_PyProtein.rst
      - PyInteraction.rst
      - PyPretreatMol.rst
      - topology.rst
      - test_PyDNA.rst
      - Scaffolds.rst
      - test_PyBioMed.rst
      - test_PyGetMol.rst
      - connectivity.rst
      - AtomTypes.rst
      - charge.rst
      - Autocorrelation.rst
      - basak.rst
      - PyProteinclass.rst
      - PyDNApsenac.rst
      - PyPretreatMolutil.rst
      - estate.rst
      - cats2d.rst
      - PyPreTools.rst
      - test.rst
    - modules.rst
    - index.rst
    - conf.py
    - Descriptor
    - image
    - User_guide.rst
    - test.rst
  - PyPretreat
    - PyPretreatPro.py
    - PyPreTools.py
    - PyDNAutil.py
    - PyPretreatMolutil.py
    - PyPretreatMol.py
    - __init__.py
    - PyPretreatDNA.py
- conda-env-38.yml
- conda-env-27.yml
- .pre-commit-config.yaml
- MANIFEST
- version.py
- .gitattributes
- setup.py
- setup.cfg
- .travis.yml
- README.md
- .isort.cfg
- .gitignore
- LICENSE.txt

# -*- coding: utf-8 -*-
#  Copyright (c) 2016-2017, Zhijiang Yao, Jie Dong and Dongsheng Cao
#  All rights reserved.
#  This file is part of the PyBioMed.
#  The contents are covered by the terms of the BSD license
#  which is included in the file license.txt, found at the root
#  of the PyBioMed source tree.
"""
# CATS2D  Potential Pharmacophore Point (PPP) definitions as described in
# Pharmacophores and Pharmacophore Searches 2006 (Eds. T. Langer and R.D. Hoffmann), Chapter 3:
# Alignment-free Pharmacophore Patterns - A Correlation-vector Approach.
# The last lipophilic pattern on page 55 of the book is realized as a graph search and not
# as a SMARTS search. Therefore, the list contains only two lipophilic SMARTS patterns.
# The format is tab separated and contains in the first column the PPP type (D = H-bond donor,
# A = H-bond acceptor, P = positive, N = negative, L = lipophilic). The second column of each entry
# contains the SMARTS pattern(s). The last entry is a description of the molecular feature

D	[OH]	Oxygen atom of an OH group
D	[#7H,#7H2]	Nitrogen atom of an NH or NH2 group
A	[O]	Oxygen atom
A	[#7H0]	Nitrogen atom not adjacent to a hydrogen atom
P	[*+]	atom with a positive charge
P	[#7H2]	Nitrogen atom of an NH2 group
N	[*-]	Atom with a negative charge
N	[C&D2&$(C(=O)O),P&D2&$(P(=O)O),S&D2&$(S(=O)O)] 	Carbon, sulfur or phosphorus atom of a COOH, SOOH or POOH group. This pattern is realized by a graph algorithm
L	[Cl,Br,I]	Chlorine, bromine, or iodine atom
L	[S;D2;$(S(C)(C))]	Sulfur atom adjacent to exactly two carbon atoms


Created on Thu Sep  1 20:13:38 2016

Authors: Zhijiang Yao and Dongsheng Cao.

Email: gadsby@163.com and oriental-cds@163.com

"""
# Standard library modules
###############################################################################
from collections import Counter

# Third party modules
###############################################################################
import numpy as np
import scipy
from rdkit import Chem

# Potential Pharmacophore Point (PPP) table: maps each one-letter PPP type to
# the SMARTS patterns whose matches are assigned that type.
#   D = H-bond donor, A = H-bond acceptor, P = positive, N = negative,
#   L = lipophilic
# The insertion order of the keys is significant: AssignAtomType iterates this
# dict, and that order carries through to the pair labels built by
# MatchAtomType.
# NOTE(review): the "N" entries are three separate patterns without the D2
# constraint that appears in the table in the module docstring - confirm
# which form is intended.
PPP = {
    "D": ["[OH]", "[#7H,#7H2]"],
    "A": ["[O]", "[#7H0]"],
    "P": ["[*+]", "[#7H2]"],
    "N": ["[*-]", "[C&$(C(=O)O)]", "[P&$(P(=O)O)]", "[S&$(S(=O)O)]"],
    # The third lipophilic rule (carbon bonded only to carbon) is not a SMARTS
    # pattern; it is added by ContructLFromGraphSearch.
    "L": ["[Cl,Br,I]", "[S;D2;$(S(C)(C))]"],
}

# Version marker for this module.
Version = 1.0
###############################################################################
def MatchAtomType(IndexList, AtomTypeDict):
    """
    #################################################################
    Translate a pair of atom indices into CATS atom-pair labels.

    IndexList holds two atom indices. AtomTypeDict maps each PPP type
    letter to the list of atom indices carrying that type.

    Every type of the first atom is combined with every type of the
    second atom. Each combination is rewritten in the canonical letter
    order D, A, P, N, L (e.g. "AD" becomes "DA") and duplicates are
    dropped, keeping the first occurrence.

    The result is a list format, such as ["DD", "DA", "AA"].
    #################################################################
    """
    # Combinations that appear in the "wrong" letter order and must be flipped.
    reversed_pairs = ("AD", "PD", "ND", "LD", "PA", "NA", "LA", "NP", "LN", "LP")

    head, tail = IndexList[0], IndexList[1]
    head_types = [t for t, members in AtomTypeDict.items() if head in members]
    tail_types = [t for t, members in AtomTypeDict.items() if tail in members]

    labels = []
    for left in head_types:
        for right in tail_types:
            pair = left + right
            if pair in reversed_pairs:
                pair = pair[::-1]
            if pair not in labels:
                labels.append(pair)

    return labels


###################################
def ContructLFromGraphSearch(mol):
    """
    #################################################################
    Find the lipophilic carbons that are located by a graph search rather
    than by a SMARTS pattern (last lipophilic pattern on page 55 of the
    book).

    "L" carbon atom adjacent only to carbon atoms. Hydrogen neighbours
    are ignored, and a carbon without any neighbour also qualifies.

    The result is a list format holding the atom indices, taken from the
    molecule returned by Chem.RemoveHs(mol).
    #################################################################
    """
    carbon, hydrogen = 6, 1

    stripped = Chem.RemoveHs(mol)
    lipophilic = []
    for atom in stripped.GetAtoms():
        if atom.GetAtomicNum() != carbon:
            continue
        neighbour_nums = [n.GetAtomicNum() for n in atom.GetNeighbors()]
        # Any neighbour other than carbon (or an ignored hydrogen) disqualifies.
        if all(num in (carbon, hydrogen) for num in neighbour_nums):
            lipophilic.append(atom.GetIdx())

    return lipophilic


###################################
def FormCATSLabel(PathLength=10):
    """
    #################################################################
    Build the ordered list of CATS descriptor names: CATS_DD0, CATS_DD1,
    ..., CATS_DA0, ..., CATS_LL<PathLength-1>.

    Each of the 15 unordered PPP pairs is combined with every distance
    from 0 to PathLength-1, with the distance varying fastest.

    The result is a list format.

    D   donor;
    A   acceptor;
    P   positive;
    N   negative;
    L   lipophilic;
    #################################################################
    """
    letters = "DAPNL"
    # Pair every letter with itself and with each later letter:
    # DD, DA, DP, DN, DL, AA, AP, ..., NL, LL.
    pair_codes = [
        letters[a] + letters[b]
        for a in range(len(letters))
        for b in range(a, len(letters))
    ]
    return [
        "CATS_" + code + str(distance)
        for code in pair_codes
        for distance in range(PathLength)
    ]


###################################


def FormCATSDict(AtomDict, CATSLabel):
    """
    #################################################################
    Construct the CATS dict.

    AtomDict maps a topological distance to the list of PPP pair codes
    (e.g. "DA") found at that distance; empty codes are skipped.
    CATSLabel is the list of descriptor names that must be present in
    the output.

    Every name in CATSLabel starts at 0 and is then overwritten with the
    number of times "<pair><distance>" occurs in AtomDict. A pair/distance
    combination that is missing from CATSLabel is still added to the
    result under "CATS_<pair><distance>".

    The result is a dict format.
    #################################################################
    """
    # Tally all occurrences in a single pass instead of calling list.count()
    # once per distinct key.
    counts = Counter(
        pair + str(distance)
        for distance, pairs in AtomDict.items()
        for pair in pairs
        if pair
    )

    result = dict.fromkeys(CATSLabel, 0)
    for key, occurrences in counts.items():
        result["CATS_" + key] = occurrences

    return result


###################################


def AssignAtomType(mol):
    """
    #################################################################
    Sort the atoms of the mol object into the PPP types defined by the
    module-level PPP table.

    For each SMARTS pattern of a type, the first atom of every
    substructure match is recorded. The "L" list additionally starts
    with the carbons found by ContructLFromGraphSearch, followed by the
    SMARTS-based "L" matches.

    Note: res is a dict form such as {'A': [2], 'P': [], 'N': [4]}
    #################################################################
    """
    assigned = {}
    for label, smarts_list in PPP.items():
        hits = []
        for smarts in smarts_list:
            query = Chem.MolFromSmarts(smarts)
            hits += [match[0] for match in mol.GetSubstructMatches(query)]
        assigned[label] = hits

    # Graph-search carbons go first, then the SMARTS-derived lipophilic atoms.
    assigned["L"] = ContructLFromGraphSearch(mol) + assigned["L"]

    return assigned


###################################
def CATS2D(mol, PathLength=10, scale=3):
    """
    #################################################################
    The main program for calculating the CATS descriptors.

    CATS: chemically advanced template search

    ----> CATS_DA0 ....

    Usage:

        result=CATS2D(mol,PathLength = 10,scale = 1)

        Input: mol is a molecule object. Hydrogens are removed with
               Chem.RemoveHs before anything is counted.

               PathLength is the number of topological distances that are
               considered: pairs at distances 0, 1, ..., PathLength-1 are
               counted (distance 0 pairs every atom with itself).

               scale is the normalization method (descriptor scaling method)

               scale = 1 indicates that no normalization. That is to say: the
               values of the vector represent raw counts ("counts").

               scale = 2 indicates that division by the number of non-hydrogen
               atoms (heavy atoms) in the molecule.

               scale = 3 indicates that each count is divided by the total
               number of occurrences of the same PPP pair summed over all
               considered distances. Pairs that never occur keep a count of 0.

               Any other value of scale yields an empty dict.

        Output: result is a dict format mapping each descriptor name
                (CATS_<pair><distance>) to its value; scaled values are
                rounded to 3 decimals.

    NOTE(review): earlier documentation described scale = 3 as division by
    the added occurrences of the two respective PPPs, which is not what the
    code computes - confirm which normalization is intended.
    #################################################################
    """
    Hmol = Chem.RemoveHs(mol)
    AtomNum = Hmol.GetNumAtoms()
    atomtype = AssignAtomType(Hmol)
    # Keep only the upper triangle so that every atom pair (i < j) is
    # visited once. numpy is used directly: the scipy.triu/scipy.argwhere
    # aliases are deprecated and missing from recent SciPy releases.
    DM = np.triu(Chem.GetDistanceMatrix(Hmol))

    tempdict = {}
    for PL in range(PathLength):
        if PL == 0:
            # Distance 0: each atom paired with itself.
            Index = [[k, k] for k in range(AtomNum)]
        else:
            Index = np.argwhere(DM == PL).tolist()
        temp = []
        for j in Index:
            temp.extend(MatchAtomType(j, atomtype))
        tempdict[PL] = temp

    CATS1 = FormCATSDict(tempdict, FormCATSLabel(PathLength))

    if scale == 1:
        return CATS1

    if scale == 2:
        # With no heavy atoms every count is already 0; avoid dividing by 0.
        denominator = float(AtomNum) if AtomNum else 1.0
        return {label: round(count / denominator, 3) for label, count in CATS1.items()}

    if scale == 3:
        AtomPair = [
            "DD",
            "DA",
            "DP",
            "DN",
            "DL",
            "AA",
            "AP",
            "AN",
            "AL",
            "PP",
            "PN",
            "PL",
            "NN",
            "NL",
            "LL",
        ]
        all_pairs = [pair for pairs in tempdict.values() for pair in pairs]
        AtomPairNum = {pair: all_pairs.count(pair) for pair in AtomPair}

        CATS = {}
        for label, count in CATS1.items():
            # label looks like "CATS_XY<distance>"; characters 5-6 are the pair.
            total = AtomPairNum[label[5:7]]
            if total == 0:
                CATS[label] = round(count, 3)
            else:
                CATS[label] = round(count / float(total), 3)
        return CATS

    # Unknown scaling method: keep the historical behaviour of returning {}.
    return {}


###############################################################################
if __name__ == "__main__":
    # Smoke test: compute the scale-3 CATS descriptors for a few small
    # molecules given as SMILES and print the resulting list of dicts.
    smif = ["CCCC", "CCCCC", "CCCCCC", "CC(N)C(=O)O", "CC(N)C(=O)[O-].[Na+]"]
    AllDes = [
        CATS2D(Chem.MolFromSmiles(smi), PathLength=10, scale=3) for smi in smif
    ]
    print(AllDes)