Python rdkit.Chem.SDMolSupplier() Examples

The following are 15 code examples of rdkit.Chem.SDMolSupplier(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module rdkit.Chem, or try the search function.
Example #1
Source File: converter.py    From 3DGCN with MIT License 7 votes vote down vote up
def rotate_molecule(path, target_path, count=10):
    """Write randomly rotated copies of every molecule in an SDF file.

    Args:
        path: Path of the SDF file to read.
        target_path: Path of the SDF file to write.
        count: Number of rotated copies produced per readable molecule.

    Records that the supplier yields as ``None`` are skipped.
    """
    # Load dataset
    mols = Chem.SDMolSupplier(path)
    rotated_mols = []

    print("Loaded {} Molecules from {}".format(len(mols), path))

    print("Rotating Molecules...")
    for mol in mols:
        if mol is None:
            # Unreadable record: there is nothing to rotate.
            continue

        for _ in range(count):
            # Rotate a copy. Appending the same object on every pass would
            # leave `count` references to one molecule in its final pose.
            rotated = Chem.Mol(mol)
            conformer = rotated.GetConformer()

            # Draw a single matrix per copy so that all atoms share the same
            # transformation and the geometry stays rigid.
            # NOTE(review): assumes random_rotation_matrix() returns a matrix
            # that np.matmul can apply to a 3-vector - confirm.
            rotation = random_rotation_matrix()
            for atom in rotated.GetAtoms():
                atom_idx = atom.GetIdx()
                pos = list(conformer.GetAtomPosition(atom_idx))
                conformer.SetAtomPosition(atom_idx, np.matmul(rotation, pos))

            rotated_mols.append(rotated)

    w = Chem.SDWriter(target_path)
    try:
        for m in rotated_mols:
            w.write(m)
    finally:
        # Close explicitly so buffered records are flushed to disk.
        w.close()
    print("Saved {} Molecules to {}".format(len(rotated_mols), target_path))
Example #2
Source File: curve.py    From 3DGCN with MIT License 6 votes vote down vote up
def draw_confusion_matrix(dataset, model, set_trial=None, filename="test_results.sdf"):
    """Print the confusion-matrix counts stored in a result SDF file.

    The result directory comes from ``find_average_trial`` when ``set_trial``
    is None; otherwise it is built from ``model``, ``dataset`` and
    ``set_trial``. Each molecule in the file must carry the "true" and "pred"
    properties; predictions are rounded before being compared.
    """
    if set_trial is None:
        path = find_average_trial(dataset, model, metric="test_pr")
    else:
        path = "../result/{}/{}/{}/".format(model, dataset, set_trial)

    # Collect label / prediction pairs from the molecule properties
    labels, scores = [], []
    for entry in Chem.SDMolSupplier(path + filename):
        labels.append(float(entry.GetProp("true")))
        scores.append(float(entry.GetProp("pred")))

    true_y = np.array(labels, dtype=float)
    pred_y = np.round(np.array(scores, dtype=float))

    # Unpack the confusion matrix into its four cells
    tn, fp, fn, tp = confusion_matrix(true_y, pred_y).ravel()

    print("tn: {}, fp: {}, fn: {}, tp: {}".format(tn, fp, fn, tp))
Example #3
Source File: save.py    From PADME with MIT License 6 votes vote down vote up
def load_sdf_files(input_files, clean_mols):
  """Load SDF files into dataframes, one dataframe per input file.

  Args:
    input_files: Paths of the SDF files. Each one needs a companion
      ``<path>.csv`` file holding the task values.
    clean_mols: Passed to ``Chem.SDMolSupplier`` as its second positional
      argument.

  Returns:
    A list of dataframes with the columns 'mol_id', 'smiles' and 'mol'
    followed by the columns of the companion CSV file.
  """
  dataframes = []
  for input_file in input_files:
    # Tasks are stored in .sdf.csv file
    raw_df = next(load_csv_files([input_file + ".csv"], shard_size=None))
    # Structures are stored in .sdf file
    print("Reading structures from %s." % input_file)
    suppl = Chem.SDMolSupplier(str(input_file), clean_mols, False, False)
    df_rows = []
    for ind, mol in enumerate(suppl):
      if mol is not None:
        smiles = Chem.MolToSmiles(mol)
        df_rows.append([ind, smiles, mol])
    # Index the rows by their record position in the SDF file. With the
    # default 0..k-1 index, one unreadable record would shift every later
    # molecule onto the wrong CSV row in the index-based join below.
    # NOTE(review): assumes raw_df is indexed by CSV row position - confirm.
    mol_df = pd.DataFrame(
        df_rows,
        columns=('mol_id', 'smiles', 'mol'),
        index=[row[0] for row in df_rows])
    merged = pd.concat([mol_df, raw_df], axis=1, join='inner')
    # Restore a plain positional index for the returned frame.
    dataframes.append(merged.reset_index(drop=True))
  return dataframes
Example #4
Source File: Getmol.py    From PyBioMed with BSD 3-Clause "New" or "Revised" License 6 votes vote down vote up
def ReadMolFromSDF(filename=""):
    """
    Open an SDF file and give access to the molecules it contains.

    Usage:

        res=ReadMolFromSDF(filename)

        Input: filename is the SDF file name, including its path.

        Output: res is the Chem.SDMolSupplier created for that file.

    Note: iterate over the result (e.g. with a for statement) to obtain
    the individual molecular objects.

    """
    return Chem.SDMolSupplier(filename)
Example #5
Source File: sdf_file_parser.py    From chainer-chemistry with MIT License 6 votes vote down vote up
def extract_total_num(self, filepath):
        """Return the number of records found in an SDF file.

        This helps to choose the value given to the `target_index` option of
        the `parse` method. For example, to extract input features from 10%
        of the whole dataset, the number of samples in the file must be known
        first. The returned value may differ from the final dataset size.

        Args:
            filepath (str): path of the file whose records are counted.

        Returns (int): total number of records that can be parsed.

        """
        return len(Chem.SDMolSupplier(filepath))
Example #6
Source File: predict_enriched.py    From PIDGINv3 with GNU General Public License v3.0 6 votes vote down vote up
def importQuerySDF(in_file):
	"""Read an SDF file and fingerprint every compound that can be used.

	in_file: path of the SDF file.

	Returns a tuple (fingerprints, mols): a numpy uint8 array built from the
	fingerprints returned by calcFingerprints, and the list of molecules it
	returned. Compounds that raise SdfNoneMolError are reported on stdout
	and skipped.
	"""
	outfp = []
	outmol = []
	query = Chem.SDMolSupplier(in_file)
	# Iterate the supplier created above (the loop used to reference an
	# undefined name, which raised NameError).
	for idx, m in enumerate(query):
		sys.stdout.write(' Importing SDF file. Compound number: %s\r' % idx)
		sys.stdout.flush()
		try:
			if not m: raise SdfNoneMolError('None mol')
			# presumably calcFingerprints may raise SdfNoneMolError as well,
			# so the call stays inside the try block
			smi, fp, mol = calcFingerprints(m,qtype='sdf')
			outfp.append(fp)
			outmol.append(mol)
		except SdfNoneMolError:
			# Single-argument print() behaves the same on Python 2 and 3.
			print(' SDF parse error (compound index: ' + str(idx) + ')')
	# Terminate the progress line; a bare `print` only does so on Python 2.
	sys.stdout.write('\n')
	return np.array(outfp,dtype=np.uint8),outmol

#unzip a pkl model 
Example #7
Source File: predict.py    From PIDGINv3 with GNU General Public License v3.0 6 votes vote down vote up
def importQuerySDF(in_file):
	"""Read an SDF file and fingerprint every compound that can be used.

	in_file: path of the SDF file.

	Returns a tuple (fingerprints, mols, ids): a numpy uint8 array built
	from the fingerprints returned by calcFingerprints, the list of
	molecules it returned, and one identifier per compound (the '_Name'
	property, or the SMILES when that property is missing). Compounds that
	raise SdfNoneMolError are reported on stdout and skipped.
	"""
	outfp = []
	outid= []
	outmol = []
	query = Chem.SDMolSupplier(in_file)
	# Iterate the supplier created above (the loop used to reference an
	# undefined name, which raised NameError).
	for idx, m in enumerate(query):
		sys.stdout.write(' Importing SDF file. Compound number: %s\r' % idx)
		sys.stdout.flush()
		try:
			if not m: raise SdfNoneMolError('None mol')
			# presumably calcFingerprints may raise SdfNoneMolError as well,
			# so the call stays inside the try block
			smi, fp, mol = calcFingerprints(m,qtype='sdf')
			# Fall back to the SMILES when the record has no name.
			try: outid.append(m.GetProp('_Name'))
			except KeyError: outid.append(smi)
			outfp.append(fp)
			outmol.append(mol)
		except SdfNoneMolError:
			# Single-argument print() behaves the same on Python 2 and 3.
			print(' SDF parse error (compound index: ' + str(idx) + ')')
	# Terminate the progress line; a bare `print` only does so on Python 2.
	sys.stdout.write('\n')
	return np.array(outfp,dtype=np.uint8),outmol,outid

#unzip a pkl model 
Example #8
Source File: dataset.py    From 3DGCN with MIT License 5 votes vote down vote up
def replace_dataset(self, path, subset="test", target_name="target"):
        """Replace one subset of the dataset with the molecules of an SDF file.

        Molecules, conformer positions and targets are read from ``path`` and
        stored under ``subset`` in ``self.x``, ``self.c`` and ``self.y``.
        Targets are normalized with ``self.mean`` and ``self.std`` and cast
        to int unless ``self.task`` is "regression".

        With a list ``target_name`` a missing property becomes -1. With a
        single name, molecules lacking the property or holding -1 are dropped.
        """
        kept_mols, coords, targets = [], [], []

        for mol in Chem.SDMolSupplier(path):
            if mol is None:
                continue

            if type(target_name) is list:
                # Multitask: one value per requested property
                targets.append([float(mol.GetProp(t)) if t in mol.GetPropNames() else -1 for t in target_name])
                # NOTE(review): reads self.target_name, not the target_name
                # argument - confirm this is intended.
                self.outputs = len(self.target_name)
            elif target_name in mol.GetPropNames():
                # Singletask: -1 marks a molecule to be skipped
                value = float(mol.GetProp(target_name))
                if value == -1:
                    continue
                targets.append(value)
            else:
                continue

            kept_mols.append(mol)
            coords.append(mol.GetConformer().GetPositions())

        # Build every array (normalizing the targets) before touching self
        x_arr = np.array(kept_mols)
        c_arr = np.array(coords)
        y_arr = (np.array(targets) - self.mean) / self.std

        self.x[subset] = x_arr
        self.c[subset] = c_arr
        if self.task != "regression":
            self.y[subset] = y_arr.astype(int)
        else:
            self.y[subset] = y_arr
Example #9
Source File: scatter_plot.py    From 3DGCN with MIT License 5 votes vote down vote up
def find_confusion(dataset, base_path):
    """Save example molecules with extreme and minimal prediction errors.

    For each of the directories ``trial_1`` ... ``trial_10`` below
    ``base_path``, ``test.sdf`` is read and the signed error (true - pred) of
    every molecule is computed from its "true" and "pred" properties. The two
    largest errors, the two smallest errors and the smallest absolute error
    select five molecules, which are written to
    ``confusion_examples_<dataset>_trial<i>.sdf`` in the same directory.

    Args:
        dataset: Dataset name used in the output file name.
        base_path: Directory prefix that contains the trial directories.
    """
    for i in range(1, 11):
        path = base_path + "trial_{}/".format(i)

        # Signed error (true - pred) of every molecule
        mols = Chem.SDMolSupplier(path + "test.sdf")
        diff_y = np.array(
            [float(mol.GetProp("true")) - float(mol.GetProp("pred")) for mol in mols],
            dtype=float)

        # Find largest, smallest error molecules
        idx = np.argsort(diff_y)
        top_1 = mols[int(idx[-1])]
        top_2 = mols[int(idx[-2])]
        btm_1 = mols[int(idx[0])]
        btm_2 = mols[int(idx[1])]

        best_idx = np.argsort(np.abs(diff_y))
        best = mols[int(best_idx[0])]

        # Save example molecules
        writer = Chem.SDWriter(path + "confusion_examples_" + dataset + "_trial" + str(i) + ".sdf")
        try:
            for mol in [top_1, top_2, btm_1, btm_2, best]:
                writer.write(mol)
        finally:
            # Close explicitly so each trial's file is flushed before the
            # next writer is created.
            writer.close()
Example #10
Source File: save.py    From deepchem with MIT License 5 votes vote down vote up
def load_sdf_files(input_files, clean_mols, tasks=None):
  """Load SDF files into dataframes, one dataframe per input file.

  Args:
    input_files: Paths of the SDF files. When ``<path>.csv`` exists, the task
      values are taken from it; otherwise they are read from the SDF
      properties named in ``tasks``.
    clean_mols: Passed to ``Chem.SDMolSupplier`` as its second positional
      argument.
    tasks: Names of the SDF properties to read when no CSV file exists.
      Defaults to no tasks.

  Returns:
    A list of dataframes with the columns 'mol_id', 'smiles' and 'mol'
    followed by the CSV columns or by one column per task.
  """
  from rdkit import Chem
  # None replaces the former mutable default argument.
  tasks = [] if tasks is None else tasks
  dataframes = []
  for input_file in input_files:
    # Tasks are either in .sdf.csv file or in the .sdf file itself
    has_csv = os.path.isfile(input_file + ".csv")
    # Structures are stored in .sdf file
    print("Reading structures from %s." % input_file)
    suppl = Chem.SDMolSupplier(str(input_file), clean_mols, False, False)
    df_rows = []
    for ind, mol in enumerate(suppl):
      if mol is None:
        continue
      smiles = Chem.MolToSmiles(mol)
      df_row = [ind, smiles, mol]
      if not has_csv:  # Get task targets from .sdf file
        for task in tasks:
          df_row.append(mol.GetProp(str(task)))
      df_rows.append(df_row)
    if has_csv:
      # Index the rows by their record position in the SDF file. With the
      # default 0..k-1 index, one unreadable record would shift every later
      # molecule onto the wrong CSV row in the index-based join below.
      # NOTE(review): assumes raw_df is indexed by CSV row position - confirm.
      mol_df = pd.DataFrame(
          df_rows,
          columns=('mol_id', 'smiles', 'mol'),
          index=[row[0] for row in df_rows])
      raw_df = next(load_csv_files([input_file + ".csv"], shard_size=None))
      merged = pd.concat([mol_df, raw_df], axis=1, join='inner')
      # Restore a plain positional index for the returned frame.
      dataframes.append(merged.reset_index(drop=True))
    else:
      mol_df = pd.DataFrame(
          df_rows, columns=('mol_id', 'smiles', 'mol') + tuple(tasks))
      dataframes.append(mol_df)
  return dataframes
Example #11
Source File: rdk.py    From oddt with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def readstring(format, string, **kwargs):
    """Build a Molecule from its textual representation.

    Required parameters:
       format - name of the input format (case-insensitive); see the
                informats variable for a list of available input formats
       string - the molecule data

    Extra keyword arguments are forwarded to the RDKit parser of the chosen
    format, except for "mol"/"sdf", where they are not used.

    Example:
    >>> smiles = "C1=CC=CS1"
    >>> mymol = readstring("smi", smiles)
    >>> len(mymol.atoms)
    5
    """
    text = str(string)
    fmt = format.lower()

    if fmt in ("mol", "sdf"):
        # Only the first record of the data is read
        supplier = Chem.SDMolSupplier()
        supplier.SetData(text)
        mol = next(supplier)
        del supplier
    elif fmt == "mol2":
        mol = Chem.MolFromMol2Block(text, **kwargs)
    elif fmt == "pdb":
        mol = MolFromPDBBlock(text, **kwargs)
    elif fmt == 'pdbqt':
        mol = MolFromPDBQTBlock(text, **kwargs)
    elif fmt == "smi":
        # First line only: the SMILES, then optional words forming the name
        tokens = text.strip().split('\n')[0].strip().split()
        mol = Chem.MolFromSmiles(tokens[0], **kwargs)
        if mol:
            mol.SetProp("_Name", ' '.join(tokens[1:]))
    elif fmt == 'inchi' and Chem.INCHI_AVAILABLE:
        mol = Chem.inchi.MolFromInchi(text, **kwargs)
    else:
        raise ValueError("%s is not a recognised RDKit format" % fmt)

    return Molecule(mol)
Example #12
Source File: spectra_predictor.py    From deep-molecular-massspec with Apache License 2.0 5 votes vote down vote up
def get_mol_list_from_sdf(sdf_fname):
  """Collects every molecule of an sdf file into a list.

  Note: rdkit's Chem.SDMolSupplier only accepts filenames as inputs, so this
  code only supports local filesystem name environments.

  Args:
    sdf_fname: Path to sdf file.

  Returns:
    List of rdkit.Mol objects, in file order.

  Raises:
    ValueError if a molblock in the SDF cannot be parsed.
  """
  supplier = Chem.SDMolSupplier(sdf_fname)
  parsed = []

  for position, entry in enumerate(supplier):
    if entry is None:
      # Stop at the first bad record and report its raw text
      raise ValueError("Unable to parse the following mol block %s" %
                       supplier.GetItemText(position))
    parsed.append(entry)
  return parsed
Example #13
Source File: chopRDKit03.py    From eMolFrag with GNU General Public License v3.0 5 votes vote down vote up
def FragmentSanitize(tempSDFPath):
    """Fragment the first molecule of an SDF file along its BRICS bonds.

    The file is read with sanitize=True. The molecule is cut with
    Chem.FragmentOnBRICSBonds and the pieces are returned by
    Chem.GetMolFrags(asMols=True, sanitizeFrags=False).

    Raises:
        RDKitError: with code 1 when any step fails.
    """
    try:
        suppl2 = Chem.SDMolSupplier(tempSDFPath,sanitize=True)
        newmol2=Chem.FragmentOnBRICSBonds(suppl2[0])
        mfl=Chem.GetMolFrags(newmol2,asMols=True,sanitizeFrags=False)
        return mfl
    except Exception:
        # Exception rather than a bare except, so that KeyboardInterrupt and
        # SystemExit are not turned into an RDKitError.
        raise RDKitError(1)
Example #14
Source File: converter.py    From 3DGCN with MIT License 4 votes vote down vote up
def converter(path, target_path, name, target_name, process=20):
    """Optimize the conformers of a dataset and save them as an SDF file.

    Args:
        path: Input file; its name must contain ".csv" or ".sdf".
        target_path: Path of the SDF file to write.
        name: Passed to ``load_csv`` together with ``target_name`` (CSV input).
        target_name: Passed to ``load_csv`` for CSV input; for SDF input, the
            molecule property that holds the target value.
        process: Number of worker processes.

    Raises:
        ValueError: If ``path`` is neither a CSV nor an SDF file name.

    Molecules that cannot be parsed, or for which ``optimize_conformer``
    returns ``None``, are left out. Every saved molecule carries its target
    value in the "target" property.
    """
    # Load dataset
    print("Loading Dataset...")
    if ".csv" in path:
        x, y = load_csv(path, name, target_name)
        mols, props = [], []
        for smi, prop in zip(x, y):
            mol = Chem.MolFromSmiles(smi)
            if mol is not None:
                mols.append(mol)
                props.append(prop)

    elif ".sdf" in path:
        # Keep readable records only: the supplier yields None for a record
        # it cannot parse, and GetProp would fail on it.
        mols = [mol for mol in Chem.SDMolSupplier(path) if mol is not None]
        props = [mol.GetProp(target_name) for mol in mols]

    else:
        raise ValueError("Unsupported file type.")
    mol_idx = list(range(len(mols)))
    print("Loaded {} Molecules from {}".format(len(mols), path))

    # Optimize coordinate using multiprocessing
    print("Optimizing Conformers...")
    # The context manager releases the worker processes once the blocking
    # starmap call has returned.
    with mp.Pool(process) as pool:
        results = pool.starmap(optimize_conformer, zip(mol_idx, mols, props))

    # Remove None and add properties
    mol_list_filtered = []
    for mol, prop in results:
        if mol is not None:
            mol.SetProp("target", str(prop))
            mol_list_filtered.append(mol)
    print("{} Molecules Optimized".format(len(mol_list_filtered)))

    # Save molecules
    print("Saving File...")
    w = Chem.SDWriter(target_path)
    try:
        for m in mol_list_filtered:
            w.write(m)
    finally:
        # Close explicitly so buffered records are flushed to disk.
        w.close()
    print("Saved {} Molecules to {}".format(len(mol_list_filtered), target_path))
Example #15
Source File: sparse_molecular_dataset.py    From MolGAN with MIT License 4 votes vote down vote up
def generate(self, filename, add_h=False, filters=lambda x: True, size=None, validation=0.1, test=0.1):
        """Build the dataset from an '.sdf' or '.smi' file.

        Args:
            filename: Input file; the extension selects the parser.
            add_h: When true, ``Chem.AddHs`` is applied to every molecule.
            filters: Predicate; only molecules for which it is true are kept.
            size: Maximum number of molecules kept (None keeps all).
            validation: Forwarded to ``_generate_train_validation_test``.
            test: Forwarded to ``_generate_train_validation_test``.

        Raises:
            ValueError: If ``filename`` ends with neither '.sdf' nor '.smi'.
        """
        self.log('Extracting {}..'.format(filename))

        if filename.endswith('.sdf'):
            supplier = Chem.SDMolSupplier(filename)
            # Number of records in the file, readable or not
            total = len(supplier)
            self.data = [mol for mol in supplier if mol is not None]
        elif filename.endswith('.smi'):
            # The context manager closes the file handle once it is read.
            with open(filename, 'r') as handle:
                lines = handle.readlines()
            total = len(lines)
            # Drop lines RDKit cannot parse, as done for SDF records.
            parsed = (Chem.MolFromSmiles(line) for line in lines)
            self.data = [mol for mol in parsed if mol is not None]
        else:
            # Fail early instead of silently reusing an older self.data.
            raise ValueError('Unsupported file type: {}'.format(filename))

        self.data = list(map(Chem.AddHs, self.data)) if add_h else self.data
        self.data = list(filter(filters, self.data))
        self.data = self.data[:size]

        # `total` comes from the file that was actually read; re-opening the
        # file as SDF here would be wrong for '.smi' input.
        self.log('Extracted {} out of {} molecules {}adding Hydrogen!'.format(len(self.data),
                                                                              total,
                                                                              '' if add_h else 'not '))

        self._generate_encoders_decoders()
        self._generate_AX()

        # it contains all the molecules stored as rdkit.Chem objects
        self.data = np.array(self.data)

        # it contains all the molecules stored as SMILES strings
        self.smiles = np.array(self.smiles)

        # a (N, L) matrix where N is the length of the dataset and each L-dim vector contains the
        # indices corresponding to a SMILES sequence, padded to the length of the longest
        # SMILES sequence in the dataset (see self._genS)
        self.data_S = np.stack(self.data_S)

        # a (N, 9, 9) tensor where N is the length of the dataset and each 9x9 matrix contains the
        # indices of the positions of the ones in the one-hot representation of the adjacency tensor
        # (see self._genA)
        self.data_A = np.stack(self.data_A)

        # a (N, 9) matrix where N is the length of the dataset and each 9-dim vector contains the
        # indices of the positions of the ones in the one-hot representation of the annotation matrix
        # (see self._genX)
        self.data_X = np.stack(self.data_X)

        # a (N, 9) matrix where N is the length of the dataset and each 9-dim vector contains the
        # diagonal of the corresponding adjacency matrix
        self.data_D = np.stack(self.data_D)

        # a (N, F) matrix where N is the length of the dataset and each F vector contains features
        # of the corresponding molecule (see self._genF)
        self.data_F = np.stack(self.data_F)

        # a (N, 9) matrix where N is the length of the dataset and each 9-dim vector contains the
        # eigenvalues of the corresponding Laplacian matrix
        self.data_Le = np.stack(self.data_Le)

        # a (N, 9, 9) matrix where N is the length of the dataset and each 9x9 matrix contains the
        # eigenvectors of the corresponding Laplacian matrix
        self.data_Lv = np.stack(self.data_Lv)

        self.vertexes = self.data_F.shape[-2]
        self.features = self.data_F.shape[-1]

        self._generate_train_validation_test(validation, test)