""" Features - Main Mol2vec Module ============================== """ from tqdm import tqdm import numpy as np import pandas as pd from rdkit import Chem from rdkit.Chem import AllChem from rdkit.Chem import PandasTools from gensim.models import word2vec import timeit from joblib import Parallel, delayed class DfVec(object): """ Helper class to store vectors in a pandas DataFrame Parameters ---------- vec: np.array """ def __init__(self, vec): self.vec = vec if type(self.vec) != np.ndarray: raise TypeError('numpy.ndarray expected, got %s' % type(self.vec)) def __str__(self): return "%s dimensional vector" % str(self.vec.shape) __repr__ = __str__ def __len__(self): return len(self.vec) _repr_html_ = __str__ class MolSentence: """Class for storing mol sentences in pandas DataFrame """ def __init__(self, sentence): self.sentence = sentence if type(self.sentence[0]) != str: raise TypeError('List with strings expected') def __len__(self): return len(self.sentence) def __str__(self): # String representation return 'MolSentence with %i words' % len(self.sentence) __repr__ = __str__ # Default representation def contains(self, word): """Contains (and __contains__) method enables usage of "'Word' in MolSentence""" if word in self.sentence: return True else: return False __contains__ = contains # MolSentence.contains('word') def __iter__(self): # Iterate over words (for word in MolSentence:...) for x in self.sentence: yield x _repr_html_ = __str__ def mol2sentence(mol, radius): """Calculates ECFP (Morgan fingerprint) and returns identifiers of substructures as 'sentence' (string). Returns a tuple with 1) a list with sentence for each radius and 2) a sentence with identifiers from all radii combined. NOTE: Words are ALWAYS reordered according to atom order in the input mol object. NOTE: Due to the way how Morgan FPs are generated, number of identifiers at each radius is smaller Parameters ---------- mol : rdkit.Chem.rdchem.Mol radius : float Fingerprint radius Returns ------- identifier sentence List with sentences for each radius alternating sentence Sentence (list) with identifiers from all radii combined """ radii = list(range(int(radius) + 1)) info = {} _ = AllChem.GetMorganFingerprint(mol, radius, bitInfo=info) # info: dictionary identifier, atom_idx, radius mol_atoms = [a.GetIdx() for a in mol.GetAtoms()] dict_atoms = {x: {r: None for r in radii} for x in mol_atoms} for element in info: for atom_idx, radius_at in info[element]: dict_atoms[atom_idx][radius_at] = element # {atom number: {fp radius: identifier}} # iterate over all atoms and radii identifier_sentences = [] for r in radii: # iterate over radii to get one sentence per radius identifiers = [] for atom in dict_atoms: # iterate over atoms # get one sentence per radius identifiers.append(dict_atoms[atom][r]) identifier_sentences.append(list(map(str, [x for x in identifiers if x]))) # merge identifiers alternating radius to sentence: atom 0 radius0, atom 0 radius 1, etc. identifiers_alt = [] for atom in dict_atoms: # iterate over atoms for r in radii: # iterate over radii identifiers_alt.append(dict_atoms[atom][r]) alternating_sentence = map(str, [x for x in identifiers_alt if x]) return list(identifier_sentences), list(alternating_sentence) def mol2alt_sentence(mol, radius): """Same as mol2sentence() expect it only returns the alternating sentence Calculates ECFP (Morgan fingerprint) and returns identifiers of substructures as 'sentence' (string). 
def mol2alt_sentence(mol, radius):
    """Same as mol2sentence() except it only returns the alternating sentence.

    Calculates ECFP (Morgan fingerprint) and returns identifiers of substructures as a 'sentence' (string), with
    identifiers from all radii combined.

    NOTE: Words are ALWAYS reordered according to atom order in the input mol object.
    NOTE: Due to the way Morgan fingerprints are generated, the number of identifiers at each radius can be smaller
    than the number of atoms.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
    radius : int
        Fingerprint radius

    Returns
    -------
    list
        Alternating sentence with identifiers from all radii combined
    """
    radii = list(range(int(radius) + 1))
    info = {}
    _ = AllChem.GetMorganFingerprint(mol, radius, bitInfo=info)  # info: {identifier: ((atom_idx, radius), ...)}

    mol_atoms = [a.GetIdx() for a in mol.GetAtoms()]
    dict_atoms = {x: {r: None for r in radii} for x in mol_atoms}

    for element in info:
        for atom_idx, radius_at in info[element]:
            dict_atoms[atom_idx][radius_at] = element  # {atom number: {fp radius: identifier}}

    # merge identifiers alternating radius to sentence: atom 0 radius 0, atom 0 radius 1, etc.
    identifiers_alt = []
    for atom in dict_atoms:  # iterate over atoms
        for r in radii:  # iterate over radii
            identifiers_alt.append(dict_atoms[atom][r])

    alternating_sentence = map(str, [x for x in identifiers_alt if x])

    return list(alternating_sentence)


def _parallel_job(mol, r):
    """Helper function for joblib jobs."""
    if mol is not None:
        smiles = Chem.MolToSmiles(mol)
        mol = Chem.MolFromSmiles(smiles)
        sentence = mol2alt_sentence(mol, r)
        return " ".join(sentence)


def _read_smi(file_name):
    """Yields mols from an open (tab-separated) SMILES file handle."""
    while True:
        line = file_name.readline()
        if not line:
            break
        yield Chem.MolFromSmiles(line.split('\t')[0])


def generate_corpus(in_file, out_file, r, sentence_type='alt', n_jobs=1):
    """Generates a corpus file from an sdf/smi file.

    Parameters
    ----------
    in_file : str
        Input sdf
    out_file : str
        Output file name prefix for 'individual' sentences (suffix _r0.corpus, _r1.corpus, ...); the combined
        alternating-sentence corpus is written to out_file itself
    r : int
        Radius of Morgan fingerprint
    sentence_type : str
        Options:
        'all' - generates all corpus files for all types of sentences,
        'alt' - generates a corpus file with only the combined alternating sentence,
        'individual' - generates corpus files for each radius
    n_jobs : int
        Number of cores to use (only the 'alt' sentence type is parallelized)

    Returns
    -------
    None
    """
    # File type detection
    in_split = in_file.split('.')
    if in_split[-1].lower() not in ['sdf', 'smi', 'ism', 'gz']:
        raise ValueError('File extension not supported (sdf, smi, ism, sdf.gz, smi.gz)')
    gzipped = False
    if in_split[-1].lower() == 'gz':
        gzipped = True
        if in_split[-2].lower() not in ['sdf', 'smi', 'ism']:
            raise ValueError('File extension not supported (sdf, smi, ism, sdf.gz, smi.gz)')

    file_handles = []

    # write only files which contain corpus
    if (sentence_type == 'individual') or (sentence_type == 'all'):
        f1 = open(out_file+'_r0.corpus', "w")
        f2 = open(out_file+'_r1.corpus', "w")
        file_handles.append(f1)
        file_handles.append(f2)

    if (sentence_type == 'alt') or (sentence_type == 'all'):
        f3 = open(out_file, "w")
        file_handles.append(f3)

    if gzipped:
        import gzip
        if in_split[-2].lower() == 'sdf':
            mols_file = gzip.open(in_file, mode='r')
            suppl = Chem.ForwardSDMolSupplier(mols_file)
        else:
            mols_file = gzip.open(in_file, mode='rt')
            suppl = _read_smi(mols_file)
    else:
        if in_split[-1].lower() == 'sdf':
            suppl = Chem.ForwardSDMolSupplier(in_file)
        else:
            mols_file = open(in_file, mode='rt')
            suppl = _read_smi(mols_file)

    if sentence_type == 'alt':  # This can run parallelized
        result = Parallel(n_jobs=n_jobs, verbose=1)(delayed(_parallel_job)(mol, r) for mol in suppl)
        for i, line in enumerate(result):
            f3.write(str(line) + '\n')
        print('%i molecules successfully processed.' % len(result))

    else:
        for mol in suppl:
            if mol is not None:
                smiles = Chem.MolToSmiles(mol)
                mol = Chem.MolFromSmiles(smiles)
                identifier_sentences, alternating_sentence = mol2sentence(mol, r)

                identifier_sentence_r0 = " ".join(identifier_sentences[0])
                identifier_sentence_r1 = " ".join(identifier_sentences[1])
                alternating_sentence_r0r1 = " ".join(alternating_sentence)

                if len(smiles) != 0:
                    if (sentence_type == 'individual') or (sentence_type == 'all'):
                        f1.write(str(identifier_sentence_r0)+'\n')
                        f2.write(str(identifier_sentence_r1)+'\n')
                    if (sentence_type == 'alt') or (sentence_type == 'all'):
                        f3.write(str(alternating_sentence_r0r1)+'\n')

    for fh in file_handles:
        fh.close()
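# Illustrative sketch, not part of the original module: a typical corpus-generation call.
# The file names are placeholders; 'compounds.smi' is assumed to be a tab-separated SMILES file.
def _example_generate_corpus():
    generate_corpus('compounds.smi', 'compounds.corpus', r=1, sentence_type='alt', n_jobs=4)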
def _read_corpus(file_name):
    """Yields split lines (lists of identifiers) from an open corpus file handle."""
    while True:
        line = file_name.readline()
        if not line:
            break
        yield line.split()


def insert_unk(corpus, out_corpus, threshold=3, uncommon='UNK'):
    """Handling of uncommon "words" (i.e. identifiers).

    Finds all least common identifiers (defined by threshold) and replaces them with the 'uncommon' string.

    Parameters
    ----------
    corpus : str
        Input corpus file
    out_corpus : str
        Output corpus file
    threshold : int
        Maximum number of occurrences for an identifier to be considered uncommon
    uncommon : str
        String used to replace uncommon words/identifiers

    Returns
    -------
    None
    """
    # Find least common identifiers in corpus
    f = open(corpus)
    unique = {}
    for i, x in tqdm(enumerate(_read_corpus(f)), desc='Counting identifiers in corpus'):
        for identifier in x:
            if identifier not in unique:
                unique[identifier] = 1
            else:
                unique[identifier] += 1
    n_lines = i + 1
    least_common = set([x for x in unique if unique[x] <= threshold])
    f.close()

    f = open(corpus)
    fw = open(out_corpus, mode='w')
    for line in tqdm(_read_corpus(f), total=n_lines, desc='Inserting %s' % uncommon):
        intersection = set(line) & least_common
        if len(intersection) > 0:
            new_line = []
            for item in line:
                if item in least_common:
                    new_line.append(uncommon)
                else:
                    new_line.append(item)
            fw.write(" ".join(new_line) + '\n')
        else:
            fw.write(" ".join(line) + '\n')
    f.close()
    fw.close()


def train_word2vec_model(infile_name, outfile_name=None, vector_size=100, window=10, min_count=3, n_jobs=1,
                         method='skip-gram', **kwargs):
    """Trains a word2vec (Mol2vec, ProtVec) model on a corpus file extracted from molecule/protein sequences.

    The corpus file is treated as a LineSentence corpus (one sentence per line, words separated by whitespace).

    Parameters
    ----------
    infile_name : str
        Corpus file, e.g. proteins split in n-grams or compound identifiers
    outfile_name : str
        Name of the output file where the word2vec model should be saved
    vector_size : int
        Number of dimensions of the vector
    window : int
        Number of words considered as context
    min_count : int
        Number of occurrences a word should have to be considered in training
    n_jobs : int
        Number of cpu cores used for calculation
    method : str
        Method to use in model training. Options: 'cbow' and 'skip-gram' (default: 'skip-gram')

    Returns
    -------
    word2vec.Word2Vec
    """
    if method.lower() == 'skip-gram':
        sg = 1
    elif method.lower() == 'cbow':
        sg = 0
    else:
        raise ValueError("Only 'skip-gram' and 'cbow' are valid options")

    start = timeit.default_timer()
    corpus = word2vec.LineSentence(infile_name)
    model = word2vec.Word2Vec(corpus, size=vector_size, window=window, min_count=min_count, workers=n_jobs, sg=sg,
                              **kwargs)
    if outfile_name:
        model.save(outfile_name)

    stop = timeit.default_timer()
    print('Runtime: ', round((stop - start)/60, 2), ' minutes')
    return model
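# Illustrative sketch, not part of the original module: replacing rare identifiers with 'UNK'
# and training a skip-gram model on the resulting corpus. File names and hyperparameters are
# placeholders, not values prescribed by Mol2vec.
def _example_train_model():
    insert_unk('compounds.corpus', 'compounds_unk.corpus', threshold=3, uncommon='UNK')
    return train_word2vec_model('compounds_unk.corpus', outfile_name='model.pkl', vector_size=100,
                                window=10, min_count=1, n_jobs=4, method='skip-gram')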
def remove_salts_solvents(smiles, hac=3):
    """Removes solvents and ions that have at most 'hac' heavy atoms.

    NOTE: This function removes any fragment in the molecule that has <= 'hac' heavy atoms, even if it is not an
    actual solvent or salt.

    Parameters
    ----------
    smiles : str
        SMILES
    hac : int
        Max number of heavy atoms of fragments to remove

    Returns
    -------
    str
        SMILES of the remaining fragments
    """
    save = []
    for el in smiles.split("."):
        mol = Chem.MolFromSmiles(str(el))
        # keep only fragments with more than 'hac' heavy atoms
        if mol is not None and mol.GetNumHeavyAtoms() > hac:
            save.append(mol)

    return ".".join([Chem.MolToSmiles(x) for x in save])


def sentences2vec(sentences, model, unseen=None):
    """Generate vectors for each sentence (list) in a list of sentences.

    A vector is simply the sum of the vectors of the individual words.

    Parameters
    ----------
    sentences : list, array
        List with sentences
    model : word2vec.Word2Vec
        Gensim word2vec model
    unseen : None, str
        Keyword for unseen words. If None, those words are skipped.
        https://stats.stackexchange.com/questions/163005/how-to-set-the-dictionary-for-text-analysis-using-neural-networks/163032#163032

    Returns
    -------
    np.array
    """
    keys = set(model.wv.vocab.keys())
    vec = []
    if unseen:
        unseen_vec = model.wv.word_vec(unseen)

    for sentence in sentences:
        if unseen:
            vec.append(sum([model.wv.word_vec(y) if y in set(sentence) & keys
                            else unseen_vec for y in sentence]))
        else:
            vec.append(sum([model.wv.word_vec(y) for y in sentence
                            if y in set(sentence) & keys]))
    return np.array(vec)


def featurize(in_file, out_file, model_path, r, uncommon=None):
    """Featurize mols in an SDF or SMI file.

    SMILES are regenerated with RDKit to get canonical SMILES without chirality information.

    Parameters
    ----------
    in_file : str
        Input SDF, SMI, ISM (or GZ)
    out_file : str
        Output csv
    model_path : str
        File path to pre-trained Gensim word2vec model
    r : int
        Radius of Morgan fingerprint
    uncommon : str
        String used to replace uncommon words/identifiers while training. The vector obtained for 'uncommon' is
        used to encode new (unseen) identifiers

    Returns
    -------
    None
    """
    # Load the model
    word2vec_model = word2vec.Word2Vec.load(model_path)
    if uncommon:
        try:
            word2vec_model[uncommon]
        except KeyError:
            raise KeyError('Selected word for uncommon: %s not in vocabulary' % uncommon)

    # File type detection
    in_split = in_file.split('.')
    f_type = in_split[-1].lower()
    if f_type not in ['sdf', 'smi', 'ism', 'gz']:
        raise ValueError('File extension not supported (sdf, smi, ism, sdf.gz, smi.gz)')
    if f_type == 'gz':
        if in_split[-2].lower() not in ['sdf', 'smi', 'ism']:
            raise ValueError('File extension not supported (sdf, smi, ism, sdf.gz, smi.gz)')
        else:
            f_type = in_split[-2].lower()

    print('Loading molecules.')
    if f_type == 'sdf':
        df = PandasTools.LoadSDF(in_file)
        print("Keeping only molecules that can be processed by RDKit.")
        df = df[df['ROMol'].notnull()]
        df['Smiles'] = df['ROMol'].map(Chem.MolToSmiles)
    else:
        df = pd.read_csv(in_file, delimiter='\t', usecols=[0, 1], names=['Smiles', 'ID'])  # Assume <tab> separated
        PandasTools.AddMoleculeColumnToFrame(df, smilesCol='Smiles')
        print("Keeping only molecules that can be processed by RDKit.")
        df = df[df['ROMol'].notnull()]
        df['Smiles'] = df['ROMol'].map(Chem.MolToSmiles)  # Recreate SMILES

    print('Featurizing molecules.')
    df['mol-sentence'] = df.apply(lambda x: MolSentence(mol2alt_sentence(x['ROMol'], r)), axis=1)
    vectors = sentences2vec(df['mol-sentence'], word2vec_model, unseen=uncommon)
    df_vec = pd.DataFrame(vectors, columns=['mol2vec-%03i' % x for x in range(vectors.shape[1])])
    df_vec.index = df.index
    df = df.join(df_vec)

    df.drop(['ROMol', 'mol-sentence'], axis=1).to_csv(out_file)
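# Illustrative sketch, not part of the original module: featurizing molecules with a pre-trained
# model, either via the csv-writing featurize() helper or in memory with sentences2vec().
# Paths and the 'UNK' keyword are placeholders.
def _example_featurize():
    featurize('compounds.smi', 'compounds_mol2vec.csv', 'model.pkl', r=1, uncommon='UNK')

    # In-memory variant for a handful of SMILES strings
    model = word2vec.Word2Vec.load('model.pkl')
    mols = [Chem.MolFromSmiles(s) for s in ('CCO', 'c1ccccc1')]
    sentences = [MolSentence(mol2alt_sentence(m, 1)) for m in mols]
    return sentences2vec(sentences, model, unseen='UNK')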