python source code of load_lowe_examples_into_db

## Meant to use with Daniel Lowe's patent database
# reaction SMILES strings only (for now)

from __future__ import print_function
import argparse
from numpy.random import shuffle # for random selection
from numpy.random import random
import rdkit.Chem as Chem          # molecule building
from rdkit.Chem import AllChem
from collections import defaultdict
import rdkit.Chem.Draw as Draw
from rdkit import RDLogger
import datetime # for info files
import json # for dumping
import sys  # for commanad line
import os   # for file paths
import re 
import itertools
from xml.dom import minidom

from ochem_predict_nn.utils.database import collection_example_reactions_details
collection = collection_example_reactions_details()

def doc_to_dic(document):
	'''Converts a parsed XML file to a dictionary using relevant fields'''
	dic = {}
	dic['reaction_smiles'] = \
		str(document.getElementsByTagName('dl:reactionSmiles')[0].firstChild.nodeValue)
	products = []
	for node in document.getElementsByTagName('product'):
		products.append(mol_to_dic(node, withAmounts = True))
	reactants = []
	for node in document.getElementsByTagName('reactant'):
		reactants.append(mol_to_dic(node))
	catalysts = []; solvents = []; spectators = [];
	for node in document.getElementsByTagName('spectator'):
		role = node.attributes.getNamedItem('role').value
		if role == 'catalyst':
			catalysts.append(mol_to_dic(node))
		elif role == 'solvent':
			solvents.append(mol_to_dic(node))
		else:
			raise ValueError('Unknown spectator role: {}'.format(role))
	if not reactants or not products:
		raise ValueError('No reactants/products?')
	dic['reactants'] = reactants
	dic['products'] = products 
	if catalysts: dic['catalysts'] = catalysts 
	if solvents: dic['solvents'] = solvents 
	if spectators: dic['spectators'] = spectators

	# Make yield more accessible
	if sum(['yield' in product for product in products]) == 1:
		dic['yield'] = [product['yield'] for product in products if 'yield' in product][0]

	return dic

def mol_to_dic(node, withAmounts = False):
	'''Converts a node containing molecule information into a
	dictionary'''
	dic = {}
	# Get name
	dic['name'] = str(node.getElementsByTagName('name')[0].firstChild.nodeValue)
	# If exact entity match, more data is available
	#print(node.toprettyxml())
	#entityType = node.getElementsByTagName('dl:entityType')[0].firstChild.nodeValue
	#if entityType == 'exact' or entityType == 'definiteReference':
	identifiers = {
		child.attributes.getNamedItem('dictRef').value : \
		child.attributes.getNamedItem('value').value \
		for child in node.getElementsByTagName('identifier')
	}
	if 'cml:inchi' in identifiers.keys():
		mol = AllChem.MolFromInchi(str(identifiers['cml:inchi']))
	elif 'cml:smiles' in identifiers.keys():
		mol = AllChem.MolFromSmiles(str(identifiers['cml:smiles']))
	else:
		print('identifiers: {}'.format(identifiers.keys()))
		raise ValueError('No molecular identifier for {}'.format(dic['name']))
	if not mol: raise ValueError('Couldnt parse molecule: {}'.format(identifiers))

	Chem.SanitizeMol(mol)
	dic['smiles'] = AllChem.MolToSmiles(mol, isomericSmiles=True)
	dic['inchi'] = AllChem.MolToInchi(mol)
	# elif entityType == 'chemicalClass':
	# 	pass # name is all we get
	# else:
	# 	raise ValueError('Unknown entityType for molecule: {}'.format(entityType))
	# Quantity?
	if withAmounts:
		amounts = {
			child.attributes.getNamedItem('units').value : \
			child.firstChild.nodeValue \
			for child in node.getElementsByTagName('amount')
		}
		if 'unit:percentYield' in amounts.keys():
			dic['yield'] = float(amounts['unit:percentYield'])
		if 'unit:g' in amounts.keys():
			dic['amount(g)'] = float(amounts['unit:g'])
	return dic

def main(db_fpath, N = 15):
	'''Read reactions from Lowe's patent reaction SMILES'''

	try:
		# Open file
		file_generator = get_reaction_file(db_fpath)
		print(file_generator)
		documents = []
		for i, rxn in enumerate(file_generator):
			if i == N:
				break

			print('~~~~~~~ {} ~~~~~~'.format(i))
			print('{}: {}'.format(i, rxn))
			document = minidom.parse(rxn)
			try:
				dic = doc_to_dic(document)
				dic['random'] = random()
				documents.append(dic)
			except ValueError as e:
				print(e)

			# Report progress and insert every 1000
			if ((i+1) % 1000) == 0:
				print('{}/{}'.format(i+1, N))
				result = collection.insert(documents)
				documents = []

		if documents: result = collection.insert(documents)
	except KeyboardInterrupt:
		print('Stopped early!')		

	print('Created {} database entries'.format(collection.find().count()))

	return True


def get_reaction_file(fpath):
	'''This function is used to recursively traverse a directory and find
	all .cml files contained within. It is used as a generator.'''
	for dir, subdirs, files in os.walk(fpath):
		subdirs.sort(reverse = True)
		for subdir in subdirs:
			for p in get_reaction_file(subdir):
				yield p
		files.sort(reverse = True)
		for file in files:
			yield os.path.join(dir, file)

if __name__ == '__main__':

	parser = argparse.ArgumentParser()
	parser.add_argument('data_file', type = str, 
		 				help = 'File where each line is an atom-mapped smiles reaction')
	parser.add_argument('-n', '--num', type = int, default = 50,
						help = 'Maximum number of records to load; defaults to 50')
	args = parser.parse_args()

	clear = raw_input('Do you want to clear the {} existing examples? '.format(collection.find().count()))
	if clear in ['y', 'Y', 'yes', '1', 'Yes']:
		result = collection.delete_many({})
		print('Cleared {} entries from collection'.format(result.deleted_count))

	main(args.data_file, N = args.num)