python source code of preprocess

from collections import defaultdict

import numpy as np

from scipy import spatial

import torch


def create_atoms(atoms, atom_dict):
    """Transform the atom types in a molecule (e.g., H, C, and O)
    into the indices (e.g., H=0, C=1, and O=2).
    """
    atoms = [atom_dict[a] for a in atoms]
    return np.array(atoms)


def create_distances(coords):
    """Create the distance matrix from a set of 3D coordinates.
    Note that we transform the element 0.0 in the matrix into a large value
    for processing by Gaussian exp(-d^2), where d is the distance.
    """
    distance_matrix = spatial.distance_matrix(coords, coords)
    return np.where(distance_matrix == 0.0, 1e6, distance_matrix)


def split_dataset(dataset, ratio):
    """Shuffle and split a dataset."""
    np.random.seed(1234)  # fix the seed for shuffle.
    np.random.shuffle(dataset)
    n = int(ratio * len(dataset))
    return dataset[:n], dataset[n:]


def create_datasets(dataset, physical_property, device):

    dir_dataset = '../dataset/' + dataset + '/'

    """Initialize atom_dict, in which
    each key is an atom type and each value is its index.
    """
    atom_dict = defaultdict(lambda: len(atom_dict))

    def create_dataset(filename):

        print(filename)

        """Load a dataset."""
        with open(dir_dataset + filename, 'r') as f:
            property_types = f.readline().strip().split()
            data_original = f.read().strip().split('\n\n')

        """The physical_property is an energy, HOMO, or LUMO."""
        property_index = property_types.index(physical_property)

        dataset = []

        for data in data_original:

            data = data.strip().split('\n')
            idx = data[0]
            property = float(data[-1].split()[property_index])

            """Load the atoms and their coordinates of a molecular data."""
            atoms, atom_coords = [], []
            for atom_xyz in data[1:-1]:
                atom, x, y, z = atom_xyz.split()
                atoms.append(atom)
                xyz = [float(v) for v in [x, y, z]]
                atom_coords.append(xyz)

            """Create each data with the above defined functions."""
            atoms = create_atoms(atoms, atom_dict)
            distance_matrix = create_distances(atom_coords)
            molecular_size = len(atoms)

            """Transform the above each data of numpy
            to pytorch tensor on a device (i.e., CPU or GPU).
            """
            atoms = torch.LongTensor(atoms).to(device)
            distance_matrix = torch.FloatTensor(distance_matrix).to(device)
            property = torch.FloatTensor([[property]]).to(device)

            dataset.append((atoms, distance_matrix, molecular_size, property))

        return dataset

    dataset_train = create_dataset('data_train.txt')
    dataset_train, dataset_dev = split_dataset(dataset_train, 0.9)
    dataset_test = create_dataset('data_test.txt')

    N_atoms = len(atom_dict)

    return dataset_train, dataset_dev, dataset_test, N_atoms