# Written by Roman Zubatyuk and Justin S. Smith
# Modified by Yutong Zhao to make python2 compatible
import h5py
import numpy as np
import platform
import os

# True when running under Python 3
PY_VERSION = int(platform.python_version().split('.')[0]) >= 3


class datapacker(object):

  def __init__(self, store_file, mode='w-', complib='gzip', complevel=6):
    """Wrapper to store arrays within an HDF5 file."""
    # open the HDF5 store and remember the compression settings
    self.store = h5py.File(store_file, mode=mode)
    self.clib = complib
    self.clev = complevel

  def store_data(self, store_loc, **kwargs):
    """Put arrays into the store under the group path store_loc."""
    g = self.store.create_group(store_loc)
    for k, v in kwargs.items():
      # h5py cannot store unicode strings directly, so encode string
      # lists as UTF-8 bytes before writing
      if isinstance(v, list) and len(v) != 0:
        if isinstance(v[0], (np.str_, str)):
          v = [a.encode('utf8') for a in v]

      g.create_dataset(
          k, data=v, compression=self.clib, compression_opts=self.clev)

  def cleanup(self):
    """Close the HDF5 file."""
    self.store.close()
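
# Example usage of datapacker (a minimal sketch; the file name, group path and
# array shapes below are illustrative, not prescribed by the class):
#
#   dp = datapacker('example.h5')
#   dp.store_data('gdb11/mol_000001',
#                 coordinates=np.zeros((5, 5, 3), dtype=np.float32),
#                 energies=np.zeros(5),
#                 species=['C', 'H', 'H', 'H', 'H'])
#   dp.cleanup()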


class anidataloader(object):
  ''' Constructor '''

  def __init__(self, store_file):
    if not os.path.exists(store_file):
      # IOError works under both Python 2 and 3 (FileNotFoundError is 3-only)
      raise IOError('Error: file not found - ' + store_file)
    self.store = h5py.File(store_file, 'r')  # open read-only

  ''' Recursive group iterator: walks every branch of the file and yields each leaf group's datasets as a dict '''

  def h5py_dataset_iterator(self, g, prefix=''):
    for key in g.keys():
      item = g[key]
      path = '{}/{}'.format(prefix, key)
      keys = list(item.keys())
      if isinstance(item[keys[0]], h5py.Dataset):  # leaf group: yield its datasets
        data = {'path': path}
        for k in keys:
          if not isinstance(item[k], h5py.Group):
            # item[k][()] reads the whole dataset into memory (the .value
            # attribute it replaces was removed in h5py 3)
            dataset = np.array(item[k][()])

            # decode byte strings (e.g. species symbols) back to str
            if isinstance(dataset, np.ndarray) and dataset.size != 0:
              if isinstance(dataset[0], np.bytes_):
                dataset = [a.decode('ascii') for a in dataset]

            data[k] = dataset

        yield data
      else:  # subgroup: recurse into it
        for s in self.h5py_dataset_iterator(item, path):
          yield s

  ''' Default class iterator (iterate through all data) '''

  def __iter__(self):
    for data in self.h5py_dataset_iterator(self.store):
      yield data

  ''' Returns a list of the top-level groups in the file '''

  def get_group_list(self):
    return list(self.store.values())

  ''' Allows iteration through the data in a given group '''

  def iter_group(self, g):
    for data in self.h5py_dataset_iterator(g):
      yield data

  ''' Returns all datasets in the requested group as a dict '''

  def get_data(self, path, prefix=''):
    item = self.store[path]
    path = '{}/{}'.format(prefix, path)
    keys = list(item.keys())
    data = {'path': path}
    for k in keys:
      if not isinstance(item[k], h5py.Group):
        # read the full dataset, then decode byte strings back to str,
        # mirroring h5py_dataset_iterator
        dataset = np.array(item[k][()])

        if isinstance(dataset, np.ndarray) and dataset.size != 0:
          if isinstance(dataset[0], np.bytes_):
            dataset = [a.decode('ascii') for a in dataset]

        data[k] = dataset
    return data

  ''' Returns the number of top-level groups '''

  def group_size(self):
    return len(self.get_group_list())

  ''' Returns the total number of items across all top-level groups '''

  def size(self):
    count = 0
    for g in self.store.values():
      count += len(g.items())
    return count

  ''' Close the HDF5 file '''

  def cleanup(self):
    self.store.close()
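
# Example usage of anidataloader (a sketch; the file name and group path are
# hypothetical, and the available keys depend on how the file was packed):
#
#   adl = anidataloader('example.h5')
#   mol = adl.get_data('gdb11/mol_000001')
#   print(mol['path'], mol['coordinates'].shape)
#   adl.cleanup()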


if __name__ == "__main__":
  base_dir = os.environ["ROITBERG_ANI"]

  # The number of conformations per file grows roughly exponentially with
  # the GDB subset index, so start with the smaller files while developing.
  # Use all of them for production.
  hdf5files = [
      'ani_gdb_s01.h5', 'ani_gdb_s02.h5', 'ani_gdb_s03.h5', 'ani_gdb_s04.h5',
      'ani_gdb_s05.h5', 'ani_gdb_s06.h5', 'ani_gdb_s07.h5', 'ani_gdb_s08.h5'
  ]

  hdf5files = [os.path.join(base_dir, f) for f in hdf5files]

  for hdf5file in hdf5files:
    print("processing", hdf5file)
    adl = anidataloader(hdf5file)
    for data in adl:

      # Extract the data
      P = data['path']
      R = data['coordinates']
      E = data['energies']
      S = data['species']
      smi = data['smiles']
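
      # A minimal sanity check per record (illustrative only): in the ANI-1
      # files, coordinates are (n_conformations, n_atoms, 3) and there is
      # one energy per conformation.
      assert R.shape[0] == E.shape[0]
      print(P, "-", len(S), "atoms,", E.shape[0], "conformations")

    # release the file handle before moving on to the next file
    adl.cleanup()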