# -*- coding: utf-8 -*- import os import sys import glob import logging import argparse import pickle from configparser import ConfigParser from pyscenic.utils import load_from_yaml from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase from pyscenic.prune import prune2df from dask.diagnostics import ProgressBar from cytoolz import mapcat from multiprocessing import cpu_count CONFIG_FILENAME = os.path.join(os.path.dirname(__file__), "hpc-prune.ini") # Setting the root logger for this entire project. LOGGER = logging.getLogger(os.path.splitext(__name__)[0]) def create_logging_handler(debug: bool) -> logging.Handler: """ Create a handler for logging purposes. :param debug: Does debug information need to be logged? :return: The logging handler. """ # By default stderr is used as the stream for logging. ch = logging.StreamHandler(stream=sys.stderr) # Logging level DEBUG is less severe than INFO. Therefore when the logging level is set # to DEBUG, information will still be outputted. In addition, errors and warnings are more # severe than info and therefore will always be outputted to the log. ch.setLevel(logging.DEBUG if debug else logging.INFO) ch.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')) return ch class NoProgressBar: def __enter__(self): return self def __exit__(*x): pass def create_argument_parser(): parser = argparse.ArgumentParser(prog='hpc-prune') parser.add_argument('config_filename', type=str, default=CONFIG_FILENAME, help='Name of configuration file to use.') parser.add_argument('-i', '--input', type=str, help='Intput file/stream.') parser.add_argument('-o', '--output', type=str, help='Output file/stream.') parser.add_argument('--num_workers', type=int, default=cpu_count(), help='The number of workers to use. Only valid of using dask_multiprocessing, custom_multiprocessing or local as mode. (default: {}).'.format(cpu_count())) return parser def run(args): # Set logging level. logging_debug_opt = False LOGGER.addHandler(create_logging_handler(logging_debug_opt)) LOGGER.setLevel(logging.DEBUG) LOGGER.info("Using configuration {}.".format(args.config_filename)) cfg = ConfigParser() cfg.read(args.config_filename) in_fname = cfg['data']['modules'] if not args.input else args.input LOGGER.info("Loading modules from {}.".format(in_fname)) # Loading from YAML is extremely slow. Therefore this is a potential performance improvement. # Potential improvements are switching to JSON or to use a CLoader: # https://stackoverflow.com/questions/27743711/can-i-speedup-yaml if in_fname.endswith('.yaml'): modules = load_from_yaml(in_fname) else: with open(in_fname, 'rb') as f: modules = pickle.load(f) # Filter out modules with to few genes. min_genes = int(cfg['parameters']['min_genes']) modules = list(filter(lambda m: len(m) >= min_genes, modules)) LOGGER.info("Loading databases.") def name(fname): return os.path.splitext(os.path.basename(fname))[0] db_fnames = list(mapcat(glob.glob, cfg['data']['databases'].split(";"))) dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames] LOGGER.info("Calculating regulons.") motif_annotations_fname = cfg['data']['motif_annotations'] mode= cfg['parameters']['mode'] with ProgressBar() if mode == "dask_multiprocessing" else NoProgressBar(): df = prune2df(dbs, modules, motif_annotations_fname, rank_threshold=int(cfg['parameters']['rank_threshold']), auc_threshold=float(cfg['parameters']['auc_threshold']), nes_threshold=float(cfg['parameters']['nes_threshold']), client_or_address=mode, module_chunksize=cfg['parameters']['chunk_size'], num_workers=args.num_workers) LOGGER.info("Writing results to file.") df.to_csv(cfg['parameters']['output'] if not args.output else args.output) if __name__ == "__main__": parser = create_argument_parser() run(parser.parse_args())