# -*- coding: utf-8 -*-
"""
Command-line interface for pySCENIC: Single-CEll regulatory Network Inference
and Clustering.

Provides three sub-commands:
  grn    - infer TF-target co-expression adjacencies from an expression matrix.
  ctx    - find enriched motifs for modules and prune targets (regulon creation).
  aucell - quantify regulon activity (AUC) across single cells.
"""

import os

# Pin BLAS libraries to a single thread BEFORE numpy is (transitively) imported:
# parallelism is managed by dask/multiprocessing, and letting OpenBLAS/MKL spawn
# their own thread pools would oversubscribe the CPUs.
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"

import argparse
import logging
import sys
from multiprocessing import cpu_count
from pathlib import Path, PurePath
from shutil import copyfile
from typing import Sequence, Type

from dask.diagnostics import ProgressBar

from arboreto.algo import genie3, grnboost2
from arboreto.utils import load_tf_names

from pyscenic.aucell import aucell
from pyscenic.log import create_logging_handler
from pyscenic.prune import _prepare_client, find_features, prune2df
from pyscenic.rnkdb import RankingDatabase, opendb
from pyscenic.utils import modules_from_adjacencies

from .utils import (load_exp_matrix, load_signatures, save_matrix, save_enriched_motifs,
                    load_adjacencies, load_modules, append_auc_mtx,
                    ATTRIBUTE_NAME_CELL_IDENTIFIER, ATTRIBUTE_NAME_GENE)
from .utils import is_valid_suffix, suffixes_to_separator

try:
    from pyscenic._version import get_versions
    VERSION = get_versions()['version']
except Exception:
    # Version metadata is best effort only; never let its absence kill the CLI.
    # (Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt propagate.)
    VERSION = "?.?.?"

LOGGER = logging.getLogger(__name__)


def find_adjacencies_command(args):
    """
    Infer co-expression modules (the `grn` sub-command).

    Loads the expression matrix and the list of transcription factors, runs
    GRNBoost2 or GENIE3 and writes the resulting TF-target adjacency table to
    the output file, using a separator derived from the output extension.
    """
    LOGGER.info("Loading expression matrix.")
    try:
        ex_mtx = load_exp_matrix(args.expression_mtx_fname.name,
                                 (args.transpose == 'yes'),
                                 args.sparse,
                                 args.cell_id_attribute,
                                 args.gene_attribute)
    except ValueError as e:
        LOGGER.error(e)
        sys.exit(1)

    tf_names = load_tf_names(args.tfs_fname.name)

    # With --sparse, load_exp_matrix returns a (matrix, gene_names) pair;
    # otherwise a cells x genes DataFrame.
    if args.sparse:
        n_total_genes = len(ex_mtx[1])
        # BUGFIX: `len(x.isin(tf_names))` is the length of the boolean mask,
        # i.e. the TOTAL number of genes - the coverage warning below could
        # never trigger. Summing the mask counts the actual matches.
        n_matching_genes = ex_mtx[1].isin(tf_names).sum()
    else:
        n_total_genes = len(ex_mtx.columns)
        n_matching_genes = ex_mtx.columns.isin(tf_names).sum()
    if n_total_genes == 0:
        LOGGER.error("The expression matrix supplied does not contain any genes. "
                     "Make sure the extension of the file matches the format (tab separation for TSV and "
                     "comma separation for CSV).")
        sys.exit(1)
    if float(n_matching_genes) / n_total_genes < 0.80:
        LOGGER.warning("Expression data is available for less than 80% of the supplied transcription factors.")

    LOGGER.info("Inferring regulatory networks.")
    client, shutdown_callback = _prepare_client(args.client_or_address, num_workers=args.num_workers)
    method = grnboost2 if args.method == 'grnboost2' else genie3
    try:
        network = method(expression_data=ex_mtx,
                         tf_names=tf_names,
                         verbose=True,
                         client_or_address=client,
                         seed=args.seed)
    finally:
        # Always tear the dask client down, even if inference raised.
        shutdown_callback(False)

    LOGGER.info("Writing results to file.")
    extension = PurePath(args.output.name).suffixes
    network.to_csv(args.output.name, index=False, sep=suffixes_to_separator(extension))


def adjacencies2modules(args):
    """
    Convert a TF-target adjacency table into co-expression modules.

    Loads the adjacencies and the (dense) expression matrix, then derives
    modules using the thresholds/top-N strategies given on the command line.
    Exits the process with status 1 if either input fails to load.
    """
    try:
        adjacencies = load_adjacencies(args.module_fname.name)
    except ValueError as e:
        LOGGER.error(e)
        sys.exit(1)

    LOGGER.info("Loading expression matrix.")
    try:
        ex_mtx = load_exp_matrix(args.expression_mtx_fname.name,
                                 (args.transpose == 'yes'),
                                 False,  # sparse loading is disabled here for now
                                 args.cell_id_attribute,
                                 args.gene_attribute)
    except ValueError as e:
        LOGGER.error(e)
        sys.exit(1)
    return modules_from_adjacencies(adjacencies,
                                    ex_mtx,
                                    thresholds=args.thresholds,
                                    top_n_targets=args.top_n_targets,
                                    top_n_regulators=args.top_n_regulators,
                                    min_genes=args.min_genes,
                                    rho_mask_dropouts=args.mask_dropouts,
                                    keep_only_activating=(args.all_modules != "yes"))


def _load_dbs(fnames: Sequence[str]) -> Sequence[Type[RankingDatabase]]:
    """Open the ranking databases, naming each after its file's basename (sans extension)."""
    def get_name(fname):
        return os.path.splitext(os.path.basename(fname))[0]
    return [opendb(fname=fname.name, name=get_name(fname.name)) for fname in fnames]


class NoProgressBar:
    """No-op context manager used in place of dask's ProgressBar when it does not apply."""
    def __enter__(self):
        return self

    def __exit__(*x):
        pass


def prune_targets_command(args):
    """
    Prune targets / find enriched features (the `ctx` sub-command).

    Accepts either an adjacency table (modules are then generated from it,
    which requires an expression matrix) or a pre-built module collection,
    scores the modules against the ranking databases and writes the enriched
    motifs (or pruned regulons) to the output.
    """
    # Loading from YAML is extremely slow. Therefore this is a potential performance improvement.
    # Potential improvements are switching to JSON or to use a CLoader:
    # https://stackoverflow.com/questions/27743711/can-i-speedup-yaml
    # The alternative for which was opted in the end is binary pickling.
    extension = PurePath(args.module_fname.name).suffixes
    if is_valid_suffix(extension, 'ctx'):
        if args.expression_mtx_fname is None:
            LOGGER.error("No expression matrix is supplied.")
            # BUGFIX: this is an error path; exit with a failure status rather
            # than the previous sys.exit(0).
            sys.exit(1)
        LOGGER.info("Creating modules.")
        modules = adjacencies2modules(args)
    else:
        LOGGER.info("Loading modules.")
        try:
            modules = load_modules(args.module_fname.name)
        except ValueError as e:
            LOGGER.error(e)
            sys.exit(1)

    if len(modules) == 0:
        LOGGER.error("Not a single module loaded")
        sys.exit(1)

    LOGGER.info("Loading databases.")
    dbs = _load_dbs(args.database_fname)

    LOGGER.info("Calculating regulons.")
    motif_annotations_fname = args.annotations_fname.name
    calc_func = find_features if args.no_pruning == "yes" else prune2df
    # dask's ProgressBar only makes sense for the dask_multiprocessing mode.
    with ProgressBar() if args.mode == "dask_multiprocessing" else NoProgressBar():
        df_motifs = calc_func(dbs,
                              modules,
                              motif_annotations_fname,
                              rank_threshold=args.rank_threshold,
                              auc_threshold=args.auc_threshold,
                              nes_threshold=args.nes_threshold,
                              client_or_address=args.mode,
                              module_chunksize=args.chunk_size,
                              num_workers=args.num_workers)

    LOGGER.info("Writing results to file.")
    if args.output.name == '<stdout>':
        df_motifs.to_csv(args.output)
    else:
        save_enriched_motifs(df_motifs, args.output.name)


def aucell_command(args):
    """
    Calculate regulon enrichment (as AUC values) for cells (the `aucell` sub-command).

    Loads the expression matrix and the gene signatures, computes the AUCell
    matrix and writes it out - appended to a copy of the input loom file when a
    .loom output is requested, otherwise as a (possibly transposed) CSV.
    """
    LOGGER.info("Loading expression matrix.")
    try:
        ex_mtx = load_exp_matrix(args.expression_mtx_fname.name,
                                 (args.transpose == 'yes'),
                                 False,  # sparse loading is disabled here for now
                                 args.cell_id_attribute,
                                 args.gene_attribute)
    except ValueError as e:
        LOGGER.error(e)
        sys.exit(1)

    LOGGER.info("Loading gene signatures.")
    try:
        signatures = load_signatures(args.signatures_fname.name)
    except ValueError as e:
        LOGGER.error(e)
        sys.exit(1)

    LOGGER.info("Calculating cellular enrichment.")
    auc_mtx = aucell(ex_mtx, signatures,
                     auc_threshold=args.auc_threshold,
                     noweights=(args.weights != 'yes'),
                     seed=args.seed,
                     num_workers=args.num_workers)

    LOGGER.info("Writing results to file.")
    extension = PurePath(args.output.name).suffixes
    if '.loom' in extension:
        try:
            # A loom output must start from the (loom) expression matrix so the
            # AUC values can be appended as extra column attributes.
            copyfile(args.expression_mtx_fname.name, args.output.name)
            append_auc_mtx(args.output.name, auc_mtx, signatures, args.seed, args.num_workers)
        except OSError:
            LOGGER.error("Expression matrix should be provided in the loom file format.")
            sys.exit(1)
    elif args.output.name == '<stdout>':
        transpose = (args.transpose == 'yes')
        (auc_mtx.T if transpose else auc_mtx).to_csv(args.output)
    else:
        save_matrix(auc_mtx, args.output.name, (args.transpose == 'yes'))


def add_recovery_parameters(parser):
    """Add the motif-enrichment (recovery curve) arguments to *parser*; returns it."""
    group = parser.add_argument_group('motif enrichment arguments')
    group.add_argument('--rank_threshold',
                       type=int, default=5000,
                       help='The rank threshold used for deriving the target genes of an enriched motif (default: 5000).')
    group.add_argument('--auc_threshold',
                       type=float, default=0.05,
                       help='The threshold used for calculating the AUC of a feature as fraction of ranked genes (default: 0.05).')
    group.add_argument('--nes_threshold',
                       type=float, default=3.0,
                       help='The Normalized Enrichment Score (NES) threshold for finding enriched features (default: 3.0).')
    return parser


def add_annotation_parameters(parser):
    """Add the motif-annotation arguments to *parser*; returns it."""
    group = parser.add_argument_group('motif annotation arguments')
    group.add_argument('--min_orthologous_identity',
                       type=float, default=0.0,
                       help='Minimum orthologous identity to use when annotating enriched motifs (default: 0.0).')
    group.add_argument('--max_similarity_fdr',
                       type=float, default=0.001,
                       help='Maximum FDR in motif similarity to use when annotating enriched motifs (default: 0.001).')
    group.add_argument('--annotations_fname',
                       type=argparse.FileType('r'),
                       help='The name of the file that contains the motif annotations to use.',
                       required=True)
    return parser


def add_module_parameters(parser):
    """Add the module-generation arguments to *parser*; returns it."""
    group = parser.add_argument_group('module generation arguments')
    group.add_argument('--thresholds',
                       type=float, nargs='+', default=[0.75, 0.90],
                       help='The first method to create the TF-modules based on the best targets for each transcription factor (default: 0.75 0.90).')
    group.add_argument('--top_n_targets',
                       type=int, nargs='+', default=[50],
                       help='The second method is to select the top targets for a given TF. (default: 50)')
    group.add_argument('--top_n_regulators',
                       type=int, nargs='+', default=[5, 10, 50],
                       help='The alternative way to create the TF-modules is to select the best regulators for each gene. (default: 5 10 50)')
    group.add_argument('--min_genes',
                       type=int, default=20,
                       help='The minimum number of genes in a module (default: 20).')
    group.add_argument('--expression_mtx_fname',
                       type=argparse.FileType('r'),
                       help='The name of the file that contains the expression matrix for the single cell experiment.'
                            ' Two file formats are supported: csv (rows=cells x columns=genes) or loom (rows=genes x columns=cells).'
                            ' (Only required if modules need to be generated)')
    group.add_argument('--mask_dropouts',
                       action='store_const', const=True, default=False,
                       help='If modules need to be generated, this controls whether cell dropouts (cells in which expression of either TF or target gene is 0) are masked when calculating the correlation between a TF-target pair.'
                            ' This affects which target genes are included in the initial modules, and the final pruned regulon (by default only positive regulons are kept (see --all_modules option)).'
                            ' The default value in pySCENIC 0.9.16 and previous versions was to mask dropouts when calculating the correlation; however, all cells are now kept by default, to match the R version.')
    return parser


def add_computation_parameters(parser):
    """Add the generic computation arguments (workers, scheduler address) to *parser*; returns it."""
    group = parser.add_argument_group('computation arguments')
    group.add_argument('--num_workers',
                       type=int, default=cpu_count(),
                       help='The number of workers to use. Only valid of using dask_multiprocessing, custom_multiprocessing or local as mode. (default: {}).'.format(cpu_count()))
    group.add_argument('--client_or_address',
                       type=str, default='local',
                       help='The client or the IP address of the dask scheduler to use.'
                            ' (Only required of dask_cluster is selected as mode)')
    return parser


def add_loom_parameters(parser):
    """Add the loom-file attribute arguments to *parser*; returns it."""
    group = parser.add_argument_group('loom file arguments')
    group.add_argument('--cell_id_attribute',
                       type=str, default=ATTRIBUTE_NAME_CELL_IDENTIFIER,
                       help='The name of the column attribute that specifies the identifiers of the cells in the loom file.')
    group.add_argument('--gene_attribute',
                       type=str, default=ATTRIBUTE_NAME_GENE,
                       help='The name of the row attribute that specifies the gene symbols in the loom file.')
    group.add_argument('--sparse',
                       action='store_const', const=True, default=False,
                       help='If set, load the expression data as a sparse matrix. Currently applies to the grn inference step only.')
    return parser


def create_argument_parser():
    """
    Build the top-level argument parser with the grn/ctx/aucell sub-commands.

    Each sub-parser's `func` default points at the command implementation so
    that `main` can simply dispatch on the parsed arguments.
    """
    parser = argparse.ArgumentParser(prog=os.path.splitext(os.path.basename(__file__))[0],
                                     description='Single-CEll regulatory Network Inference and Clustering ({})'.format(VERSION),
                                     fromfile_prefix_chars='@', add_help=True,
                                     epilog="Arguments can be read from file using a @args.txt construct. "
                                            "For more information on loom file format see http://loompy.org . "
                                            "For more information on gmt file format see https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats .")

    subparsers = parser.add_subparsers(help='sub-command help')

    # --------------------------------------------
    # create the parser for the "grn" command
    # --------------------------------------------
    parser_grn = subparsers.add_parser('grn',
                                       help='Derive co-expression modules from expression matrix.')
    parser_grn.add_argument('expression_mtx_fname',
                            type=argparse.FileType('r'),
                            help='The name of the file that contains the expression matrix for the single cell experiment.'
                                 ' Two file formats are supported: csv (rows=cells x columns=genes) or loom (rows=genes x columns=cells).')
    parser_grn.add_argument('tfs_fname',
                            type=argparse.FileType('r'),
                            help='The name of the file that contains the list of transcription factors (TXT; one TF per line).')
    parser_grn.add_argument('-o', '--output',
                            type=argparse.FileType('w'), default=sys.stdout,
                            help='Output file/stream, i.e. a table of TF-target genes (CSV).')
    parser_grn.add_argument('-t', '--transpose', action='store_const', const='yes',
                            help='Transpose the expression matrix (rows=genes x columns=cells).')
    parser_grn.add_argument('-m', '--method', choices=['genie3', 'grnboost2'],
                            default='grnboost2',
                            help='The algorithm for gene regulatory network reconstruction (default: grnboost2).')
    parser_grn.add_argument('--seed', type=int, required=False, default=None,
                            help='Seed value for regressor random state initialization. Applies to both GENIE3 and GRNBoost2. The default is to use a random seed.')
    add_computation_parameters(parser_grn)
    add_loom_parameters(parser_grn)
    parser_grn.set_defaults(func=find_adjacencies_command)

    # -----------------------------------------
    # create the parser for the "ctx" command
    # -----------------------------------------
    parser_ctx = subparsers.add_parser('ctx',
                                       help='Find enriched motifs for a gene signature and optionally prune targets from this signature based on cis-regulatory cues.')
    parser_ctx.add_argument('module_fname',
                            type=argparse.FileType('r'),
                            help='The name of the file that contains the signature or the co-expression modules. '
                                 'The following formats are supported: CSV or TSV (adjacencies), YAML, GMT and DAT (modules)')
    parser_ctx.add_argument('database_fname',
                            type=argparse.FileType('r'), nargs='+',
                            help='The name(s) of the regulatory feature databases. '
                                 'Two file formats are supported: feather or db (legacy).')
    parser_ctx.add_argument('-o', '--output',
                            type=argparse.FileType('w'), default=sys.stdout,
                            help='Output file/stream, i.e. a table of enriched motifs and target genes (csv, tsv)'
                                 ' or collection of regulons (yaml, gmt, dat, json).')
    parser_ctx.add_argument('-n', '--no_pruning', action='store_const', const='yes',
                            help='Do not perform pruning, i.e. find enriched motifs.')
    parser_ctx.add_argument('--chunk_size',
                            type=int, default=100,
                            help='The size of the module chunks assigned to a node in the dask graph (default: 100).')
    parser_ctx.add_argument('--mode',
                            choices=['custom_multiprocessing', 'dask_multiprocessing', 'dask_cluster'],
                            default='dask_multiprocessing',
                            help='The mode to be used for computing (default: dask_multiprocessing).')
    parser_ctx.add_argument('-a', '--all_modules', action='store_const', const='yes', default='no',
                            help='Included positive and negative regulons in the analysis (default: no, i.e. only positive).')
    parser_ctx.add_argument('-t', '--transpose', action='store_const', const='yes',
                            help='Transpose the expression matrix (rows=genes x columns=cells).')
    add_recovery_parameters(parser_ctx)
    add_annotation_parameters(parser_ctx)
    add_computation_parameters(parser_ctx)
    add_module_parameters(parser_ctx)
    add_loom_parameters(parser_ctx)
    parser_ctx.set_defaults(func=prune_targets_command)

    # --------------------------------------------
    # create the parser for the "aucell" command
    # -------------------------------------------
    parser_aucell = subparsers.add_parser('aucell',
                                          help='Quantify activity of gene signatures across single cells.')
    # Mandatory arguments
    parser_aucell.add_argument('expression_mtx_fname',
                               type=argparse.FileType('r'),
                               help='The name of the file that contains the expression matrix for the single cell experiment.'
                                    ' Two file formats are supported: csv (rows=cells x columns=genes) or loom (rows=genes x columns=cells).')
    parser_aucell.add_argument('signatures_fname',
                               type=argparse.FileType('r'),
                               help='The name of the file that contains the gene signatures.'
                                    ' Three file formats are supported: gmt, yaml or dat (pickle).')
    # Optional arguments
    parser_aucell.add_argument('-o', '--output',
                               type=argparse.FileType('w'), default=sys.stdout,
                               help='Output file/stream, a matrix of AUC values.'
                                    ' Two file formats are supported: csv or loom.'
                                    ' If loom file is specified the loom file while contain the original expression matrix and the'
                                    ' calculated AUC values as extra column attributes.')
    parser_aucell.add_argument('-t', '--transpose', action='store_const', const='yes',
                               help='Transpose the expression matrix if supplied as csv (rows=genes x columns=cells).')
    parser_aucell.add_argument('-w', '--weights', action='store_const', const='yes',
                               help='Use weights associated with genes in recovery analysis.'
                                    ' Is only relevant when gene signatures are supplied as json format.')
    parser_aucell.add_argument('--num_workers',
                               type=int, default=cpu_count(),
                               help='The number of workers to use (default: {}).'.format(cpu_count()))
    parser_aucell.add_argument('--seed', type=int, required=False, default=None,
                               help='Seed for the expression matrix ranking step. The default is to use a random seed.')
    add_recovery_parameters(parser_aucell)
    add_loom_parameters(parser_aucell)
    parser_aucell.set_defaults(func=aucell_command)

    return parser


def main(argv=None):
    """CLI entry point: parse arguments and dispatch to the chosen sub-command."""
    parser = create_argument_parser()
    args = parser.parse_args(args=argv)
    if not hasattr(args, 'func'):
        # No sub-command supplied: show usage instead of crashing on args.func.
        parser.print_help()
    else:
        args.func(args)


if __name__ == "__main__":
    main()