python source code of rnaseq_unc

#!/usr/bin/env python2.7
"""
UNC Best Practice RNA-Seq Pipeline
Author: John Vivian
Affiliation: UC Santa Cruz Genomics Institute

Please see the README.md in the same directory

Tree Structure of RNA-Seq Pipeline (per sample)

    0---> 2
    |     |
    1     3 - - - - -> Consolidate Output -> Upload_to_S3
         / \
       *4   5
            |
            6
           / \
         *7   8
             / \
           *9   10
                |
                11
                |
                12
                |
               *13

0 = Start Node
1 = Download Sample
2 = Unzip
3 = Mapsplice
4 = Mapping Stats (not currently included)
5 = Add Read Groups
6 = Bamsort and Index
7 = Rseq-QC
8 = Sort Bam by Reference
9 = Exon Quantification
10 = Transcriptome
11 = Filter
12 = RSEM
13 = RSEM Post-Process

7,9,13 contribute to producing the final output

Dependencies
Curl:       apt-get install curl
Docker:     apt-get install docker.io # docker.io if using linux, o.w. just docker
Samtools:   apt-get install samtools
Unzip:      apt-get install unzip
Toil:       pip install git+https://github.com/BD2KGenomics/toil.git

Optional
Boto:       pip install boto
"""
import argparse
import base64
import errno
import glob
import hashlib
import multiprocessing
import os
import shutil
import subprocess
import tarfile
from collections import OrderedDict
from contextlib import closing
from urlparse import urlparse

from toil.job import Job


def build_parser():
    parser = argparse.ArgumentParser(description=main.__doc__, add_help=True)
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--config', default=None, help='Path to config. One sample per line, with the format: '
                                                      'uuid,url_to_sample.tar')
    group.add_argument('--input', default=None, help='Accepts a local sample: /path/to/sample.tar. Take note! The'
                                                     'UUID for this sample is derived from the name. So samples'
                                                     'should be in the form of uuid.tar.')
    group.add_argument('-f', '--config_fastq', default=None,
                       help='Path to CSV. One sample per line with the format: '
                            'uuid,file:///path/to/R_1.fastq,file:///path/to/R_2.fastq')
    parser.add_argument('--single_end_reads', default=False, action='store_true',
                        help='Set this flag if input data is non-paired (single end reads).')
    parser.add_argument('--unc', help='URL to unc_hg19.bed',
                        default='https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/rna-seq/unc_hg19.bed')
    parser.add_argument('--fasta', help='URL to hg19_M_rCRS_ref.transcripts.fa',
                        default='https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/rna-seq/hg19_M_rCRS_ref.transcripts.fa')
    parser.add_argument('--composite_exons', help='URL to composite_exons.bed',
                        default='https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/rna-seq/composite_exons.bed')
    parser.add_argument('--normalize', help='URL to normalizeBedToolsExonQuant.pl',
                        default='https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/rna-seq/normalizeBedToolsExonQuant.pl')
    parser.add_argument('--rsem_ref', help='RSEM_REF URL',
                        default='https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/rna-seq/rsem_ref.zip')
    parser.add_argument('--chromosomes', help='Chromosomes Directory',
                        default='https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/rna-seq/chromosomes.zip')
    parser.add_argument('--ebwt', help='EBWT Directory',
                        default='https://s3-us-west-2.amazonaws.com/cgl-pipeline-inputs/rna-seq/ebwt.zip')
    parser.add_argument('--ssec', help='Path to Key File for SSE-C Encryption')
    parser.add_argument('--output_dir', default=None, help='full path where final results will be output')
    parser.add_argument('--upload_bam_to_s3', default=False, action='store_true',
                        help='uploads alignment bam to S3 directory specified.')
    parser.add_argument('--s3_dir', default=None, help='S3 Directory, starting with bucket name. e.g.: '
                                                       'cgl-driver-projects/ckcc/rna-seq-samples/')
    parser.add_argument('--sudo', dest='sudo', action='store_true', default=False,
                        help='Docker usually needs sudo to execute locally, but not when running Mesos or when '
                             'the user is a member of a Docker group.')
    return parser


# Convenience functions used in the pipeline
def mkdir_p(path):
    """
    It is Easier to Ask for Forgiveness than Permission
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def flatten(x):
    """
    Flattens a nested array into a single list

    x: list/tuple       The nested list/tuple to be flattened.
    """
    result = []
    for el in x:
        if hasattr(el, "__iter__") and not isinstance(el, basestring):
            result.extend(flatten(el))
        else:
            result.append(el)
    return result


def which(program):
    import os

    def is_exe(f):
        return os.path.isfile(f) and os.access(f, os.X_OK)

    fpath, fname = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            path = path.strip('"')
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    return None


def generate_unique_key(master_key_path, url):
    """
    master_key_path: str    Path to the BD2K Master Key (for S3 Encryption)
    url: str                S3 URL (e.g. https://s3-us-west-2.amazonaws.com/bucket/file.txt)

    Returns: str            32-byte unique key generated for that URL
    """
    with open(master_key_path, 'r') as f:
        master_key = f.read()
    assert len(master_key) == 32, 'Invalid Key! Must be 32 characters. ' \
                                  'Key: {}, Length: {}'.format(master_key, len(master_key))
    new_key = hashlib.sha256(master_key + url).digest()
    assert len(new_key) == 32, 'New key is invalid and is not 32 characters: {}'.format(new_key)
    return new_key


def download_encrypted_file(job, input_args, name):
    """
    Downloads encrypted files from S3 via header injection

    input_args: dict    Input dictionary defined in main()
    name: str           Symbolic name associated with file
    """
    work_dir = job.fileStore.getLocalTempDir()
    key_path = input_args['ssec']
    file_path = os.path.join(work_dir, name)
    url = input_args[name]

    with open(key_path, 'r') as f:
        key = f.read()
    if len(key) != 32:
        raise RuntimeError('Invalid Key! Must be 32 bytes: {}'.format(key))

    key = generate_unique_key(key_path, url)

    encoded_key = base64.b64encode(key)
    encoded_key_md5 = base64.b64encode(hashlib.md5(key).digest())
    h1 = 'x-amz-server-side-encryption-customer-algorithm:AES256'
    h2 = 'x-amz-server-side-encryption-customer-key:{}'.format(encoded_key)
    h3 = 'x-amz-server-side-encryption-customer-key-md5:{}'.format(encoded_key_md5)
    try:
        subprocess.check_call(['curl', '-fs', '--retry', '5', '-H', h1, '-H', h2, '-H', h3, url, '-o', file_path])
    except OSError:
        raise RuntimeError('Failed to find "curl". Install via "apt-get install curl"')
    assert os.path.exists(file_path)
    return job.fileStore.writeGlobalFile(file_path)


def download_from_url(job, url):
    """
    Simple curl request made for a given url

    url: str    URL to download
    """
    work_dir = job.fileStore.getLocalTempDir()
    file_path = os.path.join(work_dir, os.path.basename(url))
    if not os.path.exists(file_path):
        if url.startswith('s3:'):
            download_from_s3_url(file_path, url)
        else:
            try:
                subprocess.check_call(['curl', '-fs', '--retry', '5', '--create-dir', url, '-o', file_path])
            except OSError:
                raise RuntimeError('Failed to find "curl". Install via "apt-get install curl"')
    assert os.path.exists(file_path)
    return job.fileStore.writeGlobalFile(file_path)


def return_input_paths(job, work_dir, ids, *args):
    """
    Given one or more strings representing file_names, return the paths to those files. Each item must be unpacked!

    work_dir: str       Current working directory
    ids: dict           Dictionary of fileStore IDs
    *args: str(s)       for every file in *args, place file in work_dir via FileStore
    """
    paths = OrderedDict()
    for name in args:
        if not os.path.exists(os.path.join(work_dir, name)):
            file_path = job.fileStore.readGlobalFile(ids[name], os.path.join(work_dir, name))
        else:
            file_path = os.path.join(work_dir, name)
        paths[name] = file_path
        if len(args) == 1:
            return file_path

    return paths.values()


def docker_path(filepath):
    """
    Given a path, returns that files path inside the docker mount directory (/data).
    """
    return os.path.join('/data', os.path.basename(filepath))


def docker_call(work_dir, tool_parameters, tool, java_opts=None, outfile=None, sudo=False):
    """
    Makes subprocess call of a command to a docker container.


    tool_parameters: list   An array of the parameters to be passed to the tool
    tool: str               Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools)
    java_opts: str          Optional commands to pass to a java jar execution. (e.g. '-Xmx15G')
    outfile: file           Filehandle that stderr will be passed to
    sudo: bool              If the user wants the docker command executed as sudo
    """
    base_docker_call = 'docker run --log-driver=none --rm -v {}:/data'.format(work_dir).split()
    if sudo:
        base_docker_call = ['sudo'] + base_docker_call
    if java_opts:
        base_docker_call = base_docker_call + ['-e', 'JAVA_OPTS={}'.format(java_opts)]
    try:
        if outfile:
            subprocess.check_call(base_docker_call + [tool] + tool_parameters, stdout=outfile)
        else:
            subprocess.check_call(base_docker_call + [tool] + tool_parameters)
    except subprocess.CalledProcessError:
        raise RuntimeError('docker command returned a non-zero exit status. Check error logs.')
    except OSError:
        raise RuntimeError('docker not found on system. Install on all nodes.')


def copy_to_output_dir(work_dir, output_dir, uuid=None, files=list()):
    """
    A list of files to move from work_dir to output_dir.

    work_dir: str       Current working directory
    output_dir: str     Output directory for files to go
    uuid: str           UUID to "stamp" onto output files
    files: list         List of files to iterate through
    """
    for fname in files:
        if uuid is None:
            shutil.copy(os.path.join(work_dir, fname), os.path.join(output_dir, fname))
        else:
            shutil.copy(os.path.join(work_dir, fname), os.path.join(output_dir, '{}.{}'.format(uuid, fname)))


def tarball_files(work_dir, tar_name, uuid=None, files=None):
    """
    Tars a group of files together into a tarball

    work_dir: str       Current Working Directory
    tar_name: str       Name of tarball
    uuid: str           UUID to stamp files with
    files: str(s)       List of filenames to place in the tarball from working directory
    """
    with tarfile.open(os.path.join(work_dir, tar_name), 'w:gz') as f_out:
        for fname in files:
            if uuid:
                f_out.add(os.path.join(work_dir, fname), arcname=uuid + '.' + fname)
            else:
                f_out.add(os.path.join(work_dir, fname), arcname=fname)


def download_from_s3_url(file_path, url):
    from urlparse import urlparse
    from boto.s3.connection import S3Connection
    s3 = S3Connection()
    try:
        parsed_url = urlparse(url)
        if not parsed_url.netloc or not parsed_url.path.startswith('/'):
            raise RuntimeError("An S3 URL must be of the form s3:/BUCKET/ or "
                               "s3://BUCKET/KEY. '%s' is not." % url)
        bucket = s3.get_bucket(parsed_url.netloc)
        key = bucket.get_key(parsed_url.path[1:])
        key.get_contents_to_filename(file_path)
    finally:
        s3.close()


# Job Functions
def program_checks(job, input_args):
    """
    Checks that dependency programs are installed.

    input_args: dict        Dictionary of input arguments (from main())
    """
    # Program checks
    for program in ['curl', 'docker', 'unzip', 'samtools']:
        assert which(program), 'Program "{}" must be installed on every node.'.format(program)
    job.addChildJobFn(download_shared_files, input_args)


def download_shared_files(job, input_args):
    """
    Downloads and stores shared inputs files in the FileStore

    input_args: dict        Dictionary of input arguments (from main())
    """
    shared_files = ['unc.bed', 'hg19.transcripts.fa', 'composite_exons.bed', 'normalize.pl', 'rsem_ref.zip',
                    'ebwt.zip', 'chromosomes.zip']
    shared_ids = {}
    for f in shared_files:
        shared_ids[f] = job.addChildJobFn(download_from_url, input_args[f]).rv()
    if input_args['config'] or input_args['config_fastq']:
        job.addFollowOnJobFn(parse_config_file, shared_ids, input_args)
    else:
        sample_path = input_args['input']
        uuid = os.path.splitext(os.path.basename(sample_path))[0]
        sample = (uuid, sample_path)
        job.addFollowOnJobFn(download_sample, shared_ids, input_args, sample)


def parse_config_file(job, ids, input_args):
    """
    Launches pipeline for each sample.

    shared_ids: dict        Dictionary of fileStore IDs
    input_args: dict        Dictionary of input arguments
    """
    samples = []
    config = input_args['config']
    with open(config, 'r') as f:
        for line in f.readlines():
            if not line.isspace():
                sample = line.strip().split(',')
                samples.append(sample)
    for sample in samples:
        job.addChildJobFn(download_sample, ids, input_args, sample)


def download_sample(job, ids, input_args, sample):
    """
    Defines variables unique to a sample that are used in the rest of the pipelines

    ids: dict           Dictionary of fileStore IDS
    input_args: dict    Dictionary of input arguments
    sample: tuple       Contains uuid and sample_url
    """
    if len(sample) == 2:
        uuid, sample_location = sample
        url1, url2 = None, None
    else:
        uuid, url1, url2 = sample
        sample_location = None
    # Update values unique to sample
    sample_input = dict(input_args)
    sample_input['uuid'] = uuid
    sample_input['sample.tar'] = sample_location
    if sample_input['output_dir']:
        sample_input['output_dir'] = os.path.join(input_args['output_dir'], uuid)
    sample_input['cpu_count'] = multiprocessing.cpu_count()
    job_vars = (sample_input, ids)
    # Download or locate local file and place in the jobStore
    if sample_input['input']:
        ids['sample.tar'] = job.fileStore.writeGlobalFile(os.path.abspath(sample_location))
    elif sample_input['config_fastq']:
        ids['R1.fastq'] = job.fileStore.writeGlobalFile(urlparse(url1).path)
        ids['R2.fastq'] = job.fileStore.writeGlobalFile(urlparse(url2).path)
    else:
        if sample_input['ssec']:
            ids['sample.tar'] = job.addChildJobFn(download_encrypted_file, sample_input, 'sample.tar', disk='25G').rv()
        else:
            ids['sample.tar'] = job.addChildJobFn(download_from_url, sample_input['sample.tar'], disk='25G').rv()
    job.addFollowOnJobFn(static_dag_launchpoint, job_vars)


def static_dag_launchpoint(job, job_vars):
    """
    Statically define jobs in the pipeline

    job_vars: tuple     Tuple of dictionaries: input_args and ids
    """
    input_args, ids = job_vars
    if input_args['config_fastq']:
        cores = input_args['cpu_count']
        a = job.wrapJobFn(mapsplice, job_vars, cores=cores, disk='130G').encapsulate()
    else:
        a = job.wrapJobFn(merge_fastqs, job_vars, disk='70 G').encapsulate()
    b = job.wrapJobFn(consolidate_output, job_vars, a.rv())
    # Take advantage of "encapsulate" to simplify pipeline wiring
    job.addChild(a)
    a.addChild(b)


def merge_fastqs(job, job_vars):
    """
    Unzips input sample and concats the Read1 and Read2 groups together.

    job_vars: tuple     Tuple of dictionaries: input_args and ids
    """
    input_args, ids = job_vars
    work_dir = job.fileStore.getLocalTempDir()
    cores = input_args['cpu_count']
    single_end_reads = input_args['single_end_reads']
    # I/O
    sample = return_input_paths(job, work_dir, ids, 'sample.tar')
    # Untar File
    # subprocess.check_call(['unzip', sample, '-d', work_dir])
    subprocess.check_call(['tar', '-xvf', sample, '-C', work_dir])
    # Remove large files before creating concat versions.
    os.remove(os.path.join(work_dir, 'sample.tar'))
    # Zcat files in parallel
    if single_end_reads:
        files = sorted(glob.glob(os.path.join(work_dir, '*')))
        with open(os.path.join(work_dir, 'R1.fastq'), 'w') as f1:
            subprocess.check_call(['zcat'] + files, stdout=f1)
        # FileStore
        ids['R1.fastq'] = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1.fastq'))
    else:
        r1_files = sorted(glob.glob(os.path.join(work_dir, '*R1*')))
        r2_files = sorted(glob.glob(os.path.join(work_dir, '*R2*')))
        with open(os.path.join(work_dir, 'R1.fastq'), 'w') as f1:
            p1 = subprocess.Popen(['zcat'] + r1_files, stdout=f1)
        with open(os.path.join(work_dir, 'R2.fastq'), 'w') as f2:
            p2 = subprocess.Popen(['zcat'] + r2_files, stdout=f2)
        p1.wait()
        p2.wait()
        # FileStore
        ids['R1.fastq'] = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R1.fastq'))
        ids['R2.fastq'] = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'R2.fastq'))
    job.fileStore.deleteGlobalFile(ids['sample.tar'])
    # Spawn child job
    return job.addChildJobFn(mapsplice, job_vars, cores=cores, disk='130 G').rv()


def mapsplice(job, job_vars):
    """
    Maps RNA-Seq reads to a reference genome.

    job_vars: tuple     Tuple of dictionaries: input_args and ids
    """
    # Unpack variables
    input_args, ids = job_vars
    work_dir = job.fileStore.getLocalTempDir()
    cores = input_args['cpu_count']
    sudo = input_args['sudo']
    single_end_reads = input_args['single_end_reads']
    files_to_delete = ['R1.fastq']
    # I/O
    return_input_paths(job, work_dir, ids, 'ebwt.zip', 'chromosomes.zip')
    if single_end_reads:
        return_input_paths(job, work_dir, ids, 'R1.fastq')
    else:
        return_input_paths(job, work_dir, ids, 'R1.fastq', 'R2.fastq')
        files_to_delete.extend(['R2.fastq'])
    for fname in ['chromosomes.zip', 'ebwt.zip']:
        subprocess.check_call(['unzip', '-o', os.path.join(work_dir, fname), '-d', work_dir])
    # Command and call
    parameters = ['-p', str(cores),
                  '-s', '25',
                  '--bam',
                  '--min-map-len', '50',
                  '-x', '/data/ebwt',
                  '-c', '/data/chromosomes',
                  '-1', '/data/R1.fastq',
                  '-o', '/data']
    if not single_end_reads:
        parameters.extend(['-2', '/data/R2.fastq'])
    docker_call(tool='quay.io/ucsc_cgl/mapsplice:2.1.8--dd5ac549b95eb3e5d166a5e310417ef13651994e',
                tool_parameters=parameters, work_dir=work_dir, sudo=sudo)
    # Write to FileStore
    for fname in ['alignments.bam', 'stats.txt']:
        ids[fname] = job.fileStore.writeGlobalFile(os.path.join(work_dir, fname))
    for fname in files_to_delete:
        job.fileStore.deleteGlobalFile(ids[fname])
    # Run child job
    # map_id = job.addChildJobFn(mapping_stats, job_vars).rv()
    if input_args['upload_bam_to_s3'] and input_args['s3_dir']:
        job.addChildJobFn(upload_bam_to_s3, job_vars)
    output_ids = job.addChildJobFn(add_read_groups, job_vars, disk='30 G').rv()
    return output_ids


def mapping_stats(job, job_vars):
    """
    This function is not currently in use.

    job_vars: tuple     Tuple of dictionaries: input_args and ids
    """
    # Unpack variables
    input_args, ids = job_vars
    work_dir = job.fileStore.getLocalTempDir()
    sudo = input_args['sudo']
    # I/O
    return_input_paths(job, work_dir, ids, 'stats.txt')
    uuid = input_args['uuid']
    # Command
    docker_call(tool='jvivian/mapping_stats', tool_parameters=[uuid], work_dir=work_dir, sudo=sudo)
    # Zip output files and store
    output_files = ['{}_stats2.txt'.format(uuid), '{}_stats_all.txt'.format(uuid), '{}_mapping.tab'.format(uuid)]
    tarball_files(work_dir, tar_name='map.tar.gz', files=output_files)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'map.tar.gz'))


def add_read_groups(job, job_vars):
    """
    This function adds read groups to the headers

    job_vars: tuple     Tuple of dictionaries: input_args and ids
    """
    input_args, ids = job_vars
    work_dir = job.fileStore.getLocalTempDir()
    sudo = input_args['sudo']
    # I/O
    alignments = return_input_paths(job, work_dir, ids, 'alignments.bam')
    output = os.path.join(work_dir, 'rg_alignments.bam')
    # Command and callg
    parameter = ['AddOrReplaceReadGroups',
                 'INPUT={}'.format(docker_path(alignments)),
                 'OUTPUT={}'.format(docker_path(output)),
                 'RGSM={}'.format(input_args['uuid']),
                 'RGID={}'.format(input_args['uuid']),
                 'RGLB=TruSeq',
                 'RGPL=illumina',
                 'RGPU=barcode',
                 'VALIDATION_STRINGENCY=SILENT']
    docker_call(tool='quay.io/ucsc_cgl/picardtools:1.95--dd5ac549b95eb3e5d166a5e310417ef13651994e',
                tool_parameters=parameter, work_dir=work_dir, sudo=sudo)
    # Write to FileStore
    ids['rg_alignments.bam'] = job.fileStore.writeGlobalFile(output)
    # Run child job
    return job.addChildJobFn(bamsort_and_index, job_vars, disk='30 G').rv()


def bamsort_and_index(job, job_vars):
    """
    Sorts bam file and produces index file

    job_vars: tuple     Tuple of dictionaries: input_args and ids
    """
    # Unpack variables
    input_args, ids = job_vars
    work_dir = job.fileStore.getLocalTempDir()
    sudo = input_args['sudo']
    # I/O
    rg_alignments = return_input_paths(job, work_dir, ids, 'rg_alignments.bam')
    output = os.path.join(work_dir, 'sorted.bam')
    # Command -- second argument is "Output Prefix"
    cmd1 = ['sort', docker_path(rg_alignments), docker_path('sorted')]
    cmd2 = ['index', docker_path(output)]
    docker_call(tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e',
                tool_parameters=cmd1, work_dir=work_dir, sudo=sudo)
    docker_call(tool='quay.io/ucsc_cgl/samtools:0.1.19--dd5ac549b95eb3e5d166a5e310417ef13651994e',
                tool_parameters=cmd2, work_dir=work_dir, sudo=sudo)
    # Write to FileStore
    ids['sorted.bam'] = job.fileStore.writeGlobalFile(output)
    ids['sorted.bam.bai'] = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'sorted.bam.bai'))
    # Run child job
    output_ids = job.addChildJobFn(sort_bam_by_reference, job_vars, disk='50 G').rv()
    rseq_id = job.addChildJobFn(rseq_qc, job_vars, disk='20 G').rv()
    return rseq_id, output_ids


def rseq_qc(job, job_vars):
    """
    QC module: contains QC metrics and information about the BAM post alignment

    job_vars: tuple     Tuple of dictionaries: input_args and ids
    """
    input_args, ids = job_vars
    work_dir = job.fileStore.getLocalTempDir()
    uuid = input_args['uuid']
    sudo = input_args['sudo']
    # I/O
    return_input_paths(job, work_dir, ids, 'sorted.bam', 'sorted.bam.bai')
    # Command
    docker_call(tool='jvivian/qc', tool_parameters=['/opt/cgl-docker-lib/RseqQC_v2.sh', '/data/sorted.bam', uuid],
                work_dir=work_dir, sudo=sudo)
    # Write to FileStore
    output_files = [f for f in glob.glob(os.path.join(work_dir, '*')) if 'sorted.bam' not in f]
    tarball_files(work_dir, tar_name='qc.tar.gz', uuid=None, files=output_files)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'qc.tar.gz'))


def sort_bam_by_reference(job, job_vars):
    """
    Sorts the bam by reference

    job_vars: tuple     Tuple of dictionaries: input_args and ids
    """
    # Unpack variables
    input_args, ids = job_vars
    work_dir = job.fileStore.getLocalTempDir()
    # I/O
    sorted_bam, sorted_bai = return_input_paths(job, work_dir, ids, 'sorted.bam', 'sorted.bam.bai')
    output = os.path.join(work_dir, 'sort_by_ref.bam')
    # Call: Samtools
    ref_seqs = []
    handle = subprocess.Popen(["samtools", "view", "-H", sorted_bam], stdout=subprocess.PIPE).stdout
    for line in handle:
        if line.startswith("@SQ"):
            tmp = line.split("\t")
            chrom = tmp[1].split(":")[1]
            ref_seqs.append(chrom)
    handle.close()
    # Iterate through chromosomes to create mini-bams
    for chrom in ref_seqs:
        # job.addChildJobFn(sbbr_child, chrom, os.path.join(work_dir, chrom), sorted_bam)
        cmd_view = ["samtools", "view", "-b", sorted_bam, chrom]
        cmd_sort = ["samtools", "sort", "-m", "3000000000", "-n", "-", os.path.join(work_dir, chrom)]
        p1 = subprocess.Popen(cmd_view, stdout=subprocess.PIPE)
        subprocess.check_call(cmd_sort, stdin=p1.stdout)
    sorted_files = [os.path.join(work_dir, chrom) + '.bam' for chrom in ref_seqs]
    cmd = ["samtools", "cat", "-o", output] + sorted_files
    subprocess.check_call(cmd)
    # Write to FileStore
    ids['sort_by_ref.bam'] = job.fileStore.writeGlobalFile(output)
    rsem_id = job.addChildJobFn(transcriptome, job_vars, disk='30 G', memory='30 G').rv()
    exon_id = job.addChildJobFn(exon_count, job_vars, disk='30 G').rv()
    return exon_id, rsem_id


def exon_count(job, job_vars):
    """
    Produces exon counts

    job_vars: tuple     Tuple of dictionaries: input_args and ids
    """
    input_args, ids = job_vars
    work_dir = job.fileStore.getLocalTempDir()
    uuid = input_args['uuid']
    sudo = input_args['sudo']
    # I/O
    sort_by_ref, normalize_pl, composite_bed = return_input_paths(job, work_dir, ids, 'sort_by_ref.bam',
                                                                  'normalize.pl', 'composite_exons.bed')
    # Command
    tool = 'jvivian/bedtools'
    cmd_1 = ['coverage',
             '-split',
             '-abam', docker_path(sort_by_ref),
             '-b', docker_path(composite_bed)]
    cmd_2 = ['perl',
             os.path.join(work_dir, 'normalize.pl'),
             sort_by_ref,
             composite_bed]

    popen_docker = ['docker', 'run', '-v', '{}:/data'.format(work_dir), tool]
    if sudo:
        popen_docker = ['sudo'] + popen_docker
    p = subprocess.Popen(popen_docker + cmd_1, stdout=subprocess.PIPE)
    with open(os.path.join(work_dir, 'exon_quant'), 'w') as f:
        subprocess.check_call(cmd_2, stdin=p.stdout, stdout=f)

    p1 = subprocess.Popen(['cat', os.path.join(work_dir, 'exon_quant')], stdout=subprocess.PIPE)
    p2 = subprocess.Popen(['tr', '":"', '"\t"'], stdin=p1.stdout, stdout=subprocess.PIPE)
    p3 = subprocess.Popen(['tr', '"-"', '"\t"'], stdin=p2.stdout, stdout=subprocess.PIPE)
    with open(os.path.join(work_dir, 'exon_quant.bed'), 'w') as f:
        subprocess.check_call(['cut', '-f1-4'], stdin=p3.stdout, stdout=f)
    # Create zip, upload to fileStore, and move to output_dir as a backup
    output_files = ['exon_quant.bed', 'exon_quant']
    tarball_files(work_dir, tar_name='exon.tar.gz', uuid=uuid, files=output_files)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'exon.tar.gz'))


def transcriptome(job, job_vars):
    """
    Creates a bam of just the transcriptome

    job_vars: tuple     Tuple of dictionaries: input_args and ids
    """
    input_args, ids = job_vars
    work_dir = job.fileStore.getLocalTempDir()
    sudo = input_args['sudo']
    # I/O
    sort_by_ref, bed, hg19_fa = return_input_paths(job, work_dir, ids, 'sort_by_ref.bam',
                                                   'unc.bed', 'hg19.transcripts.fa')
    output = os.path.join(work_dir, 'transcriptome.bam')
    # Command
    parameters = ['sam-xlate',
                  '--bed', docker_path(bed),
                  '--in', docker_path(sort_by_ref),
                  '--order', docker_path(hg19_fa),
                  '--out', docker_path(output),
                  '--xgtag',
                  '--reverse']
    docker_call(tool='quay.io/ucsc_cgl/ubu:1.2--02806964cdf74bf5c39411b236b4c4e36d026843',
                tool_parameters=parameters, work_dir=work_dir, java_opts='-Xmx30g', sudo=sudo)
    # Write to FileStore
    ids['transcriptome.bam'] = job.fileStore.writeGlobalFile(output)
    # Run child job
    return job.addChildJobFn(filter_bam, job_vars, memory='30G', disk='30G').rv()


def filter_bam(job, job_vars):
    """
    Performs filtering on the transcriptome bam

    job_vars: tuple     Tuple of dictionaries: input_args and ids
    """
    input_args, ids = job_vars
    work_dir = job.fileStore.getLocalTempDir()
    cores = input_args['cpu_count']
    sudo = input_args['sudo']
    # I/O
    transcriptome_bam = return_input_paths(job, work_dir, ids, 'transcriptome.bam')
    output = os.path.join(work_dir, 'filtered.bam')
    # Command
    parameters = ['sam-filter',
                  '--strip-indels',
                  '--max-insert', '1000',
                  '--mapq', '1',
                  '--in', docker_path(transcriptome_bam),
                  '--out', docker_path(output)]
    docker_call(tool='quay.io/ucsc_cgl/ubu:1.2--02806964cdf74bf5c39411b236b4c4e36d026843',
                tool_parameters=parameters, work_dir=os.path.dirname(output), java_opts='-Xmx30g', sudo=sudo)
    # Write to FileStore
    ids['filtered.bam'] = job.fileStore.writeGlobalFile(output)
    # Run child job
    return job.addChildJobFn(rsem, job_vars, cores=cores, disk='30 G').rv()


def rsem(job, job_vars):
    """
    Runs RSEM to produce counts

    job_vars: tuple     Tuple of dictionaries: input_args and ids
    """
    input_args, ids = job_vars
    work_dir = job.fileStore.getLocalTempDir()
    cpus = input_args['cpu_count']
    sudo = input_args['sudo']
    single_end_reads = input_args['single_end_reads']
    # I/O
    filtered_bam, rsem_ref = return_input_paths(job, work_dir, ids, 'filtered.bam', 'rsem_ref.zip')
    subprocess.check_call(['unzip', '-o', os.path.join(work_dir, 'rsem_ref.zip'), '-d', work_dir])
    output_prefix = 'rsem'
    # Make tool call to Docker
    parameters = ['--quiet',
                  '--no-qualities',
                  '-p', str(cpus),
                  '--forward-prob', '0.5',
                  '--seed-length', '25',
                  '--fragment-length-mean', '-1.0',
                  '--bam', docker_path(filtered_bam)]
    if not single_end_reads:
        parameters.extend(['--paired-end'])
    parameters.extend(['/data/rsem_ref/hg19_M_rCRS_ref', output_prefix])

    docker_call(tool='quay.io/ucsc_cgl/rsem:1.2.25--4e8d1b31d4028f464b3409c6558fb9dfcad73f88',
                tool_parameters=parameters, work_dir=work_dir, sudo=sudo)
    os.rename(os.path.join(work_dir, output_prefix + '.genes.results'), os.path.join(work_dir, 'rsem_gene.tab'))
    os.rename(os.path.join(work_dir, output_prefix + '.isoforms.results'), os.path.join(work_dir, 'rsem_isoform.tab'))
    # Write to FileStore
    ids['rsem_gene.tab'] = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_gene.tab'))
    ids['rsem_isoform.tab'] = job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem_isoform.tab'))
    # Run child jobs
    return job.addChildJobFn(rsem_postprocess, job_vars).rv()


def rsem_postprocess(job, job_vars):
    """
    Parses RSEMs output to produce the separate .tab files (TPM, FPKM, counts) for both gene and isoform

    job_vars: tuple     Tuple of dictionaries: input_args and ids
    """
    input_args, ids = job_vars
    work_dir = job.fileStore.getLocalTempDir()
    uuid = input_args['uuid']
    sudo = input_args['sudo']
    # I/O
    return_input_paths(job, work_dir, ids, 'rsem_gene.tab', 'rsem_isoform.tab')
    # Command
    sample = input_args['uuid']
    docker_call(tool='jvivian/rsem_postprocess', tool_parameters=[sample], work_dir=work_dir, sudo=sudo)
    # Tar output files together and store in fileStore
    output_files = ['rsem.genes.norm_counts.tab', 'rsem.genes.raw_counts.tab', 'rsem.genes.norm_fpkm.tab',
                    'rsem.genes.norm_tpm.tab', 'rsem.isoform.norm_counts.tab', 'rsem.isoform.raw_counts.tab',
                    'rsem.isoform.norm_fpkm.tab', 'rsem.isoform.norm_tpm.tab']
    tarball_files(work_dir, tar_name='rsem.tar.gz', uuid=uuid, files=output_files)
    return job.fileStore.writeGlobalFile(os.path.join(work_dir, 'rsem.tar.gz'))


def consolidate_output(job, job_vars, output_ids):
    """
    Combine the contents of separate zipped outputs into one via streaming

    job_vars: tuple     Tuple of dictionaries: input_args and ids
    output_ids: tuple   Nested tuple of all the output fileStore IDs
    """
    input_args, ids = job_vars
    work_dir = job.fileStore.getLocalTempDir()
    uuid = input_args['uuid']
    # Retrieve IDs
    rseq_id, exon_id, rsem_id = flatten(output_ids)
    # Retrieve output file paths to consolidate
    # map_tar = job.fileStore.readGlobalFile(map_id, os.path.join(work_dir, 'map.tar.gz'))
    qc_tar = job.fileStore.readGlobalFile(rseq_id, os.path.join(work_dir, 'qc.tar.gz'))
    exon_tar = job.fileStore.readGlobalFile(exon_id, os.path.join(work_dir, 'exon.tar.gz'))
    rsem_tar = job.fileStore.readGlobalFile(rsem_id, os.path.join(work_dir, 'rsem.tar.gz'))
    # I/O
    out_tar = os.path.join(work_dir, uuid + '.tar.gz')
    # Consolidate separate tarballs
    with tarfile.open(os.path.join(work_dir, out_tar), 'w:gz') as f_out:
        for tar in [rsem_tar, exon_tar, qc_tar]:
            with tarfile.open(tar, 'r') as f_in:
                for tarinfo in f_in:
                    with closing(f_in.extractfile(tarinfo)) as f_in_file:
                        if tar == qc_tar:
                            tarinfo.name = os.path.join(uuid, 'rseq_qc', os.path.basename(tarinfo.name))
                        else:
                            tarinfo.name = os.path.join(uuid, os.path.basename(tarinfo.name))
                        f_out.addfile(tarinfo, fileobj=f_in_file)
    # Move to output directory of selected
    if input_args['output_dir']:
        output_dir = input_args['output_dir']
        mkdir_p(output_dir)
        copy_to_output_dir(work_dir, output_dir, uuid=None, files=[uuid + '.tar.gz'])
    # Write output file to fileStore
    ids['uuid.tar.gz'] = job.fileStore.writeGlobalFile(out_tar)
    # If S3 bucket argument specified, upload to S3
    if input_args['s3_dir']:
        job.addChildJobFn(upload_output_to_s3, job_vars)


def upload_output_to_s3(job, job_vars):
    """
    If s3_dir is specified in arguments, file will be uploaded to S3 using boto.
    WARNING: ~/.boto credentials are necessary for this to succeed!

    job_vars: tuple     Tuple of dictionaries: input_args and ids
    """
    import boto
    from boto.s3.key import Key

    input_args, ids = job_vars
    work_dir = job.fileStore.getLocalTempDir()
    uuid = input_args['uuid']
    # Parse s3_dir
    s3_dir = input_args['s3_dir']
    bucket_name = s3_dir.split('/')[0]
    bucket_dir = '/'.join(s3_dir.split('/')[1:])
    # I/O
    uuid_tar = return_input_paths(job, work_dir, ids, 'uuid.tar.gz')
    # Upload to S3 via boto
    conn = boto.connect_s3()
    bucket = conn.get_bucket(bucket_name)
    k = Key(bucket)
    k.key = os.path.join(bucket_dir, uuid + '.tar.gz')
    k.set_contents_from_filename(uuid_tar)


def upload_bam_to_s3(job, job_vars):
    """
    Upload bam to S3. Requires S3AM and a ~/.boto config file.
    """
    input_args, ids = job_vars
    work_dir = job.fileStore.getLocalTempDir()
    uuid = input_args['uuid']
    # I/O
    job.fileStore.readGlobalFile(ids['alignments.bam'], os.path.join(work_dir, 'alignments.bam'))
    bam_path = os.path.join(work_dir, 'alignments.bam')
    sample_name = uuid + '.bam'
    # Parse s3_dir to get bucket and s3 path
    s3_dir = input_args['s3_dir']
    bucket_name = s3_dir.split('/')[0]
    bucket_dir = os.path.join('/'.join(s3_dir.split('/')[1:]), 'bam_files')
    # Upload to S3 via S3AM
    s3am_command = ['s3am',
                    'upload',
                    'file://{}'.format(bam_path),
                    os.path.join('s3://', bucket_name, bucket_dir, sample_name)]
    subprocess.check_call(s3am_command)


def main():
    """
    This is a Toil pipeline for the UNC best practice RNA-Seq analysis.
    RNA-seq fastqs are combined, aligned, sorted, filtered, and quantified.

    Please read the README.md located in the same directory.
    """
    # Define Parser object and add to toil
    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()
    # Store inputs from argparse
    inputs = {'config': args.config,
              'config_fastq': args.config_fastq,
              'input': args.input,
              'unc.bed': args.unc,
              'hg19.transcripts.fa': args.fasta,
              'composite_exons.bed': args.composite_exons,
              'normalize.pl': args.normalize,
              'output_dir': args.output_dir,
              'rsem_ref.zip': args.rsem_ref,
              'chromosomes.zip': args.chromosomes,
              'ebwt.zip': args.ebwt,
              'ssec': args.ssec,
              's3_dir': args.s3_dir,
              'sudo': args.sudo,
              'single_end_reads': args.single_end_reads,
              'upload_bam_to_s3': args.upload_bam_to_s3,
              'uuid': None,
              'sample.tar': None,
              'cpu_count': None}

    # Launch jobs
    Job.Runner.startToil(Job.wrapJobFn(download_shared_files, inputs), args)


if __name__ == "__main__":
    main()