python source code of transfer_gtex_to

#!/usr/bin/env python2.7
"""
Toil script to move TCGA data into an S3 bucket.

Dependencies
Curl:       apt-get install curl
Docker:     wget -qO- https://get.docker.com/ | sh
Toil:       pip install toil
S3AM:       pip install --pre s3am
"""
import argparse
import glob
import hashlib
import os
import shutil
import subprocess
import tarfile
from toil.job import Job


def build_parser():
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-s', '--sra', default=None, required=True,
                        help='Path to a file with one analysis ID per line for data hosted on CGHub.')
    parser.add_argument('-k', '--dbgap_key', default=None, required=True,
                        help='Path to a CGHub key that has access to the TCGA data being requested. An exception will'
                             'be thrown if "-g" is set but not this argument.')
    parser.add_argument('--s3_dir', default=None, required=True, help='S3 Bucket. e.g. tcga-data')
    parser.add_argument('--ssec', default=None, required=True, help='Path to Key File for SSE-C Encryption')
    parser.add_argument('--single_end', default=None, action='store_true', help='Set this flag if data is single-end')
    parser.add_argument('--sudo', dest='sudo', default=None, action='store_true',
                        help='Docker usually needs sudo to execute locally, but not when running Mesos or when '
                             'the user is a member of a Docker group.')
    return parser


# Convenience Functions
def generate_unique_key(master_key_path, url):
    """
    master_key_path: str    Path to the BD2K Master Key (for S3 Encryption)
    url: str                S3 URL (e.g. https://s3-us-west-2.amazonaws.com/bucket/file.txt)

    Returns: str            32-byte unique key generated for that URL
    """
    with open(master_key_path, 'r') as f:
        master_key = f.read()
    assert len(master_key) == 32, 'Invalid Key! Must be 32 characters. ' \
                                  'Key: {}, Length: {}'.format(master_key, len(master_key))
    new_key = hashlib.sha256(master_key + url).digest()
    assert len(new_key) == 32, 'New key is invalid and is not 32 characters: {}'.format(new_key)
    return new_key


def docker_call(work_dir, tool_parameters, tool, java_opts=None, sudo=False, outfile=None):
    """
    Makes subprocess call of a command to a docker container.


    tool_parameters: list   An array of the parameters to be passed to the tool
    tool: str               Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools)
    java_opts: str          Optional commands to pass to a java jar execution. (e.g. '-Xmx15G')
    outfile: file           Filehandle that stderr will be passed to
    sudo: bool              If the user wants the docker command executed as sudo
    """
    base_docker_call = 'docker run --log-driver=none --rm -v {}:/data'.format(work_dir).split()
    if sudo:
        base_docker_call = ['sudo'] + base_docker_call
    if java_opts:
        base_docker_call = base_docker_call + ['-e', 'JAVA_OPTS={}'.format(java_opts)]
    try:
        if outfile:
            subprocess.check_call(base_docker_call + [tool] + tool_parameters, stdout=outfile)
        else:
            subprocess.check_call(base_docker_call + [tool] + tool_parameters)
    except subprocess.CalledProcessError:
        raise RuntimeError('docker command returned a non-zero exit status: {}'
                           ''.format(base_docker_call + [tool] + tool_parameters))
    except OSError:
        raise RuntimeError('docker not found on system. Install on all nodes.')


def parse_sra(path_to_config):
    """
    Parses genetorrent config file.  Returns list of samples: [ [id1, id1 ], [id2, id2], ... ]
    Returns duplicate of ids to follow UUID/URL standard.
    """
    samples = []
    with open(path_to_config, 'r') as f:
        for line in f.readlines():
            if not line.isspace():
                samples.append(line.strip())
    return samples


def tarball_files(work_dir, tar_name, uuid=None, files=None):
    """
    Tars a group of files together into a tarball

    work_dir: str       Current Working Directory
    tar_name: str       Name of tarball
    uuid: str           UUID to stamp files with
    files: str(s)       List of filenames to place in the tarball from working directory
    """
    with tarfile.open(os.path.join(work_dir, tar_name), 'w:gz') as f_out:
        for fname in files:
            if uuid:
                f_out.add(os.path.join(work_dir, fname), arcname=uuid + '.' + fname)
            else:
                f_out.add(os.path.join(work_dir, fname), arcname=fname)


# Job Functions
def start_batch(job, input_args):
    """
    This function will administer 5 jobs at a time then recursively call itself until subset is empty
    """
    samples = parse_sra(input_args['sra'])
    # for analysis_id in samples:
    job.addChildJobFn(download_and_transfer_sample, input_args, samples, cores=1, disk='30')


def download_and_transfer_sample(job, input_args, samples):
    """
    Downloads a sample from dbGaP via SRAToolKit, then uses S3AM to transfer it to S3

    input_args: dict        Dictionary of input arguments
    analysis_id: str        An analysis ID for a sample in CGHub
    """
    if len(samples) > 1:
        a = samples[len(samples)/2:]
        b = samples[:len(samples)/2]
        job.addChildJobFn(download_and_transfer_sample, input_args, a, disk='30G')
        job.addChildJobFn(download_and_transfer_sample, input_args, b, disk='30G')
    else:
        analysis_id = samples[0]
        work_dir = job.fileStore.getLocalTempDir()
        sudo = input_args['sudo']
        # Acquire dbgap_key
        shutil.copy(input_args['dbgap_key'], os.path.join(work_dir, 'dbgap.ngc'))
        # Call to fastq-dump to pull down SRA files and convert to fastq
        if input_args['single_end']:
            parameters = [analysis_id]
        else:
            parameters = ['--split-files', analysis_id]
        docker_call(tool='quay.io/ucsc_cgl/fastq-dump:2.5.7--4577a6c1a3c94adaa0c25dd6c03518ee610433d1',
                    work_dir=work_dir, tool_parameters=parameters, sudo=sudo)
        # Collect files and encapsulate into a tarball
        shutil.rmtree(os.path.join(work_dir, 'sra'))
        sample_name = analysis_id + '.tar.gz'
        if input_args['single_end']:
            r = [os.path.basename(x) for x in glob.glob(os.path.join(work_dir, '*.f*'))]
            tarball_files(work_dir, tar_name=sample_name, files=r)
        else:
            r1 = [os.path.basename(x) for x in glob.glob(os.path.join(work_dir, '*_1*'))]
            r2 = [os.path.basename(x) for x in glob.glob(os.path.join(work_dir, '*_2*'))]
            tarball_files(work_dir, tar_name=sample_name, files=r1 + r2)
        # Parse s3_dir to get bucket and s3 path
        key_path = input_args['ssec']
        s3_dir = input_args['s3_dir']
        bucket_name = s3_dir.lstrip('/').split('/')[0]
        base_url = 'https://s3-us-west-2.amazonaws.com/'
        url = os.path.join(base_url, bucket_name, sample_name)
        # Generate keyfile for upload
        with open(os.path.join(work_dir, 'temp.key'), 'wb') as f_out:
            f_out.write(generate_unique_key(key_path, url))
        # Upload to S3 via S3AM
        s3am_command = ['s3am',
                        'upload',
                        '--sse-key-file', os.path.join(work_dir, 'temp.key'),
                        'file://{}'.format(os.path.join(work_dir, sample_name)),
                        's3://' + bucket_name + '/']
        subprocess.check_call(s3am_command)


def main():
    """
    Transfer gTEX data from dbGaP (NCBI) to S3
    """
    # Define Parser object and add to toil
    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()
    # Store inputs from argparse
    inputs = {'sra': args.sra,
              'dbgap_key': args.dbgap_key,
              'ssec': args.ssec,
              's3_dir': args.s3_dir,
              'single_end': args.single_end,
              'sudo': args.sudo}
    # Sanity checks
    if args.ssec:
        assert os.path.isfile(args.ssec)
    if args.sra:
        assert os.path.isfile(args.sra)
    if args.dbgap_key:
        assert os.path.isfile(args.dbgap_key)
    # Start Pipeline
    Job.Runner.startToil(Job.wrapJobFn(start_batch, inputs), args)

if __name__ == '__main__':
    main()