# Python source code of transfer_tcga_to

#!/usr/bin/env python2.7
"""
Toil script to move TCGA data into an S3 bucket.

Dependencies
Curl:       apt-get install curl
Docker:     wget -qO- https://get.docker.com/ | sh
Toil:       pip install toil
S3AM:       pip install --pre s3am
"""
import argparse
import glob
import hashlib
import os
import shutil
import subprocess
from toil.job import Job
from toil_lib.jobs import map_job
from toil_lib.programs import docker_call


def build_parser():
    """
    Builds the command-line parser holding the pipeline-specific options.

    The parser description is taken from the docstring of main(). All four options
    (--genetorrent, --genetorrent_key, --s3_dir, --ssec) are required.

    Returns: argparse.ArgumentParser    Parser with the pipeline options registered
    """
    parser = argparse.ArgumentParser(description=main.__doc__, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-g', '--genetorrent', default=None, required=True,
                        help='Path to a file with one analysis ID per line for data hosted on CGHub.')
    # Adjacent string literals are concatenated verbatim, so the first one must end with a space
    # (it previously rendered as "willbe thrown").
    parser.add_argument('-k', '--genetorrent_key', default=None, required=True,
                        help='Path to a CGHub key that has access to the TCGA data being requested. An exception will '
                             'be thrown if "-g" is set but not this argument.')
    parser.add_argument('--s3_dir', default=None, required=True, help='S3 Bucket. e.g. tcga-data')
    parser.add_argument('--ssec', default=None, required=True, help='Path to Key File for SSE-C Encryption')
    return parser


# Convenience Functions
def generate_unique_key(master_key_path, url):
    """
    Derives a per-URL encryption key from a master key.

    The derived key is the SHA-256 digest of the master key bytes followed by the URL bytes, so the
    same master key and URL always yield the same key.

    master_key_path: str    Path to the 32-byte Master Key (for S3 Encryption)
    url: str                S3 URL (e.g. https://s3-us-west-2.amazonaws.com/bucket/file.txt)

    Returns: str            32-byte unique key generated for that URL (raw digest bytes)
    Raises: ValueError      If the master key file does not contain exactly 32 bytes
    """
    # Read raw bytes: the key may be arbitrary binary data and hashlib operates on bytes.
    with open(master_key_path, 'rb') as f:
        master_key = f.read()
    if len(master_key) != 32:
        # The key itself is deliberately left out of the message so the secret never reaches
        # logs or tracebacks. An explicit raise (unlike assert) also survives "python -O".
        raise ValueError('Invalid Key! Must be 32 characters. Length: {}'.format(len(master_key)))
    # Text URLs are encoded so that they can be concatenated with the key bytes.
    if not isinstance(url, bytes):
        url = url.encode('utf-8')
    # A SHA-256 digest is always 32 bytes, so no length check on the result is needed.
    return hashlib.sha256(master_key + url).digest()


def parse_genetorrent(path_to_config):
    """
    Reads the genetorrent config file and collects the entries it lists.

    path_to_config: str     Path to a text file with one analysis ID per line

    Returns: list           Flat list of strings: the whitespace-stripped content of every line
                            that is not purely whitespace, in file order
    """
    with open(path_to_config, 'r') as config:
        return [entry.strip() for entry in config if not entry.isspace()]


# Job Functions
def download_and_transfer_sample(job, sample, inputs):
    """
    Downloads a sample from CGHub via GeneTorrent, then uses S3AM to transfer it to S3

    job: Job                Toil job; its fileStore supplies the local temp directory
    sample: str or list     An analysis ID for a sample in CGHub, or a list/tuple whose first
                            element is the analysis ID
    inputs: dict            Dictionary of input arguments; the keys 'genetorrent_key', 'ssec'
                            and 's3_dir' are read

    Raises: IndexError      If the download folder contains no file matching '*tar*'
    """
    # Samples parsed from the config file are plain strings, and indexing a string would keep
    # only its first character. Sequences such as [id, id] remain supported.
    analysis_id = sample[0] if isinstance(sample, (list, tuple)) else sample
    work_dir = job.fileStore.getLocalTempDir()
    folder_path = os.path.join(work_dir, os.path.basename(analysis_id))
    # Acquire genetorrent key and download sample
    shutil.copy(inputs['genetorrent_key'], os.path.join(work_dir, 'cghub.key'))
    parameters = ['-vv', '-c', 'cghub.key', '-d', analysis_id]
    docker_call(job=job, tool='quay.io/ucsc_cgl/genetorrent:3.8.7--9911761265b6f08bc3ef09f53af05f56848d805b',
                work_dir=work_dir, parameters=parameters)
    # glob returns an empty list when nothing matches, so a missing tarball shows up as IndexError
    try:
        tarball = glob.glob(os.path.join(folder_path, '*tar*'))[0]
    except IndexError:
        print('No tarfile found inside of folder: {}'.format(folder_path))
        raise
    # Rename the archive after the analysis ID, keeping the compression suffix
    extension = '.tar.gz' if tarball.endswith('gz') else '.tar'
    sample_name = analysis_id + extension
    local_path = os.path.join(work_dir, sample_name)
    shutil.move(tarball, local_path)
    # Only the leading component of s3_dir (the bucket name) is used; any sub-path is ignored
    s3_dir = inputs['s3_dir']
    bucket_name = s3_dir.lstrip('/').split('/')[0]
    base_url = 'https://s3-us-west-2.amazonaws.com/'
    # The URL feeds the key derivation, so its construction is left exactly as before
    url = os.path.join(base_url, bucket_name, sample_name)
    # Generate keyfile for upload
    key_file = os.path.join(work_dir, 'temp.key')
    with open(key_file, 'wb') as f_out:
        f_out.write(generate_unique_key(inputs['ssec'], url))
    # Upload to S3 via S3AM
    s3am_command = ['s3am',
                    'upload',
                    '--sse-key-file', key_file,
                    'file://{}'.format(local_path),
                    's3://' + bucket_name + '/']
    subprocess.check_call(s3am_command)


def main():
    """
    This is a Toil pipeline to transfer TCGA data into an S3 Bucket

    Data is pulled down with Genetorrent and transferred to S3 via S3AM.
    """
    # NOTE: the docstring above doubles as the --help description (build_parser reads
    # main.__doc__), so implementation notes live in comments instead.
    # Define Parser object and add to toil
    parser = build_parser()
    Job.Runner.addToilOptions(parser)
    args = parser.parse_args()
    # Store inputs from argparse
    inputs = {'genetorrent': args.genetorrent,
              'genetorrent_key': args.genetorrent_key,
              'ssec': args.ssec,
              's3_dir': args.s3_dir}
    # Sanity checks: a missing file is reported through argparse (usage message, exit status 2)
    # rather than a bare assert, which carries no message and is skipped under "python -O"
    for option, path in (('--ssec', args.ssec),
                         ('--genetorrent', args.genetorrent),
                         ('--genetorrent_key', args.genetorrent_key)):
        if not path or not os.path.isfile(path):
            parser.error('{} is not an existing file: {}'.format(option, path))
    samples = parse_genetorrent(args.genetorrent)
    # Start pipeline
    # map_job accepts a function, an iterable, and *args. The function is launched as a child
    # process with one element from the iterable and *args, which in turn spawns a tree of child jobs.
    Job.Runner.startToil(Job.wrapJobFn(map_job, download_and_transfer_sample, samples, inputs), args)


# Run the pipeline only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()