python source code of download

'''
Download Datasets for DeepLab

Lei Mao
Department of Computer Science
University of Chicago
dukeleimao@gmail.com

Shengjie Lin
Toyota Technological Institute at Chicago
slin@ttic.edu
'''

import argparse
import os
import tarfile
import zipfile

import requests

from tqdm import tqdm
from utils import static_vars


def _download(sess, url, destination_dir, filename, expected_bytes, force):
    r = sess.get(url, stream=True)
    if not filename:
        if 'Content-Disposition' in r.headers:
            filename = r.headers['Content-Disposition'].split('filename=')[1]
            if filename[0] in {"'", '"'}:
                filename = filename[1:-1]
        else:
            filename = url.split('/')[-1]
    filepath = os.path.join(destination_dir, filename)
    if not os.path.exists(filepath) or expected_bytes and os.stat(filepath).st_size != expected_bytes or force:
        if not os.path.exists(destination_dir):
            os.makedirs(destination_dir)
        print('Downloading: ' + filename)
        chunk_size = 8192
        total_size = int(r.headers.get('Content-Length', 0))
        pbar = tqdm(total=total_size, unit='B', unit_scale=True, unit_divisor=1024, desc=filename)
        with open(os.path.join(destination_dir, filename), 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                pbar.update(len(chunk))
        pbar.close()
        print('Download complete!')
        actual_bytes = os.stat(filepath).st_size
        if expected_bytes:
            if actual_bytes == expected_bytes:
                print('Verified: ' + filename)
            else:
                print('Failed to verify: ' + filename + '. File size does not match!')
        else:
            print('Fize size: ' + str(actual_bytes))
    else:
        if expected_bytes:
            print('Found and verified: ' + filename)
        else:
            print('Found: ' + filename)
    return filepath


def download(urls, destination_dir, filenames=None, expected_bytes=None, login_dict=None, force=False):
    with requests.Session() as sess:
        if login_dict:
            sess.post(login_dict['url'], data=login_dict['payload'])
        if isinstance(urls, str):
            return _download(sess, urls, destination_dir, filenames, expected_bytes, force)
        n_urls = len(urls)
        if filenames:
            assert not isinstance(filenames, str) and len(filenames) == n_urls, 'number of filenames does not match that of urls'
        else:
            filenames = [None] * n_urls
        if expected_bytes:
            assert len(expected_bytes) == n_urls, 'number of expected_bytes does not match that of urls'
        else:
            expected_bytes = [None] * n_urls
        return [_download(sess, url, destination_dir, filename, expected_byte, force) for url, filename, expected_byte in zip(urls, filenames, expected_bytes)]


@static_vars(utils_dict={'zip': (zipfile.ZipFile, 'namelist'), 'tar': (tarfile.open, 'getnames')})
def extract(archive_filepath, archive_type, destination_dir, force=False):

    print('Extracting {} file: {}'.format(archive_type, os.path.split(archive_filepath)[-1]))
    utils = extract.utils_dict[archive_type]
    with utils[0](archive_filepath) as archive:
        for name in getattr(archive, utils[1])():
            if not os.path.exists(os.path.join(destination_dir, name)) or force:
                archive.extract(name, path=destination_dir)
    print('Extraction complete!')


def download_voc2012(downloads_dir='data/downloads/', data_dir='data/datasets/', force=False):
    url = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar'
    filepath = download(url, downloads_dir, force=force)
    extract(filepath, 'tar', data_dir, force=force)


def download_sbd(downloads_dir='data/downloads/', data_dir='data/datasets/SBD/', force=False):
    '''
    http://home.bharathh.info/pubs/codes/SBD/download.html
    '''

    url = 'http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz'
    train_noval_url = 'http://home.bharathh.info/pubs/codes/SBD/train_noval.txt'

    filepath = download(url, downloads_dir, filenames='SBD.tgz', force=force)
    download(train_noval_url, data_dir, force=force)

    extract(filepath, 'tar', data_dir, force=force)


def download_cityscapes(downloads_dir='data/downloads/cityscapes/', data_dir='data/datasets/cityscapes/', force=False):
    '''
    Does basically the same thing as following bash commands:
    curl -c cs_cookies -d "username=StArchon&password=eUpMJjMW4mbEUjZ&submit=Login" https://www.cityscapes-dataset.com/login/
    curl -b cs_cookies -JLO https://www.cityscapes-dataset.com/file-handling/?packageID=1
    '''

    urls = ['https://www.cityscapes-dataset.com/file-handling/?packageID={}'.format(id) for id in [1, 2, 3, 4, 10, 11]]
    login_dict = {'url': 'https://www.cityscapes-dataset.com/login/', 'payload': {'username': 'StArchon', 'password': 'eUpMJjMW4mbEUjZ', 'submit': 'Login'}}

    filepaths = download(urls, downloads_dir, login_dict=login_dict, force=force)
    for filepath in filepaths:
        extract(filepath, 'zip', data_dir, force=force)


def download_pretrained_models(models, downloads_dir='data/downloads/', model_dir='data/models/pretrained/', force=False):
    '''
    Download ImageNet pre-trained models
    https://github.com/tensorflow/models/tree/master/research/slim
    ResNet-50: [224, 224, 3]
    ResNet-101: [513, 513, 3]
    '''

    urls = {
        'resnet_50': 'http://download.tensorflow.org/models/resnet_v2_50_2017_04_14.tar.gz',
        'resnet_101': 'http://download.tensorflow.org/models/resnet_v2_101_2017_04_14.tar.gz',
        'mobilenet_1.0_224': 'https://storage.googleapis.com/mobilenet_v2/checkpoints/mobilenet_v2_1.0_224.tgz'
    }

    for model in models:
        print('Downloading pretrained {}'.format(model))
        filepath = download(urls[model], downloads_dir, force=force)
        extract(filepath, 'tar', os.path.join(model_dir, model), force=force)


if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Download DeepLab semantic segmentation datasets and pretrained backbone models.')

    downloads_dir_default = 'data/downloads/'
    data_dir_default = 'data/datasets/'
    pretrained_models_dir_default = 'data/models/pretrained/'
    pretrained_models_default = ['resnet_50', 'resnet_101', 'mobilenet_1.0_224']

    parser.add_argument('--downloads_dir', type=str, help='Downloads directory', default=downloads_dir_default)
    parser.add_argument('--data_dir', type=str, help='Data directory', default=data_dir_default)
    parser.add_argument('--pretrained_models_dir', type=str, help='Pretrained models directory', default=pretrained_models_dir_default)
    parser.add_argument('--pretrained_models', type=str, nargs='+', help='Pretrained models to download: resnet_50, resnet_101, mobilenet_1.0_224', default=pretrained_models_default)
    parser.add_argument('--force', help='force downloading and extracting files that are already present', default=False, action='store_true')

    argv = parser.parse_args()

    downloads_dir = argv.downloads_dir
    data_dir = argv.data_dir
    pretrained_models_dir = argv.pretrained_models_dir
    pretrained_models = argv.pretrained_models
    force = argv.force

    print('Downloading datasets...')
    download_voc2012(downloads_dir=downloads_dir, data_dir=data_dir, force=force)
    download_sbd(downloads_dir=downloads_dir, data_dir=os.path.join(data_dir, 'SBD'), force=force)
    #download_cityscapes(downloads_dir=os.path.join(downloads_dir, 'cityscapes/'), data_dir=os.path.join(data_dir, 'cityscapes'), force=force)
    print('Downloading pre-trained models...')
    download_pretrained_models(models=pretrained_models, downloads_dir=downloads_dir, model_dir=pretrained_models_dir, force=force)