""" Downloading datasets: utility functions This is a copy of nilearn.datasets. """ import os import numpy as np import base64 import collections import contextlib import fnmatch import hashlib import shutil import tempfile import time import sys import tarfile import warnings import zipfile import glob import pandas as pd from tqdm import tqdm from sklearn.datasets.base import Bunch from .._utils.compat import _basestring, cPickle, _urllib, md5_hash TEMP = tempfile.gettempdir() def _get_cluster_assignments(dataset_name, url, sep=" ", skip_header=False): data_dir = _get_dataset_dir("categorization", verbose=0) _fetch_file(url=url, data_dir=data_dir, uncompress=True, move="{0}/{0}.txt".format(dataset_name), verbose=0) files = glob.glob(os.path.join(data_dir, dataset_name + "/*.txt")) X = [] y = [] names = [] for cluster_id, file_name in enumerate(files): with open(file_name) as f: lines = f.read().splitlines()[(int(skip_header)):] X += [l.split(sep) for l in lines] y += [os.path.basename(file_name).split(".")[0]] * len(lines) return Bunch(X=np.array(X, dtype="object"), y=np.array(y).astype("object")) def _get_as_pd(url, dataset_name, **read_csv_kwargs): return pd.read_csv(_fetch_file(url, dataset_name, verbose=0), **read_csv_kwargs) def _change_list_to_np(dict): return {k: np.array(dict[k], dtype="object") for k in dict} def _format_time(t): if t > 60: return "%4.1fmin" % (t / 60.) else: return " %5.1fs" % (t) def _md5_sum_file(path): """ Calculates the MD5 sum of a file. """ with open(path, 'rb') as f: m = hashlib.md5() while True: data = f.read(8192) if not data: break m.update(data) return m.hexdigest() def _read_md5_sum_file(path): """ Reads a MD5 checksum file and returns hashes as a dictionary. """ with open(path, "r") as f: hashes = {} while True: line = f.readline() if not line: break h, name = line.rstrip().split(' ', 1) hashes[name] = h return hashes def readlinkabs(link): """ Return an absolute path for the destination of a symlink """ path = os.readlink(link) if os.path.isabs(path): return path return os.path.join(os.path.dirname(link), path) def _chunk_report_(bytes_so_far, total_size, initial_size, t0): """Show downloading percentage. Parameters ---------- bytes_so_far: int Number of downloaded bytes total_size: int Total size of the file (may be 0/None, depending on download method). t0: int The time in seconds (as returned by time.time()) at which the download was resumed / started. initial_size: int If resuming, indicate the initial size of the file. If not resuming, set to zero. """ if not total_size: sys.stderr.write("Downloaded %d of ? bytes\r" % (bytes_so_far)) else: # Estimate remaining download time total_percent = float(bytes_so_far) / total_size current_download_size = bytes_so_far - initial_size bytes_remaining = total_size - bytes_so_far dt = time.time() - t0 download_rate = current_download_size / max(1e-8, float(dt)) # Minimum rate of 0.01 bytes/s, to avoid dividing by zero. time_remaining = bytes_remaining / max(0.01, download_rate) # Trailing whitespace is to erase extra char when message length # varies sys.stderr.write( "Downloaded %d of %d bytes (%0.2f%%, %s remaining) \r" % (bytes_so_far, total_size, total_percent * 100, _format_time(time_remaining))) def _chunk_read_(response, local_file, chunk_size=8192, report_hook=None, initial_size=0, total_size=None, verbose=1): """Download a file chunk by chunk and show advancement Parameters ---------- response: _urllib.response.addinfourl Response to the download request in order to get file size local_file: file Hard disk file where data should be written chunk_size: int, optional Size of downloaded chunks. Default: 8192 report_hook: bool Whether or not to show downloading advancement. Default: None initial_size: int, optional If resuming, indicate the initial size of the file total_size: int, optional Expected final size of download (None means it is unknown). verbose: int, optional verbosity level (0 means no message). Returns ------- data: string The downloaded file. """ try: if total_size is None: total_size = response.info().get('Content-Length').strip() total_size = int(total_size) + initial_size except Exception as e: if verbose > 1: print("Warning: total size could not be determined.") if verbose > 2: print("Full stack trace: %s" % e) total_size = None bytes_so_far = initial_size # t0 = time.time() if report_hook: pbar = tqdm(total=total_size, unit="b", unit_scale=True) while True: chunk = response.read(chunk_size) bytes_so_far += len(chunk) if not chunk: if report_hook: # sys.stderr.write('\n') pbar.close() break local_file.write(chunk) if report_hook: pbar.update(len(chunk)) # This is better because works in ipython # _chunk_report_(bytes_so_far, total_size, initial_size, t0) if report_hook: pbar.close() return def _get_dataset_dir(sub_dir=None, data_dir=None, default_paths=None, verbose=1): """ Create if necessary and returns data directory of given dataset. Parameters ---------- sub_dir: string Name of sub-dir data_dir: string, optional Path of the data directory. Used to force data storage in a specified location. Default: None default_paths: list of string, optional Default system paths in which the dataset may already have been installed by a third party software. They will be checked first. verbose: int, optional verbosity level (0 means no message). Returns ------- data_dir: string Path of the given dataset directory. Notes ----- This function retrieves the datasets directory (or data directory) using the following priority : 1. defaults system paths 2. the keyword argument data_dir 3. the global environment variable WEB_SHARED_DATA 4. the user environment variable WEB_DATA 5. web_data in the user home folder """ # We build an array of successive paths by priority # The boolean indicates if it is a pre_dir: in that case, we won't add the # dataset name to the path. paths = [] # Search given environment variables if default_paths is not None: for default_path in default_paths: paths.extend([(d, True) for d in default_path.split(':')]) # Check data_dir which force storage in a specific location if data_dir is not None: paths.extend([(d, False) for d in data_dir.split(':')]) else: global_data = os.getenv('WEB_SHARED_DATA') if global_data is not None: paths.extend([(d, False) for d in global_data.split(':')]) local_data = os.getenv('WEB_DATA') if local_data is not None: paths.extend([(d, False) for d in local_data.split(':')]) paths.append((os.path.expanduser('~/web_data'), False)) if verbose > 2: print('Dataset search paths: %s' % paths) # Check if the dataset exists somewhere for path, is_pre_dir in paths: if not is_pre_dir and sub_dir: path = os.path.join(path, sub_dir) if os.path.islink(path): # Resolve path path = readlinkabs(path) if os.path.exists(path) and os.path.isdir(path): if verbose > 1: print('\nDataset found in %s\n' % path) return path # If not, create a folder in the first writeable directory errors = [] for (path, is_pre_dir) in paths: if not is_pre_dir and sub_dir: path = os.path.join(path, sub_dir) if not os.path.exists(path): try: os.makedirs(path) if verbose > 0: print('\nDataset created in %s\n' % path) return path except Exception as exc: short_error_message = getattr(exc, 'strerror', str(exc)) errors.append('\n -{0} ({1})'.format( path, short_error_message)) raise OSError('Web tried to store the dataset in the following ' 'directories, but:' + ''.join(errors)) def _uncompress_file(file_, delete_archive=True, verbose=1): """Uncompress files contained in a data_set. Parameters ---------- file: string path of file to be uncompressed. delete_archive: bool, optional Wheteher or not to delete archive once it is uncompressed. Default: True verbose: int, optional verbosity level (0 means no message). Notes ----- This handles zip, tar, gzip and bzip files only. """ if verbose > 0: print('Extracting data from %s...' % file_) data_dir = os.path.dirname(file_) # We first try to see if it is a zip file try: filename, ext = os.path.splitext(file_) with open(file_, "rb") as fd: header = fd.read(4) processed = False if zipfile.is_zipfile(file_): z = zipfile.ZipFile(file_) z.extractall(data_dir) z.close() processed = True elif ext == '.gz' or header.startswith(b'\x1f\x8b'): import gzip gz = gzip.open(file_) if ext == '.tgz': filename = filename + '.tar' out = open(filename, 'wb') shutil.copyfileobj(gz, out, 8192) gz.close() out.close() # If file is .tar.gz, this will be handle in the next case if delete_archive: os.remove(file_) file_ = filename filename, ext = os.path.splitext(file_) processed = True if tarfile.is_tarfile(file_): with contextlib.closing(tarfile.open(file_, "r")) as tar: tar.extractall(path=data_dir) processed = True if not processed: raise IOError( "[Uncompress] unknown archive file format: %s" % file_) if delete_archive: os.remove(file_) if verbose > 0: print(' ...done.') except Exception as e: if verbose > 0: print('Error uncompressing file: %s' % e) raise def _filter_column(array, col, criteria): """ Return index array matching criteria Parameters ---------- array: numpy array with columns Array in which data will be filtered col: string Name of the column criteria: integer (or float), pair of integers, string or list of these if integer, select elements in column matching integer if a tuple, select elements between the limits given by the tuple if a string, select elements that match the string """ # Raise an error if the column does not exist. This is the only way to # test it across all possible types (pandas, recarray...) try: array[col] except: raise KeyError('Filtering criterion %s does not exist' % col) if (not isinstance(criteria, _basestring) and not isinstance(criteria, bytes) and not isinstance(criteria, tuple) and isinstance(criteria, collections.Iterable)): filter = np.zeros(array.shape[0], dtype=np.bool) for criterion in criteria: filter = np.logical_or(filter, _filter_column(array, col, criterion)) return filter if isinstance(criteria, tuple): if len(criteria) != 2: raise ValueError("An interval must have 2 values") if criteria[0] is None: return array[col] <= criteria[1] if criteria[1] is None: return array[col] >= criteria[0] filter = array[col] <= criteria[1] return np.logical_and(filter, array[col] >= criteria[0]) return array[col] == criteria def _filter_columns(array, filters, combination='and'): """ Return indices of recarray entries that match criteria. Parameters ---------- array: numpy array with columns Array in which data will be filtered filters: list of criteria See _filter_column combination: string, optional String describing the combination operator. Possible values are "and" and "or". """ if combination == 'and': fcomb = np.logical_and mask = np.ones(array.shape[0], dtype=np.bool) elif combination == 'or': fcomb = np.logical_or mask = np.zeros(array.shape[0], dtype=np.bool) else: raise ValueError('Combination mode not known: %s' % combination) for column in filters: mask = fcomb(mask, _filter_column(array, column, filters[column])) return mask def _get_dataset_descr(ds_name): module_path = os.path.dirname(os.path.abspath(__file__)) fname = ds_name try: with open(os.path.join(module_path, 'description', fname + '.rst'))\ as rst_file: descr = rst_file.read() except IOError: descr = '' if descr == '': print("Warning: Could not find dataset description.") return descr def movetree(src, dst): """Move an entire tree to another directory. Any existing file is overwritten""" names = os.listdir(src) # Create destination dir if it does not exist if not os.path.exists(dst): os.makedirs(dst) errors = [] for name in names: srcname = os.path.join(src, name) dstname = os.path.join(dst, name) try: if os.path.isdir(srcname) and os.path.isdir(dstname): movetree(srcname, dstname) os.rmdir(srcname) else: shutil.move(srcname, dstname) except (IOError, os.error) as why: errors.append((srcname, dstname, str(why))) # catch the Error from the recursive movetree so that we can # continue with other files except Exception as err: errors.extend(err.args[0]) if errors: raise Exception(errors) # TODO: refactor, this function is a mess, it was adapted from other project # and it might have not been an optimal choice def _fetch_file(url, data_dir=TEMP, uncompress=False, move=False,md5sum=None, username=None, password=None, mock=False, handlers=[], resume=True, verbose=0): """Load requested dataset, downloading it if needed or requested. This function retrieves files from the hard drive or download them from the given urls. Note to developpers: All the files will be first downloaded in a sandbox and, if everything goes well, they will be moved into the folder of the dataset. This prevents corrupting previously downloaded data. In case of a big dataset, do not hesitate to make several calls if needed. Parameters ---------- dataset_name: string Unique dataset name resume: bool, optional If true, try to resume partially downloaded files uncompress: bool, optional If true, will uncompress zip move: str, optional If True, will move downloaded file to given relative path. NOTE: common usage is zip_file_id/zip_file.zip together with uncompress set to True md5sum: string, optional MD5 sum of the file. Checked if download of the file is required username: string, optional Username used for basic HTTP authentication password: string, optional Password used for basic HTTP authentication handlers: list of BaseHandler, optional urllib handlers passed to urllib.request.build_opener. Used by advanced users to customize request handling. data_dir: string, optional Path of the data directory. Used to force data storage in a specified location. Default: None resume: bool, optional If true, try resuming download if possible verbose: int, optional verbosity level (0 means no message). Returns ------- files: list of string Absolute paths of downloaded files on disk """ # TODO: move to global scope and rename def _fetch_helper(url, data_dir=TEMP, resume=True, overwrite=False, md5sum=None, username=None, password=None, handlers=[], verbose=1): if not os.path.isabs(data_dir): data_dir = _get_dataset_dir(data_dir) # Determine data path if not os.path.exists(data_dir): os.makedirs(data_dir) # Determine filename using URL parse = _urllib.parse.urlparse(url) file_name = os.path.basename(parse.path) if file_name == '': file_name = md5_hash(parse.path) temp_file_name = file_name + ".part" full_name = os.path.join(data_dir, file_name) temp_full_name = os.path.join(data_dir, temp_file_name) if os.path.exists(full_name): if overwrite: os.remove(full_name) else: return full_name if os.path.exists(temp_full_name): if overwrite: os.remove(temp_full_name) t0 = time.time() local_file = None initial_size = 0 try: # Download data url_opener = _urllib.request.build_opener(*handlers) request = _urllib.request.Request(url) request.add_header('Connection', 'Keep-Alive') if username is not None and password is not None: if not url.startswith('https'): raise ValueError( 'Authentication was requested on a non secured URL (%s).' 'Request has been blocked for security reasons.' % url) # Note: HTTPBasicAuthHandler is not fitted here because it relies # on the fact that the server will return a 401 error with proper # www-authentication header, which is not the case of most # servers. encoded_auth = base64.b64encode( (username + ':' + password).encode()) request.add_header(b'Authorization', b'Basic ' + encoded_auth) if verbose > 0: displayed_url = url.split('?')[0] if verbose == 1 else url print('Downloading data from %s ...' % displayed_url) if resume and os.path.exists(temp_full_name): # Download has been interrupted, we try to resume it. local_file_size = os.path.getsize(temp_full_name) # If the file exists, then only download the remainder request.add_header("Range", "bytes=%s-" % (local_file_size)) try: data = url_opener.open(request) content_range = data.info().get('Content-Range') if (content_range is None or not content_range.startswith( 'bytes %s-' % local_file_size)): raise IOError('Server does not support resuming') except Exception: # A wide number of errors can be raised here. HTTPError, # URLError... I prefer to catch them all and rerun without # resuming. if verbose > 0: print('Resuming failed, try to download the whole file.') return _fetch_helper( url, data_dir, resume=False, overwrite=overwrite, md5sum=md5sum, username=username, password=password, handlers=handlers, verbose=verbose) local_file = open(temp_full_name, "ab") initial_size = local_file_size else: data = url_opener.open(request) local_file = open(temp_full_name, "wb") _chunk_read_(data, local_file, report_hook=(verbose > 0), initial_size=initial_size, verbose=verbose) # temp file must be closed prior to the move if not local_file.closed: local_file.close() shutil.move(temp_full_name, full_name) dt = time.time() - t0 if verbose > 0: print('...done. (%i seconds, %i min)' % (dt, dt // 60)) except _urllib.error.HTTPError as e: if verbose > 0: print('Error while fetching file %s. Dataset fetching aborted.' % (file_name)) if verbose > 1: print("HTTP Error: %s, %s" % (e, url)) raise except _urllib.error.URLError as e: if verbose > 0: print('Error while fetching file %s. Dataset fetching aborted.' % (file_name)) if verbose > 1: print("URL Error: %s, %s" % (e, url)) raise finally: if local_file is not None: if not local_file.closed: local_file.close() if md5sum is not None: if (_md5_sum_file(full_name) != md5sum): raise ValueError("File %s checksum verification has failed." " Dataset fetching aborted." % local_file) return full_name if not os.path.isabs(data_dir): data_dir = _get_dataset_dir(data_dir) # There are two working directories here: # - data_dir is the destination directory of the dataset # - temp_dir is a temporary directory dedicated to this fetching call. All # files that must be downloaded will be in this directory. If a corrupted # file is found, or a file is missing, this working directory will be # deleted. parse = _urllib.parse.urlparse(url) file_name = os.path.basename(parse.path) files_pickle = cPickle.dumps([(file_, url) for file_, url in zip([file_name], [url])]) files_md5 = hashlib.md5(files_pickle).hexdigest() temp_dir = os.path.join(data_dir, files_md5) # Create destination dir if not os.path.exists(data_dir): os.makedirs(data_dir) # Abortion flag, in case of error abort = None # 2 possibilities: # - the file exists in data_dir, nothing to do (we have to account for move parameter here) # - the file does not exists: we download it in temp_dir # Target file in the data_dir target_file = os.path.join(data_dir, file_name) # Change move so we always uncompress to some folder (this is important for # detecting already downloaded files) # Ex. glove.4B.zip -> glove.4B/glove.4B.zip if uncompress and not move: dirname, _ = os.path.splitext(file_name) move = os.path.join(dirname, os.path.basename(file_name)) if (abort is None and not os.path.exists(target_file) and (not move or (move and uncompress and not os.path.exists(os.path.dirname(os.path.join(data_dir, move))))) or (move and not uncompress and not os.path.exists(os.path.join(data_dir, move)))): # Target file in temp dir temp_target_file = os.path.join(temp_dir, file_name) # We may be in a global read-only repository. If so, we cannot # download files. if not os.access(data_dir, os.W_OK): raise ValueError('Dataset files are missing but dataset' ' repository is read-only. Contact your data' ' administrator to solve the problem') if not os.path.exists(temp_dir): os.mkdir(temp_dir) dl_file = _fetch_helper(url, temp_dir, resume=resume, verbose=verbose, md5sum=md5sum, username=username, password=password, handlers=handlers) if (abort is None and not os.path.exists(target_file) and not os.path.exists(temp_target_file)): if not mock: warnings.warn('An error occured while fetching %s' % file_) abort = ("Dataset has been downloaded but requested file was " "not provided:\nURL:%s\nFile:%s" % (url, target_file)) else: if not os.path.exists(os.path.dirname(temp_target_file)): os.makedirs(os.path.dirname(temp_target_file)) open(temp_target_file, 'w').close() if move: move = os.path.join(data_dir, move) move_dir = os.path.dirname(move) if not os.path.exists(move_dir): os.makedirs(move_dir) shutil.move(dl_file, move) dl_file = move target_file = dl_file if uncompress: try: if os.path.getsize(dl_file) != 0: _uncompress_file(dl_file, verbose=verbose) else: os.remove(dl_file) target_file = os.path.dirname(target_file) except Exception as e: abort = str(e) else: if verbose > 0: print("File already downloaded, skipping") if move: target_file = os.path.join(data_dir, move) if uncompress: target_file = os.path.dirname(target_file) if abort is not None: if os.path.exists(temp_dir): shutil.rmtree(temp_dir) raise IOError('Fetching aborted: ' + abort) # If needed, move files from temps directory to final directory. if os.path.exists(temp_dir): # XXX We could only moved the files requested # XXX Movetree can go wrong movetree(temp_dir, data_dir) shutil.rmtree(temp_dir) return target_file def _tree(path, pattern=None, dictionary=False): """ Return a directory tree under the form of a dictionaries and list Parameters: ----------- path: string Path browsed pattern: string, optional Pattern used to filter files (see fnmatch) dictionary: boolean, optional If True, the function will return a dict instead of a list """ files = [] dirs = [] if not dictionary else {} for file_ in os.listdir(path): file_path = os.path.join(path, file_) if os.path.isdir(file_path): if not dictionary: dirs.append((file_, _tree(file_path, pattern))) else: dirs[file_] = _tree(file_path, pattern) else: if pattern is None or fnmatch.fnmatch(file_, pattern): files.append(file_path) files = sorted(files) if not dictionary: return sorted(dirs) + files if len(dirs) == 0: return files if len(files) > 0: dirs['.'] = files return dirs