from __future__ import absolute_import from __future__ import division from __future__ import print_function from six.moves.urllib.request import urlretrieve from six.moves import xrange as range import os import sys import numpy as np url = 'https://catalog.ldc.upenn.edu/desc/addenda/' last_percent_reported = None def download_progress_hook(count, blockSize, totalSize): """A hook to report the progress of a download. This is mostly intended for users with slow internet connections. Reports every 1% change in download progress. """ global last_percent_reported percent = int(count * blockSize * 100 / totalSize) if last_percent_reported != percent: if percent % 5 == 0: sys.stdout.write("%s%%" % percent) sys.stdout.flush() else: sys.stdout.write(".") sys.stdout.flush() last_percent_reported = percent def maybe_download(filename, expected_bytes, force=False): """Download a file if not present, and make sure it's the right size.""" if force or not os.path.exists(filename): print('Attempting to download:', filename) filename, _ = urlretrieve(url + filename, filename, reporthook=download_progress_hook) print('\nDownload Complete!') statinfo = os.stat(filename) if statinfo.st_size == expected_bytes: print('Found and verified', filename) else: raise Exception( 'Failed to verify ' + filename + \ '. Can you get to it with a browser?') return filename def sparse_tuple_from(sequences, dtype=np.int32): """Create a sparse representention of x. Args: sequences: a list of lists of type dtype where each element is a sequence Returns: A tuple with (indices, values, shape) """ indices = [] values = [] for n, seq in enumerate(sequences): indices.extend(zip([n]*len(seq), range(len(seq)))) values.extend(seq) indices = np.asarray(indices, dtype=np.int64) values = np.asarray(values, dtype=dtype) shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1]+1], dtype=np.int64) return indices, values, shape def pad_sequences(sequences, maxlen=None, dtype=np.float32, padding='post', truncating='post', value=0.): '''Pads each sequence to the same length: the length of the longest sequence. If maxlen is provided, any sequence longer than maxlen is truncated to maxlen. Truncation happens off either the beginning or the end (default) of the sequence. Supports post-padding (default) and pre-padding. Args: sequences: list of lists where each element is a sequence maxlen: int, maximum length dtype: type to cast the resulting sequence. padding: 'pre' or 'post', pad either before or after each sequence. truncating: 'pre' or 'post', remove values from sequences larger than maxlen either in the beginning or in the end of the sequence value: float, value to pad the sequences to the desired value. Returns x: numpy array with dimensions (number_of_sequences, maxlen) lengths: numpy array with the original sequence lengths ''' lengths = np.asarray([len(s) for s in sequences], dtype=np.int64) nb_samples = len(sequences) if maxlen is None: maxlen = np.max(lengths) # take the sample shape from the first non empty sequence # checking for consistency in the main loop below. sample_shape = tuple() for s in sequences: if len(s) > 0: sample_shape = np.asarray(s).shape[1:] break x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype) for idx, s in enumerate(sequences): if len(s) == 0: continue # empty list was found if truncating == 'pre': trunc = s[-maxlen:] elif truncating == 'post': trunc = s[:maxlen] else: raise ValueError('Truncating type "%s" not understood' % truncating) # check `trunc` has expected shape trunc = np.asarray(trunc, dtype=dtype) if trunc.shape[1:] != sample_shape: raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' % (trunc.shape[1:], idx, sample_shape)) if padding == 'post': x[idx, :len(trunc)] = trunc elif padding == 'pre': x[idx, -len(trunc):] = trunc else: raise ValueError('Padding type "%s" not understood' % padding) return x, lengths