#!/usr/bin/env python3
"""Downloader of sample audio data

Configuration files are in the directory downloder_conf.

Usage:
    download_speech_corpus.py <config> [-h] [-q] [-f] [-m]

Arguments:
    <config>       The path of configuration file

Options:
    -h, --help     Show this help and exit
    -q, --quiet    Don't show any messages about progress
    -f, --force    Overwrite existing corpus files
    -m             Accepted but currently unused
"""
# NOTE: docopt only reads option descriptions from a section whose header
# contains "options:"; keep that heading, otherwise the long option names
# (``--quiet``, ``--force``) are not registered.

import os
import re
import shutil
import urllib.parse
import urllib.request
from fnmatch import fnmatchcase
from pathlib import Path
from tempfile import TemporaryDirectory

import yaml
from docopt import docopt


class UserOption:
    """
    Class (structure) that contains flags given on the command line.

    Parameters
    ----------
    verbose : bool
        Whether progress messages are printed.
    force : bool
        Whether already existing destination files are overwritten.
    """

    def __init__(self, verbose=True, force=False):
        self.verbose = verbose
        self.force = force


class FixedStrPattern:
    """
    Class of fixed-string pattern.

    All pattern classes share the duck-typed interface `match`.

    Parameters
    ----------
    expected : str
        string that expresses the pattern
    """

    def __init__(self, expected):
        self.expected = expected

    def match(self, target):
        """
        Check the given string matches the pattern (exact equality).
        """
        return target == self.expected

    def __repr__(self):
        return "{}({})".format(self.__class__.__name__, repr(self.expected))


class RegExPattern:
    """
    Class of regular expression pattern.

    Parameters
    ----------
    expected : str
        string that expresses the pattern
    """

    def __init__(self, expected):
        self.expected = re.compile(expected)

    def match(self, target):
        """
        Check the regular expression matches at the beginning of the string.
        """
        # `re.match` returns a match object or None; callers expect a bool.
        return bool(self.expected.match(target))

    def __repr__(self):
        return "{}({})".format(
            self.__class__.__name__, repr(self.expected.pattern)
        )


class GlobPattern(FixedStrPattern):
    """
    Class of glob pattern (case-sensitive).

    Like:
        SF*
        MF*

    Parameters
    ----------
    expected : str
        string that expresses the pattern
    """

    def match(self, target):
        """
        Check the given string matches the glob pattern.
        """
        return fnmatchcase(target, self.expected)


def generate_pattern_from_obj(pattern_obj):
    """
    Generates an appropriate pattern from object from YAML.

    Parameters
    ----------
    pattern_obj : Union[str, Dict[str, str]]
        Parsed YAML object.
        Only one object indicated by these expressions is allowed::

            pattern          -> fixed string
            regex: pattern
            regexp: pattern  -> regexp
            glob: pattern    -> glob

    Returns
    -------
    pattern
        Generated pattern instance.

    Raises
    ------
    ValueError
        When the object is neither a string nor a one-item dictionary,
        or when the key of the dictionary is not a known pattern type.
    """
    if isinstance(pattern_obj, str):
        return FixedStrPattern(pattern_obj)

    if not (isinstance(pattern_obj, dict) and len(pattern_obj) == 1):
        raise ValueError("Pattern object must be str or dictionary w/ 1 item.")

    pattern_type, pattern = next(iter(pattern_obj.items()))  # the only item
    if pattern_type in {"regex", "regexp"}:
        return RegExPattern(pattern)
    if pattern_type == "glob":
        return GlobPattern(pattern)
    # Report the offending key (the type), not the pattern body.
    raise ValueError("Unknown pattern type: `{}`.".format(pattern_type))


class PatternList:
    """
    List class of patterns that provides the method `match`.

    Parameters
    ----------
    patterns : Iterable[Pattern]
        Patterns; stored as a list.
    """

    def __init__(self, patterns):
        self.patterns = (
            patterns if isinstance(patterns, list) else list(patterns)
        )

    @classmethod
    def from_obj(cls, patterns_obj):
        """
        Constructs instance from object from YAML.

        objects must be one of like::

            foo

            regex: foo

            - foo
            - glob: bar
            - regexp: baz

        This is used to express `only` or `except` clauses.

        Parameters
        ----------
        patterns_obj : Union[str, Dict[str, str],
                             Iterable[Union[str, Dict[str, str]]]]
            object generated by parsing YAML
        """
        # A lone `foo` or `[pattern type]: foo` is a one-element list.
        if isinstance(patterns_obj, (str, dict)):
            patterns_obj = [patterns_obj]
        return cls(
            [generate_pattern_from_obj(pattern_obj)
             for pattern_obj in patterns_obj]
        )

    def match(self, target):
        """
        Checks if the string match any of patterns.

        Parameters
        ----------
        target : str
            the string to check
        """
        return any(pattern.match(target) for pattern in self.patterns)


class ExtensionList:
    """
    Class of extension list to search for audio files in directories.

    Parameters
    ----------
    extensions : Union[List[str], str]
        extensions of audio.
        e.g. `wav`, [`.mp3`, `opus`], or `.m4a`.

    Raises
    ------
    ValueError
        When `extensions` is neither a string nor a list.
    """

    def __init__(self, extensions):
        if isinstance(extensions, str):
            extensions = [extensions]
        if not isinstance(extensions, list):
            raise ValueError("extensions list must be an instance of list.")
        # Leading dots are removed (`.wav` -> `wav`) so that both spellings
        # are accepted.
        self.extensions = [extension.lstrip(".") for extension in extensions]

    def itemize_in_directory(self, directory):
        """
        Search for audio files with the designated extensions
        in the directory (not recursively).

        Parameters
        ----------
        directory : Path
            The path of the directory where audio files are searched for.

        Returns
        -------
        audio_paths : Generator[Path, None, None]
            paths of audio files.
        """
        for extension in self.extensions:
            yield from directory.glob("*." + extension)


class FilePathFilter:
    """
    Class to filter paths of directories.

    Parameters
    ----------
    only : Optional[PatternList]
        patterns directories must match.
        This corresponds to `only` clauses. `None` disables the check.
    excepted : Optional[PatternList]
        patterns directories must not match.
        This corresponds to `except` clauses. `None` disables the check.
    """

    def __init__(self, only, excepted):
        self.only = only
        self.excepted = excepted

    def _accepts(self, path):
        """
        Checks whether the last component of one path passes the filter.
        """
        name = path.name
        if self.only is not None and not self.only.match(name):
            return False
        if self.excepted is not None and self.excepted.match(name):
            return False
        return True

    def filter(self, path_list):
        """
        Filters list of paths of directories.

        Lets paths that match any of patterns in `only` clause
        and none of patterns in `except` clause pass.
        Only the last component (`Path.name`) is matched.

        Parameters
        ----------
        path_list : Iterable[Path]
            list of paths

        Returns
        -------
        Generator[Path, None, None]
            paths that passed the filter
        """
        for path in path_list:
            if self._accepts(path):
                yield path

    @classmethod
    def from_obj(cls, only, excepted):
        """
        Generates an instance from objects generated by parsing YAML.

        Parameters
        ----------
        only : Union[None, str, Dict[str, str],
                     Iterable[Union[str, Dict[str, str]]]]
            parsed contents in `only` clause.
        excepted : Union[None, str, Dict[str, str],
                         Iterable[Union[str, Dict[str, str]]]]
            parsed contents in `except` clause.
        """
        return cls(
            None if only is None else PatternList.from_obj(only),
            None if excepted is None else PatternList.from_obj(excepted),
        )


class GlobalConfiguration:
    """
    Configuration in `config` clause in configuration file.

    Parameters
    ----------
    config : Dict(YAML-parsed objects)
        Contents in `config` clause.

    Raises
    ------
    ValueError
        When `config` is not a dictionary.
    """

    def __init__(self, config):
        if not isinstance(config, dict):
            raise ValueError("The argument must be a dictionary.")
        # default: only wave files are stored
        self.extensions = ExtensionList(config.get("extensions", ["wav"]))


class DataArchive:
    """
    Corresponds to each recipe in `files` clause.

    Parameters
    ----------
    file_config : Dict[str, Any]
        parsed contents of each element in `files` clause.
        The keys `name`, `src` and `root` are required;
        `only` and `except` are optional.
    global_config : GlobalConfiguration
        configuration built from the `config` clause.
    user_option : UserOption
        User option of this program (verbose etc.).
    """

    def __init__(self, file_config, global_config, user_option):
        self.name = file_config["name"]
        self.src_url = file_config["src"]
        # A leading `/` would make the joined path absolute and throw away
        # the path of the directory where the archive file is extracted.
        self.audio_root_relative = file_config["root"].lstrip("/")
        self.global_config = global_config
        self.user_option = user_option
        self.file_path_filter = FilePathFilter.from_obj(
            file_config.get("only"), file_config.get("except")
        )

    def download(self, dest_root):
        """
        Downloads archive and extracts audio files.

        Parameters
        ----------
        dest_root : Path
            the root path where directories that contains audio files
            are placed.
        """
        with TemporaryDirectory() as working_dir:
            working_dir = Path(working_dir)  # convert from str
            # download archive and extract files in the working directory.
            if self.user_option.verbose:
                print("Downloading", self.name, "from", self.src_url, "...")
            archive_path = DataArchive._download_file(
                self.src_url, working_dir
            )
            if self.user_option.verbose:
                print("Unpack:", archive_path)
            shutil.unpack_archive(str(archive_path), str(working_dir))
            # move audio files to the destination directory.
            self._move_all_audio(
                working_dir / self.audio_root_relative, dest_root
            )

    def _move_all_audio(self, archive_root, dest_root):
        """
        Moves all audio files in archive.

        Parameters
        ----------
        archive_root : Path
            root directory of extracted archive.
        dest_root : Path
            root directory directories that contain audio files are moved to.
        """
        sub_directories = (
            entry for entry in archive_root.iterdir() if entry.is_dir()
        )
        for directory in self.file_path_filter.filter(sub_directories):
            self._move_audio_in_dir(directory, dest_root / directory.name)

    def _move_audio_in_dir(self, src, dest):
        """
        Moves audio files in one directory.

        Parameters
        ----------
        src : Path
            the path of the directory audio files are in.
        dest : Path
            the path of the directory audio files are moved to.
            It is created when it does not exist.
        """
        if self.user_option.verbose:
            print("Move:", src.name)
        os.makedirs(str(dest), exist_ok=True)
        for audio_file in self.global_config.extensions.itemize_in_directory(
            src
        ):
            self._move_file(audio_file, dest / audio_file.name)

    def _move_file(self, src, dest):
        """
        Moves one file.

        An already existing destination file is replaced only when the
        user option `force` is set; otherwise it is kept and `src` is
        left where it is.

        Parameters
        ----------
        src : Path
            the path of the file to move.
        dest : Path
            the path the file is moved to.
        """
        if dest.exists():
            if not self.user_option.force:
                # Without --force existing corpus files must not be touched.
                return
            dest.unlink()
        shutil.move(str(src), str(dest))

    @staticmethod
    def _download_file(url, dest=None):
        """Download and store a remote file.

        Parameters
        ----------
        url : str or path-like
            The URL of the remote file.
        dest : str or path-like or None
            The path where the downloaded file is stored.
            If an existing directory is designated, the file is stored in
            it under the file name taken from the URL that was finally
            retrieved. If None, that file name alone is used as the path.

        Returns
        -------
        The path of the stored file (same type as `dest` when `dest` is
        a directory; `str` when `dest` is None).

        Raises
        ------
        urllib.error.HTTPError
            When the server answers with an HTTP error status.
        """
        with urllib.request.urlopen(url) as request_obj:
            # Use the URL actually retrieved so that the stored file keeps
            # the real archive name (its suffix selects the unpacker).
            real_file_name = os.path.basename(
                urllib.parse.urlparse(request_obj.geturl()).path
            )
            if dest is None:
                dest = real_file_name
            elif os.path.isdir(str(dest)):
                # wrapping in str is for Python 3.5
                dest = type(dest)(os.path.join(str(dest), real_file_name))
            with open(str(dest), "wb") as file_obj:
                shutil.copyfileobj(request_obj, file_obj)
        return dest


class Downloader:
    """
    Class to execute all the processes to extract audio files.

    Parameters
    ----------
    config_path : str or Path
        Path of configuration file for downloading corpus
    user_option : UserOption
        User option designated in arguments of this program (e.g. verbose)

    Raises
    ------
    ValueError
        When the configuration file does not contain a mapping
        at the top level.
    KeyError
        When the configuration has no `files` clause.
    """

    def __init__(self, config_path, user_option):
        # Binary mode lets the YAML parser detect the encoding itself
        # instead of depending on the locale.
        with open(str(config_path), "rb") as f:
            # safe_load: the configuration is plain data, and `yaml.load`
            # without an explicit Loader can construct arbitrary objects
            # (and is rejected by recent PyYAML versions).
            self.all_configs = yaml.safe_load(f)
        if not isinstance(self.all_configs, dict):
            raise ValueError(
                "The configuration file must contain a mapping "
                "at the top level."
            )
        self.user_option = user_option
        self.global_config = GlobalConfiguration(
            self.all_configs.get("config", {})
        )
        self.files = [
            DataArchive(file_info_dic, self.global_config, user_option)
            for file_info_dic in self.all_configs["files"]
        ]

    def download(self, dest):
        """
        Downloads archives and extracts and places all audio files

        Parameters
        ----------
        dest : Path
            directory where audio files and directories are placed
        """
        for archive in self.files:
            archive.download(dest)


def main():
    """
    Parses the command line and downloads the corpus into the directory
    `wav` next to this script.
    """
    args = docopt(__doc__)
    is_verbose = not args["--quiet"]  # Whether prints regular messages
    does_by_force = args["--force"]
    config_path = args["<config>"]
    base_dir = Path(__file__).parent
    wav_root_dir = base_dir / "wav"
    Downloader(config_path, UserOption(is_verbose, does_by_force)).download(
        wav_root_dir
    )


if __name__ == "__main__":
    main()