#!/usr/bin/env python3

"""Downloader of sample audio data

Configuration files are in the directory downloder_conf.

Usage:
    download_speech_corpus.py <config> [-h] [-q] [-f] [-m]

Parameters:
    <config>        The path of configuration file
    -h, --help      Show this help and exit
    -q, --quiet     Don't show any messages about progress
    -f, --force     Overwrite existing corpus files
"""

import os
import re
import shutil
import urllib.parse
import urllib.request
from fnmatch import fnmatchcase
from pathlib import Path
from tempfile import TemporaryDirectory

import yaml
from docopt import docopt


class UserOption:
    """
    Class (structure) that contains flags given in Parameters

    Parameters
    ----------
    verbose : bool
        Whether messages about progress are printed.
        The script passes the inverse of `--quiet` here.
    force : bool
        Whether existing corpus files may be overwritten.
        The script passes the value of `--force` here.
    """

    def __init__(self, verbose=True, force=False):
        self.verbose = verbose
        self.force = force

    def __repr__(self):
        return "{}(verbose={!r}, force={!r})".format(
            self.__class__.__name__, self.verbose, self.force
        )


class FixedStrPattern:
    """
    Pattern that accepts exactly one fixed string.

    It offers the same `match` method as the other pattern classes,
    so all of them can be used interchangeably (duck typing).

    Parameters
    ----------
    expected : str
        the string a target has to be equal to
    """

    def __init__(self, expected):
        self.expected = expected

    def match(self, target):
        """
        Tell whether the given string is equal to the expected one.
        """
        is_same = target == self.expected
        return is_same

    def __repr__(self):
        return "{0}({1!r})".format(type(self).__name__, self.expected)


class RegExPattern:
    """
    Pattern expressed as a regular expression.

    Parameters
    ----------
    expected : str
        source of the regular expression; it is compiled on construction
    """

    def __init__(self, expected):
        self.expected = re.compile(expected)

    def match(self, target):
        """
        Tell whether the regular expression matches at the beginning
        of the given string (the semantics of `re.match`).
        """
        found = self.expected.match(target)
        return found is not None

    def __repr__(self):
        regex_source = self.expected.pattern
        return "{0}({1!r})".format(type(self).__name__, regex_source)


class GlobPattern(FixedStrPattern):
    """
    Pattern expressed as a shell-style wildcard (glob), e.g. ``SF*``.

    The constructor and `__repr__` are inherited from `FixedStrPattern`;
    only `match` is overridden.

    Parameters
    ----------
    expected : str
        the glob expression a target has to match
    """

    def match(self, target):
        """
        Tell whether the given string matches the glob (case-sensitive).
        """
        glob_expression = self.expected
        return fnmatchcase(target, glob_expression)


def generate_pattern_from_obj(pattern_obj):
    """
    Generates an appropriate pattern from object from YAML.

    Parameters
    ----------
    pattern_obj : Union[str, Dict[str, str]]
        Parsed YAML object.  Exactly one of these forms is allowed::

            pattern          -> fixed string
            regex: pattern   -> regexp
            regexp: pattern  -> regexp
            glob: pattern    -> glob

    Returns
    -------
    pattern
        Generated pattern instance.

    Raises
    ------
    ValueError
        When the object is neither a string nor a dictionary with exactly
        one item, or when the key of the dictionary is not a known
        pattern type.
    """
    if isinstance(pattern_obj, str):
        return FixedStrPattern(pattern_obj)
    if not (isinstance(pattern_obj, dict) and len(pattern_obj) == 1):
        raise ValueError("Pattern object must be str or dictionary w/ 1 item.")

    # The only item of the dictionary is `{pattern type: pattern}`.
    pattern_type, pattern = next(iter(pattern_obj.items()))
    if pattern_type in {"regex", "regexp"}:
        return RegExPattern(pattern)
    if pattern_type == "glob":
        return GlobPattern(pattern)
    # Report the offending type (the key), not the pattern itself.
    raise ValueError("Unknown pattern type: `{}`.".format(pattern_type))


class PatternList:
    """
    Collection of patterns that can be matched at once through `match`.

    Parameters
    ----------
    patterns : Iterable[Pattern]
        Patterns to hold.  A list is stored as it is; any other iterable
        is converted into a list.
    """

    def __init__(self, patterns):
        if not isinstance(patterns, list):
            patterns = list(patterns)
        self.patterns = patterns

    @classmethod
    def from_obj(cls, patterns_obj):
        """
        Builds an instance from an object obtained by parsing YAML.

        The object has to look like one of these::

            foo

            regex: foo

            - foo
            - glob: bar
            - regexp: baz

        This is how `only` and `except` clauses are expressed.

        Parameters
        ----------
        patterns_obj : Union[str, Dict[str, str],
        Iterable[Union[str, Dict[str, str]]]]
            object generated by parsing YAML
        """
        # A lone `foo` or `[pattern type]: foo` is handled as a list
        # that has a single element.
        if isinstance(patterns_obj, (str, dict)):
            patterns_obj = [patterns_obj]
        return cls(list(map(generate_pattern_from_obj, patterns_obj)))

    def match(self, target):
        """
        Tells whether the string matches at least one of the patterns.

        Parameters
        ----------
        target : str
            the string to check
        """
        for candidate in self.patterns:
            if candidate.match(target):
                return True
        return False


class ExtensionList:
    """
    List of file extensions used to find audio files in directories.

    Parameters
    ----------
    extensions : Union[List[str], str]
        extensions of audio. e.g. `wav`, [`.mp3`, `opus`], or `.m4a`.
        Leading dots are removed when stored.

    Raises
    ------
    ValueError
        When `extensions` is neither a string nor a list.
    """

    def __init__(self, extensions):
        if isinstance(extensions, str):
            # a single extension is handled as a one-element list
            extensions = [extensions]
        elif not isinstance(extensions, list):
            raise ValueError("extensions list must be an instance of list.")

        normalized = []
        for name in extensions:
            # e.g. `.wav` is stored as `wav`
            normalized.append(name.lstrip("."))
        self.extensions = normalized

    def itemize_in_directory(self, directory):
        """
        Looks for files with the stored extensions directly in the directory.

        Parameters
        ----------
        directory : Path
            The path of the directory where audio files are searched for.

        Returns
        -------
        audio_paths : Generator[Path, None, None]
            paths of audio files, grouped by extension in stored order.
        """
        for suffix in self.extensions:
            for audio_path in directory.glob("*." + suffix):
                yield audio_path


class FilePathFilter:
    """
    Filter of directory paths based on their names.

    Parameters
    ----------
    only : Optional[PatternList]
        patterns a directory name must match (`only` clause).
        `None` disables this condition.
    excepted : Optional[PatternList]
        patterns a directory name must not match (`except` clause).
        `None` disables this condition.
    """

    def __init__(self, only, excepted):
        self.only = only
        self.excepted = excepted

    def filter(self, path_list):
        """
        Yields the paths that pass the filter.

        A path passes when its name matches any pattern of the `only`
        clause and no pattern of the `except` clause.

        Parameters
        ----------
        path_list : Iterable[Path]
            list of paths

        Returns
        -------
        Generator[Path, None, None]
            paths that passed, in the given order
        """
        for path in path_list:
            if self.only is not None and not self.only.match(path.name):
                continue  # rejected by `only` clause
            if self.excepted is not None and self.excepted.match(path.name):
                continue  # rejected by `except` clause
            yield path

    @classmethod
    def from_obj(cls, only, excepted):
        """
        Builds an instance from objects generated by parsing YAML.

        Parameters
        ----------
        only : Union[None, str, Dict[str, str],
        Iterable[Union[str, Dict[str, str]]]]
            parsed contents in `only` clause, or None when it is absent.
        excepted : Union[None, str, Dict[str, str],
        Iterable[Union[str, Dict[str, str]]]]
            parsed contents in `except` clause, or None when it is absent.
        """
        only_patterns = None
        if only is not None:
            only_patterns = PatternList.from_obj(only)

        excepted_patterns = None
        if excepted is not None:
            excepted_patterns = PatternList.from_obj(excepted)

        return cls(only_patterns, excepted_patterns)


class GlobalConfiguration:
    """
    Settings given in `config` clause of the configuration file.

    Parameters
    ----------
    config : Dict(YAML-parsed objects)
        Contents in `config` clause.  The key `extensions` is read.

    Raises
    ------
    ValueError
        When `config` is not a dictionary.
    """

    def __init__(self, config):
        if isinstance(config, dict):
            # When no extension is designated, only wave files are stored.
            audio_extensions = config.get("extensions", ["wav"])
            self.extensions = ExtensionList(audio_extensions)
        else:
            raise ValueError("The argument must be a dictionary.")


class DataArchive:
    """
    Corresponds to each recipe in `files` clause.

    Parameters
    ----------
    file_config: Dict[str, Any]
        parsed contents of each element in `files` clause.
        The keys `name`, `src` and `root` are required;
        `only` and `except` are optional.
    global_config : GlobalConfiguration
        configuration built from `config` clause.
        Its `extensions` attribute is used to find audio files.
    user_option : UserOption
        User option of this program (verbose etc.).
    """

    def __init__(self, file_config, global_config, user_option):
        self.name = file_config["name"]
        self.src_url = file_config["src"]
        # Strip leading `/`: joining an absolute path to the working
        # directory would throw away the path of the directory
        # where the archive file is extracted.
        self.audio_root_relative = file_config["root"].lstrip("/")
        self.global_config = global_config
        self.user_option = user_option
        self.file_path_filter = FilePathFilter.from_obj(
            file_config.get("only"), file_config.get("except")
        )

    def download(self, dest_root):
        """
        Downloads archive and extracts audio files.

        The archive is downloaded and unpacked in a temporary directory
        that is removed afterwards; only the audio files are kept.

        Parameters
        ----------
        dest_root : Path
            the root path where directories
            that contains audio files are placed.
        """
        with TemporaryDirectory() as working_dir:
            working_dir = Path(working_dir)  # convert from str

            # download archive and extract files in the working directory.
            if self.user_option.verbose:
                print("Downloading", self.name, "from", self.src_url, "...")
            archive_path = self._download_file(self.src_url, working_dir)
            if self.user_option.verbose:
                print("Unpack:", archive_path)
            # NOTE(review): the archive comes from a URL written in the
            # configuration file.  `unpack_archive` does not prevent members
            # from being created outside the extraction directory, so only
            # trusted sources should be configured.
            shutil.unpack_archive(str(archive_path), str(working_dir))

            # move audio files to the destination directory.
            self._move_all_audio(
                working_dir / self.audio_root_relative, dest_root
            )

    def _move_all_audio(self, archive_root, dest_root):
        """
        Moves all audio files in archive.

        Only the directories directly under `archive_root` that pass
        the `only`/`except` filter are processed.

        Parameters
        ----------
        archive_root : Path
            root directory of extracted archive.
        dest_root : Path
            root directory directories that contain audio files are moved to.
        """
        sub_directories = (
            entry for entry in archive_root.iterdir() if entry.is_dir()
        )
        for directory in self.file_path_filter.filter(sub_directories):
            self._move_audio_in_dir(directory, dest_root / directory.name)

    def _move_audio_in_dir(self, src, dest):
        """
        Moves audio files in one directory.

        Parameters
        ----------
        src : Path
            the path of the directory audio files are in.
        dest : Path
            the path of the directory audio files are moved to.
            It is created when it does not exist.
        """
        if self.user_option.verbose:
            print("Move:", src.name)
        os.makedirs(str(dest), exist_ok=True)
        for audio_file in self.global_config.extensions.itemize_in_directory(
            src
        ):
            self._move_file(audio_file, dest / audio_file.name)

    def _move_file(self, src, dest):
        """
        Moves one file.

        An existing destination is replaced only when the user option
        `force` is set; otherwise the file is left untouched and
        the source file is not moved.

        Parameters
        ----------
        src : Path
            the path of the file to move.
        dest : Path
            the path the file is moved to.
        """
        if dest.exists():
            if not self.user_option.force:
                # `shutil.move` would silently overwrite the existing file,
                # which must happen only when overwriting is requested.
                if self.user_option.verbose:
                    print("Skip (already exists):", dest)
                return
            dest.unlink()
        shutil.move(str(src), str(dest))

    @staticmethod
    def _download_file(url, dest=None):
        """Download and store a remote file.

        Parameters
        ----------
        url : str or path-like
            The URL of the remote file.
        dest : str or path-like or None
            The path where the downloaded file is stored.
            If an existing directory is designated, the file is stored in it
            under the file name taken from the URL.  If None, the file is
            stored under that file name relative to the current directory.

        Returns
        -------
        The path of the stored file.

        Raises
        ------
        urllib.error.HTTPError
            When the status code is not 200 or 30*.
        ValueError
            When a file name is needed but the URL does not contain one.
        """
        with urllib.request.urlopen(url) as response:
            # The URL of the response is used, not the requested one.
            real_file_name = os.path.basename(
                urllib.parse.urlparse(response.geturl()).path
            )
            if dest is None or os.path.isdir(str(dest)):
                if not real_file_name:
                    raise ValueError(
                        "Cannot determine a file name from URL: {}".format(
                            response.geturl()
                        )
                    )
                if dest is None:
                    dest = real_file_name
                else:
                    # wrapping in str is for Python 3.5
                    dest = type(dest)(os.path.join(str(dest), real_file_name))
            with open(str(dest), "wb") as file_obj:
                shutil.copyfileobj(response, file_obj)
        return dest


class Downloader:
    """
    Class to execute all the processes to extract audio files.

    Parameters
    ----------
    config_path : Path
        Path of configuration file for downloading corpus
    user_option : UserOption
        User option designated in arguments of this program (e.g. verbose)

    Raises
    ------
    ValueError
        When the top level of the configuration file is not a mapping.
    KeyError
        When the configuration file has no `files` clause.
    """

    def __init__(self, config_path, user_option):
        # wrapping in str is for Python 3.5
        with open(str(config_path)) as f:
            # `yaml.load` without a Loader is rejected by PyYAML 6 and can
            # construct arbitrary objects on older versions; the configuration
            # consists of plain strings, lists and mappings only.
            self.all_configs = yaml.safe_load(f)
        if not isinstance(self.all_configs, dict):
            raise ValueError(
                "The configuration file must contain a mapping at top level."
            )
        self.user_option = user_option
        self.global_config = GlobalConfiguration(
            self.all_configs.get("config", {})
        )
        self.files = [
            DataArchive(file_info_dic, self.global_config, user_option)
            for file_info_dic in self.all_configs["files"]
        ]

    def download(self, dest):
        """
        Downloads archives and extracts and places all audio files

        Parameters
        ----------
        dest : Path
            directory where audio files and directories are placed
        """
        for archive in self.files:
            archive.download(dest)


if __name__ == "__main__":
    # The command line is parsed against the module docstring.
    cli_args = docopt(__doc__)
    user_option = UserOption(
        verbose=not cli_args["--quiet"],  # regular messages unless quiet
        force=cli_args["--force"],
    )

    # Audio files are gathered in `wav` next to this script.
    destination = Path(__file__).parent / "wav"

    downloader = Downloader(cli_args["<config>"], user_option)
    downloader.download(destination)