python source code of Downloader

import os
import shutil
import urllib
import urllib.parse as urlparse
import urllib.request as urllib2
from glob import glob
from zipfile import ZipFile

from lxml import etree
from tqdm import tqdm

from omrdatasettools.OmrDataset import OmrDataset


class Downloader:
    """ The class for downloading OMR datasets. It downloads the selected dataset from Github and extracts it to
        a specified directory.
    """

    def download_and_extract_dataset(self, dataset: OmrDataset, destination_directory: str):
        """ Starts the download of the dataset and extracts it into the specified directory.

        Examples
        --------
        >>> from omrdatasettools import Downloader, OmrDataset
        >>> downloader = Downloader()
        >>> downloader.download_and_extract_dataset(OmrDataset.HOMUS_V2, "data")

        """
        self.download_and_extract_custom_dataset(dataset.name, dataset.get_dataset_download_url(),
                                                 dataset.get_dataset_filename(), destination_directory)

        if dataset is OmrDataset.Fornes:
            self.__fix_capital_file_endings(os.path.join(os.path.abspath(destination_directory), "Music_Symbols"))

        if dataset in [OmrDataset.MuscimaPlusPlus_V1, OmrDataset.MuscimaPlusPlus_V2]:
            self.__download_muscima_pp_images(dataset, destination_directory)

    def download_and_extract_custom_dataset(self, dataset_name: str, dataset_url: str, dataset_filename: str,
                                            destination_directory: str):
        """ Starts the download of a custom dataset and extracts it into the specified directory.

        Examples
        --------
        >>> from omrdatasettools import Downloader
        >>> downloader = Downloader()
        >>> downloader.download_and_extract_custom_dataset("MyNewOmrDataset", "https://example.org/dataset.zip", "dataset.zip", "data/MyNewOmrDataset")

        """
        if not os.path.exists(dataset_filename):
            print("Downloading {0} dataset...".format(dataset_name))
            self.download_file(dataset_url, dataset_filename)

        print("Extracting {0} dataset...".format(dataset_name))
        self.extract_dataset(os.path.abspath(destination_directory), dataset_filename)

    def download_images_from_mei_annotation(self, dataset: OmrDataset, dataset_directory: str, base_url: str):
        """ Crawls the images of an Edirom dataset, if provided with the respective URL. To avoid repetitive crawling,
            this URL has to be provided manually. If you are interested in these datasets, please contact the authors.

            Examples
            --------
            >>> from omrdatasettools import Downloader, OmrDataset
            >>> downloader = Downloader()
            >>> downloader.download_and_extract_dataset(OmrDataset.Edirom_Bargheer, "data/Bargheer")
            >>> downloader.download_images_from_mei_annotation(OmrDataset.Edirom_Bargheer, "data/Bargheer", "INSERT_DATASET_URL_HERE")

            or

            >>> downloader.download_and_extract_dataset(OmrDataset.Edirom_FreischuetzDigital, "data/Freischuetz")
            >>> downloader.download_images_from_mei_annotation(OmrDataset.Edirom_FreischuetzDigital, "data/Freischuetz", "INSERT_DATASET_URL_HERE")
            """
        if dataset not in [OmrDataset.Edirom_Bargheer, OmrDataset.Edirom_FreischuetzDigital]:
            raise Exception("Only supported for edirom datasets")

        if len(glob(f'{dataset_directory}/*.xml')) == 0:
            print(
                f"Could not find MEI (XML) files in {dataset_directory}/ directory. Can't download images.")

        for source in glob(f'{dataset_directory}/*.xml'):
            base = os.path.splitext(source)[0]
            os.makedirs(base, exist_ok=True)
            print("Downloading dataset for " + base)
            self.__download_edirom_images(base, base_url, source)

    def __download_edirom_images(self, base, base_url, source):
        xml = etree.parse(source).getroot()

        for graphic in tqdm(xml.xpath('//*[local-name()="graphic"]'), desc="Downloading images"):
            url = graphic.get('target')
            filename = os.path.basename(url)
            width = graphic.get('width')
            if os.path.exists(os.path.join(base, filename)):
                pass  # Skipping download, because it has been downloaded already
            else:
                urllib.request.urlretrieve(f"{base_url}/{url}?dw={width}&amp;mo=fit", os.path.join(base, filename))

    def __download_muscima_pp_images(self, dataset: OmrDataset, destination_directory: str):
        # Automatically download the images and measure annotations with the MUSCIMA++ dataset
        muscima_pp_images_filename = dataset.dataset_file_names()["MuscimaPlusPlus_Images"]
        if not os.path.exists(muscima_pp_images_filename):
            print("Downloading MUSCIMA++ images")
            self.download_file(dataset.dataset_download_urls()["MuscimaPlusPlus_Images"], muscima_pp_images_filename)
        absolute_path_to_temp_folder = os.path.abspath('MuscimaPpImages')
        self.extract_dataset(absolute_path_to_temp_folder, muscima_pp_images_filename)
        if dataset is OmrDataset.MuscimaPlusPlus_V1:
            target_folder = os.path.join(os.path.abspath(destination_directory), "v1.0", "data", "images")
        if dataset is OmrDataset.MuscimaPlusPlus_V2:
            target_folder = os.path.join(os.path.abspath(destination_directory), "v2.0", "data", "images")
        self.copytree(os.path.join(absolute_path_to_temp_folder, "fulls"), target_folder)
        self.clean_up_temp_directory(absolute_path_to_temp_folder)

    def __fix_capital_file_endings(self, absolute_path_to_temp_folder):
        image_with_capital_file_ending = [y for x in os.walk(absolute_path_to_temp_folder) for y in
                                          glob(os.path.join(x[0], "*.BMP"))]
        for image in image_with_capital_file_ending:
            os.rename(image, image[:-3] + "bmp")

    @staticmethod
    def copytree(src, dst):
        if not os.path.exists(dst):
            os.makedirs(dst)
        for item in os.listdir(src):
            s = os.path.join(src, item)
            d = os.path.join(dst, item)
            if os.path.isdir(s):
                Downloader.__copytree(s, d)
            else:
                if not os.path.exists(d) or os.stat(s).st_mtime - os.stat(d).st_mtime > 1:
                    shutil.copy2(s, d)

    @staticmethod
    def extract_dataset(absolute_path_to_folder: str, dataset_filename: str):
        archive = ZipFile(dataset_filename, "r")
        archive.extractall(absolute_path_to_folder)
        archive.close()

    @staticmethod
    def clean_up_temp_directory(temp_directory):
        print("Deleting temporary directory {0}".format(temp_directory))
        shutil.rmtree(temp_directory, ignore_errors=True)

    @staticmethod
    def download_file(url, destination_filename=None) -> str:
        u = urllib2.urlopen(url)
        scheme, netloc, path, query, fragment = urlparse.urlsplit(url)
        filename = os.path.basename(path)
        if not filename:
            filename = 'downloaded.file'
        if destination_filename:
            filename = destination_filename

        filename = os.path.abspath(filename)

        with open(filename, 'wb') as f:
            meta = u.info()
            meta_func = meta.getheaders if hasattr(meta, 'getheaders') else meta.get_all
            meta_length = meta_func("Content-Length")
            file_size = None
            if meta_length:
                file_size = int(meta_length[0])
            print("Downloading: {0} Bytes: {1} into {2}".format(url, file_size, filename))

            with tqdm(total=file_size, desc="Downloading (bytes)") as progress_bar:
                file_size_dl = 0
                block_sz = 8192
                while True:
                    buffer = u.read(block_sz)
                    if not buffer:
                        break

                    file_size_dl += len(buffer)
                    f.write(buffer)
                    if file_size:
                        progress_bar.update(len(buffer))
            print()

        return filename