# python source code of filelister

"""
Copyright (c) 2017 Wind River Systems, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at:

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software  distributed
under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
OR CONDITIONS OF ANY KIND, either express or implied.
"""

import shutil
import zipfile
import tarfile
import gzip
import bz2
import lzma
import re
import glob
import tempfile
import time
import os
from os.path import join, relpath, basename, abspath, exists, isfile, \
    isdir, dirname, normpath, islink
from os import pardir, makedirs, walk, remove
from urllib.request import urlopen, URLError, HTTPError
from urllib.parse import urlparse
from cryptodetector import Output, is_rpm, extract_rpm, Logger
from cryptodetector.exceptions import InvalidPackageException, ExtractError, \
    DownloadError, FileWriteException

class FileLister():
    """Class for gathering the list of files for each package. A package can be a local archive,
    directory, wild-card address, link to a remote archive, or a git repository link.
    """
    # Matches GitHub addresses written either as "git@github.com:owner/repo" or as an
    # http(s) URL. Group 1 is the address prefix, group 2 the owner and group 3 the
    # repository name without a trailing ".git".
    GITHUB_REGEX = r"((?:git@github\.com\:)|(?:http[s]?://github.com/))" \
        + r"([^/]+)\/((?:(?!\.git)[^/])+)(?:\.git)?"

    # Temporary directories created through any instance (see create_tmp_directory); shared at
    # class level so cleanup_all_tmp_files can remove them without an instance.
    all_temp_dirs = set()

    def __init__(self, packages,
                 skip_existing=False,
                 output_directory=None,
                 output_in_package_directory=False):
        """Initializer

        Args:
            packages: (list)
            skip_existing: (bool) whether we should skip listing package if output crypto file
                already exists.
            output_directory: (string)
            output_in_package_directory: (bool) whether output crypto file is placed in the same
                directory as the package.

        Returns:
            None
        """
        self.base_tmp = join(tempfile.gettempdir(),"cryptodetector")
        makedirs(self.base_tmp,exist_ok=True)
        self.tmp_directories = set()
        FileLister.validate_package_list(packages)
        self.skip_existing = skip_existing
        self.output_directory = output_directory
        self.output_in_package_directory = output_in_package_directory

    def get_package_filelist(self, package):
        """Gather list of files in a package

        Args:
            package: (string) can specify a file, folder, wild-card, github, or a url

        Returns:
            (list) A list of file-lists, one file-list for each package found in `package` given
                above. In most cases, it is a list of only one file-list. But when package is a
                wild-card address, it can consist of multiple packages.

                A file-list is a dict object containing "package_name", "package_root", and
                "file_list". package_name is a string for name of the package, package_root is the
                directory containing the package (None if package is not a local one), and
                "file_list" is a list of in the package. A file is a dict with two keys
                "display_path" and "physical_path". "display_path" is the path that's shown to the
                user, but might not neccessarily be where the file physically resides, whereas
                "physical_path" is where file can be accessed. For example,
                "/path/arch.tar.gz/file.cpp" is a display_path, while "/tmp/cryptodetector/file.cpp"
                is the physcial_path.

                As an example, it could be:
                [{
                    "package_name": "test.tar.gz",
                    "package_root": "/home",
                    "file_list": [
                        {
                         "display_path": "/home/test.tar.gz/file1.cpp"
                         "physical_path": "/tmp/cryptodetector/file.cpp"
                        },
                        {
                         "display_path": "/home/test.tar.gz/file2.cpp"
                         "physical_path": "/tmp/cryptodetector/file2.cpp"
                        },
                        ...
                    ]
                }]
        """
        if isfile(package):
            return self.list_file(package)

        elif isdir(package):
            return self.list_directory(package)

        elif FileLister.is_wild_card(package):
            return self.list_wildcard(package)

        elif FileLister.is_github_address(package):
            return self.list_github_master(package)

        elif FileLister.is_url(package):
            return self.list_url(package)

    @staticmethod
    def validate_package_list(package_list):
        """Validate list of packages

        Args:
            package_list: (list) list of strings specifying packages

        Returns:
            None

        Raises:
            InvalidPackageException
        """
        for package in package_list:
            if not(isfile(package) or \
                   isdir(package) or \
                   FileLister.is_wild_card(package) or \
                   FileLister.is_github_address(package) or \
                   is_rpm(package) or \
                   FileLister.is_url(package)):
                raise InvalidPackageException("Invalid package: " + package \
                    + ". It wasn't a file, directory, an archive, " \
                    + "a wild-card expression, github address, or a URL.")

    def skip_package(self, package_name, package_root):
        """Check to see if we should skip listing this package if the crypto file already exists

        Args:
            package_name: (string) package name
            package_root: (string) the directory where package is located. If package is not
                a local one, this is None

        Returns:
            (bool) if we should skip listing this package
        """
        if not self.skip_existing:
            return False

        output_directory = self.output_directory
        if package_root and self.output_in_package_directory:
            output_directory = package_root

        crypto_file_path = join(output_directory, package_name + ".crypto")
        crypto_exists = isfile(crypto_file_path)

        if crypto_exists:
            skip_message = "Found a crypto file for package " \
                + package_name + " at " + crypto_file_path + ". Will skip scanning this package."
            Output.print_information(skip_message)
            Logger.log(skip_message)
        return crypto_exists

    def list_file(self, file_path, tmp_root_path="", current_path=""):
        """List a single file as package

        Args:
            file_path: (string) file path
            tmp_root_path: (string) if file is in a tmp directory, this is the address of that
                directory, otherwise null.
            current_path: (string) current address within the temporary directory. If we are not in
                a tmp directory, this is also null. This is used to compute the display path.

        Returns:
            (list) a list containing one file-list for this file.
        """
        archive_type = FileLister.archive_type(file_path)

        package_name = basename(file_path)
        package_root = abspath(dirname(file_path))

        if tmp_root_path:
            package_root = None

        if self.skip_package(package_name, package_root):
            return []

        # if this is itself a cyrpto file, don't list it as a package
        if file_path.split(".")[-1] == "crypto":
            Output.print_information("\nThe file " + file_path + " has a .crypto extention. " \
                + "This is reserved for the output of this program. Will not list this file " \
                + "as a package.")
            return []

        if tmp_root_path:
            display_path = join(current_path, relpath(file_path, tmp_root_path))
        else:
            display_path = abspath(file_path)

        if archive_type:
            tmp_dir = self.create_tmp_directory(package_name)
            FileLister.extract_archive(archive_type, file_path, display_path, tmp_dir)

            return self.list_directory(tmp_dir, package_name, tmp_root_path=tmp_dir, \
                current_path=display_path, _package_root=package_root)

        else:
            display_path = file_path
            if tmp_root_path:
                display_path = join(current_path, relpath(file_path, tmp_root_path))

            return [{
                "package_name": package_name,
                "package_root": package_root,
                "file_list": [{"display_path": display_path, "physical_path": file_path}]
            }]

    def list_directory(self, path, package_name=None, tmp_root_path="", current_path="", \
        _package_root=None):
        """List a directory as a package

        Args:
            path: (string) path of the directory
            package_name: (string) name of the package
            tmp_root_path: (string) if file is in a tmp directory, this is the address of that
                directory, otherwise null.
            current_path: (string) current address within the temporary directory. If we are not in
                a tmp directory, this is also null. This is used to compute the display path.
            _package_root: (string) when listing a local archive, this is used to keep track of its
                parent directory

        Returns:
            (list) a list containing one file-list for this directory.
        """
        if not package_name:
            package_name = basename(normpath(path))

        package_root = None
        if not tmp_root_path:
            package_root = abspath(join(path, pardir))
        elif _package_root:
            package_root = _package_root

        if self.skip_package(package_name, package_root):
            return []

        return [{
            "package_name": package_name,
            "package_root": package_root,
            "file_list": self.get_directory_filelist(path, tmp_root_path, current_path)
        }]

    def get_directory_filelist(self, path, tmp_root_path, current_path):
        """Recursively list all the files in a directory, extracting all the archives inside.

        Args:
            path: (string) path of the directory
            tmp_root_path: (string) if the directory is inside of a tmp directory, this is the
                address of that directory, otherwise null.
            current_path: (string) current address within the temporary directory. If we are not in
                a tmp directory, this is also null. This is used to compute the display path.

        Returns:
            (list) a list of files, where each file is a dict with two keys "display_path" and
            "physical_path". "display_path" is the path that's shown to the user and "physical_path"
            is where file can be accessed.
        """
        file_list = []

        for dirpath, _, filenames in walk(path, followlinks=False):
            for filename in filenames:
                full_path = abspath(join(dirpath, filename))
                if islink(full_path):
                    Output.print_warning("Skipping symbolic link: " + full_path)
                    continue

                archive_type = FileLister.archive_type(full_path)

                if archive_type:
                    tmp_dir = self.create_tmp_directory(full_path)

                    if tmp_root_path:
                        display_path = join(current_path, relpath(full_path, tmp_root_path))
                    else:
                        display_path = full_path

                    try:
                        FileLister.extract_archive(archive_type, full_path, display_path, tmp_dir)
                    except ExtractError as expn:
                        Output.print_error(str(expn))
                        continue

                    file_list.extend(self.get_directory_filelist(tmp_dir, \
                        tmp_root_path=tmp_dir, current_path=display_path))
                else:
                    if tmp_root_path:
                        file_list.append({
                            "display_path": join(current_path, relpath(full_path, tmp_root_path)),
                            "physical_path": full_path
                        })
                    else:
                        file_list.append({"display_path": full_path, "physical_path": full_path})

        return file_list

    def list_url(self, url):
        """Download the file at the given URL and list it

        The file is downloaded into a fresh temporary directory and then listed as a file
        living in that directory.

        Args:
            url: (string)

        Returns:
            (list) a list containing one file-list for this url.
        """
        download_dir = self.create_tmp_directory(url)
        downloaded = FileLister.download_file(url, download_dir)
        return self.list_file(downloaded, tmp_root_path=download_dir)

    def list_github_master(self, github_address):
        """Download the master branch from GitHub and list it

        The owner and repository are taken from the address, the master branch is fetched as
        a zip archive, extracted into a temporary directory and listed as a directory. The
        downloaded zip file is deleted after extraction.

        Args:
            github_address: (string)

        Returns:
            (list) a list containing one file-list for the master branch of this GitHub link,
                or an empty list when the package is skipped.
        """
        owner, repo = re.search(FileLister.GITHUB_REGEX, github_address).group(2, 3)
        package_name = owner + "-" + repo + "-master"

        if self.skip_package(package_name, package_root=None):
            return []

        archive_url = "https://github.com/" + owner + "/" + repo + "/archive/master.zip"
        scratch = self.create_tmp_directory(archive_url)
        zip_path = FileLister.download_file(archive_url, scratch)

        FileLister.extract_zip(zip_path, package_name + " /master.zip", scratch)
        # the archive itself must not show up in the listing
        remove(zip_path)

        return self.list_directory(scratch, package_name, scratch)

    def list_wildcard(self, wildcard_path):
        """Add every path in the wild-card expansion

        Args:
            wildcard_path: (string)

        Returns:
            (list) a list of multiple file-lists, one for each package found in the
                wild-card address
        """
        result = []
        for path in glob.glob(wildcard_path):
            if isfile(path):
                result.extend(self.list_file(path))
            else:
                result.extend(self.list_directory(path))
        return result

    def list_rpm(self, rpm_file_path):
        """Extract an RPM archive and list its files.

        Args:
            rpm_file_path: (string)

        Returns:
            (list) a list containing one file-list containing all the files in the RPM archive
        """
        package_name = basename(rpm_file_path)
        tpm_dir = self.create_tmp_directory(package_name)
        FileLister.extract_rpm_archive(rpm_file_path, tpm_dir)
        return self.list_directory(tpm_dir, package_name, tpm_dir)

    @staticmethod
    def is_wild_card(path):
        """Determine if path is a wild-card address

        Args:
            path: (string)

        Returns:
            (bool) whether path is a wild-card address
        """
        return bool(next(glob.iglob(path), None))

    @staticmethod
    def is_github_address(address):
        """Determine if address is a github address

        Args:
            address: (string)

        Returns:
            (bool) whether address is a github address
        """
        return bool(re.search(FileLister.GITHUB_REGEX, address))

    @staticmethod
    def is_url(url):
        """Determine if a string is a valid URL

        Args:
            url: (string)

        Returns:
            (bool) whether the given url is valid
        """
        parsed_url = urlparse(url)
        return bool(parsed_url.scheme) and bool(parsed_url.netloc)

    @staticmethod
    def compression_library_reads(library, archive_path):
        """Checks to see if the given library can read the file at archive_path

        Args:
            library: (module)
            archive_path: (string)

        Returns:
            (bool)
        """
        try:
            with library.open(archive_path) as archive_file:
                compressed_payload = archive_file.read()
                return len(compressed_payload) > 5
        except:
            return False

    @staticmethod
    def archive_type(archive_path):
        """Determine if the file at archive_path is a compressed archive file

        Args:
            archive_path: (string)

        Returns:
            (string) the type of archive or None
        """

        archive_tests = [
            ("zip", lambda path: zipfile.is_zipfile(path)),
            ("tar", lambda path: tarfile.is_tarfile(path)),
            ("rpm", lambda path: is_rpm(path)),
            ("gzip", lambda path: FileLister.compression_library_reads(gzip, path)),
            ("bz2", lambda path: FileLister.compression_library_reads(bz2, path)),
            ("lzma", lambda path: FileLister.compression_library_reads(lzma, path))
            ]

        for archive_type_, archive_library in archive_tests:
            is_archive_file = False
            try:
                is_archive_file = archive_library(archive_path)
            except:
                pass
            if is_archive_file:
                return archive_type_

        return None

    @staticmethod
    def extract_archive(archive_type, full_path, display_path, tmp_dir):
        """Extract the given archive at tmp_dir"""
        if archive_type == "zip":
            FileLister.extract_zip(full_path, display_path, tmp_dir)
        elif archive_type == "tar":
            FileLister.extract_tar(full_path, display_path, tmp_dir)
        elif archive_type == "rpm":
            FileLister.extract_rpm_archive(full_path, display_path, tmp_dir)
        elif archive_type == "gzip":
            FileLister.extract_by_library(gzip, full_path, display_path, tmp_dir)
        elif archive_type == "bz2":
            FileLister.extract_by_library(bz2, full_path, display_path, tmp_dir)
        elif archive_type == "lzma":
            FileLister.extract_by_library(lzma, full_path, display_path, tmp_dir)

        # for some reason, when we get here, sometimes extraction is not fully finished, and
        # somemtimes it is. The operating systems sometimes isn't fully ready to read all the files.
        # Here we rename the parent directory to make extraction an atomic operation.
        while True:
            try:
                os.rename(tmp_dir, tmp_dir)
                break
            except:
                # This rename operating can fail. For example, when a file is still open
                # by another process, Windows will not allow anyone to rename the directory in which
                # that file resides. So here we sleep for one second to retry again.
                time.sleep(1)


    @staticmethod
    def extract_zip(zip_file_path, display_path, output_directory):
        """Unpack a zip file into output_directory

        Args:
            zip_file_path: (string) physical path of the zip file
            display_path: (string) file path that should be displayed to the user
            output_directory: (string) directory that receives the content

        Returns:
            None

        Raises:
            ExtractError: for any failure while opening or extracting the archive
        """
        Output.print_information("Extracting zip archive " + display_path + " ...")

        try:
            with zipfile.ZipFile(zip_file_path) as archive:
                archive.extractall(path=output_directory)
        except Exception as error:
            failure = "Failed to extract zip archive " + display_path + "\n" + str(error)
            raise ExtractError(failure)

    @staticmethod
    def extract_tar(tar_file_path, display_path, output_directory):
        """Extract a tar archive

        Archives can come from untrusted sources (downloads), so on interpreters that provide
        tarfile extraction filters every member is passed through tarfile.data_filter.
        Members the filter rejects, such as paths or links escaping output_directory and
        device files, are skipped with a warning while the rest of the archive is extracted.

        Args:
            tar_file_path: (string) physical path of file on the hardware
            display_path: (string) file path that should be displayed to the user
            output_directory: (string)

        Returns:
            None

        Raises:
            ExtractError
        """
        Output.print_information("Extracting tar archive " + display_path + " ...")

        def skip_unsafe_members(member, destination):
            """Apply the "data" filter; returning None makes extractall skip the member."""
            try:
                return tarfile.data_filter(member, destination)
            except tarfile.FilterError as filter_error:
                Output.print_warning("Skipping unsafe member of tar archive " + display_path \
                    + ": " + str(filter_error))
                return None

        try:
            with tarfile.open(tar_file_path) as tar_file:
                if hasattr(tarfile, "data_filter"):
                    tar_file.extractall(output_directory, filter=skip_unsafe_members)
                else:
                    # NOTE(security): this interpreter has no extraction filters, so members
                    # are extracted without path checks. Upgrade Python to get protection.
                    tar_file.extractall(output_directory)
        except Exception as expn:
            raise ExtractError("Failed to extract tar archive " + display_path + "\n" \
                + str(expn)) from expn

    @staticmethod
    def extract_by_library(library, archive_path, display_path, output_directory):
        """Extracts the given archive file to the output directory using the given library

        The decompressed file is named after the archive with its last extension removed
        (e.g. "data.txt.gz" becomes "data.txt"). The payload is streamed to disk rather than
        held in memory as a whole.

        Args:
            library: (module) a module with a gzip-like open() function, e.g. gzip, bz2 or lzma
            archive_path: (string) physical path of file on the hardware
            display_path: (string) file path that should be displayed to the user
            output_directory: (string)

        Returns:
            None

        Raises:
            ExtractError
        """
        library_name = library.__name__
        Output.print_information("Extracting " + library_name + " archive " \
            + display_path + " ...")

        # Remove the last extension from the file name. splitext leaves a name like ".gz"
        # untouched instead of reducing it to an empty, unusable file name.
        filename = os.path.splitext(basename(archive_path))[0]
        target_path = abspath(join(output_directory, filename))

        try:
            with library.open(archive_path, "rb") as archive_file, \
                    open(target_path, "wb") as decomp_file:
                shutil.copyfileobj(archive_file, decomp_file)
        except Exception as expn:
            raise ExtractError("Failed to extract " + library_name + " archive " \
                + display_path + "\n" + str(expn)) from expn

    @staticmethod
    def extract_rpm_archive(archive_path, display_path, output_directory):
        """Unpack an RPM archive into output_directory

        Args:
            archive_path: (string) physical path of the RPM file
            display_path: (string) file path that should be displayed to the user
            output_directory: (string) directory that receives the content

        Returns:
            None

        Raises:
            ExtractError: for any failure reported by extract_rpm
        """
        Output.print_information("Extracting RPM archive " + display_path + " ...")

        try:
            extract_rpm(archive_path, output_directory)
        except Exception as error:
            failure = "Failed to extract RPM archive " + display_path + "\n\n" + str(error)
            raise ExtractError(failure)

    @staticmethod
    def download_file(url, download_directory):
        """Download a remote file

        The local file is named after the last non-empty component of the URL path, or after
        the host when the path has no such component (e.g. "http://host/" or
        "http://host/dir/" give "host" and "dir" respectively).

        Args:
            url: (string) address of the file to download
            download_directory: (string) existing directory that receives the file

        Returns:
            (string) the path of the file that was just downloaded

        Raises:
            DownloadError: for any failure while connecting, reading or writing
        """
        Output.print_information("Downloading " + url + " ...")

        parsed_url = urlparse(url)
        # ignore trailing slashes so that ".../name/" does not yield an empty file name
        file_name = parsed_url.path.rstrip("/").split("/")[-1] or parsed_url.netloc
        download_path = abspath(join(download_directory, file_name))

        try:
            # The connection is opened first, so a failed request leaves no empty file
            # behind, and both the response and the file are closed on every path.
            with urlopen(url) as response, open(download_path, 'wb') as file_object:
                shutil.copyfileobj(response, file_object)
            return download_path

        except HTTPError as expn:
            raise DownloadError("HTTP error code " + str(expn.code) + " while retrieving " \
             + url + "\n" + str(expn.reason)) from expn
        except URLError as expn:
            raise DownloadError("HTTP URL error while retrieving " + url + "\n" \
                + str(expn.reason)) from expn
        except Exception as expn:
            raise DownloadError("Unable to retrieve " + url + "\n" + str(expn)) from expn

    def create_tmp_directory(self, dir_name):
        """Create a temporary directory

        Args:
            dir_name: (string) directory name

        Returns:
            (string) full path of the newly created tmp directory

        Raises:
            FileWriteException
        """
        try:
            tmp_dir = tempfile.mkdtemp(dir=self.base_tmp)
        except Exception as expn:
            raise FileWriteException("Failed to create temporary directory " + tmp_dir \
                + "\n" + str(expn))
        else:
            self.tmp_directories.add(tmp_dir)
            FileLister.all_temp_dirs.add(tmp_dir)

        return tmp_dir

    @staticmethod
    def set_tree_perms(tdir):
        """ Set permissions so we can delete files and directories"""
        import stat

        os.chmod(tdir,stat.S_IRWXU)
        for root,dirs,files in os.walk(tdir):
            for n in dirs:
                dd = join(root,n)
                os.chmod(dd,stat.S_IRWXU)
            for n in files:
                dd = join(root,n)
                os.chmod(dd,(stat.S_IRUSR | stat.S_IWUSR))

    def cleanup_tmp_folder(self):
        """Clean up temporary folder

        Args:
            None

        Returns:
            None
        """
        lose = set()
        # do not modify tmp_directories during the loop
        for tmp_dir in self.tmp_directories:
            if exists(tmp_dir):
                try:
                    shutil.rmtree(tmp_dir)
                except:
                    # directories that cannot be searched cause problems
                    try:
                        FileLister.set_tree_perms(tmp_dir)
                        shutil.rmtree(tmp_dir)
                    except Exception as e:
                        Output.print_warning("Temp directory %s was not removed (%s)" % (tmp_dir,str(e)))
                        continue
            lose.add(tmp_dir)

        FileLister.all_temp_dirs -= lose
        self.tmp_directories -= lose
        Output.print_information("Temp dir count is %s %s" % (len(self.tmp_directories),len(FileLister.all_temp_dirs)) )
        # Output.print_information("tmp_tmp name is %s" % (self.tmp_tmp.name) )

    @staticmethod
    def cleanup_all_tmp_files():
        """Clean up all temporary directories in case something went wrong during scan"""
        for tmp_dir in FileLister.all_temp_dirs:
            if exists(tmp_dir):
                try: 
                    shutil.rmtree(tmp_dir)
                except:
                    pass