python source code of utils

"""scrape utility functions.

   Functions include:
   Web requests and requests caching
   Document caching
   Text processing
   HTML parsing
   URL processing
   File processing
   User input and sanitation
   Miscellaneous
"""

from __future__ import print_function
import glob
import hashlib
import os
import random
import re
import shutil
import string
import sys
import time

import lxml.html as lh

try:
    import pdfkit as pk
except ImportError:
    pass
import requests
from requests.exceptions import MissingSchema
from six import PY2
from six.moves import input, xrange as range
from six.moves.urllib.parse import urlparse, urljoin
from six.moves.urllib.request import getproxies
import tldextract

if PY2:
    from cgi import escape
else:
    from html import escape

USER_AGENTS = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) "
    "Gecko/20100101 Firefox/11.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) " "Gecko/20100 101 Firefox/22.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:11.0) " "Gecko/20100101 Firefox/11.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) "
    "AppleWebKit/536.5 (KHTML, like Gecko) "
    "Chrome/19.0.1084.46 Safari/536.5",
    "Mozilla/5.0 (Windows; Windows NT 6.1) "
    "AppleWebKit/536.5 (KHTML, like Gecko) "
    "Chrome/19.0.1084.46 Safari/536.5",
)


XDG_CACHE_DIR = os.environ.get(
    "XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")
)
CACHE_DIR = os.path.join(XDG_CACHE_DIR, "scrape")
CACHE_FILE = os.path.join(CACHE_DIR, "cache{0}".format("" if PY2 else "3"))

# Web requests and requests caching functions
#


def get_proxies():
    """Get available proxies to use with requests library."""
    proxies = getproxies()
    filtered_proxies = {}
    for key, value in proxies.items():
        if key.startswith("http://"):
            if not value.startswith("http://"):
                filtered_proxies[key] = "http://{0}".format(value)
            else:
                filtered_proxies[key] = value
    return filtered_proxies


def get_resp(url):
    """Get webpage response as an lxml.html.HtmlElement object."""
    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            request = requests.get(url, headers=headers, proxies=get_proxies())
        except MissingSchema:
            url = add_protocol(url)
            request = requests.get(url, headers=headers, proxies=get_proxies())
        return lh.fromstring(request.text.encode("utf-8") if PY2 else request.text)
    except Exception:
        sys.stderr.write("Failed to retrieve {0}.\n".format(url))
        raise


def get_raw_resp(url):
    """Get webpage response as a unicode string."""
    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            request = requests.get(url, headers=headers, proxies=get_proxies())
        except MissingSchema:
            url = add_protocol(url)
            request = requests.get(url, headers=headers, proxies=get_proxies())
        return request.text.encode("utf-8") if PY2 else request.text
    except Exception:
        sys.stderr.write("Failed to retrieve {0} as str.\n".format(url))
        raise


def enable_cache():
    """Enable requests library cache."""
    try:
        import requests_cache
    except ImportError as err:
        sys.stderr.write("Failed to enable cache: {0}\n".format(str(err)))
        return
    if not os.path.exists(CACHE_DIR):
        os.makedirs(CACHE_DIR)
    requests_cache.install_cache(CACHE_FILE)


def clear_cache():
    """Clear requests library cache."""
    for cache in glob.glob("{0}*".format(CACHE_FILE)):
        os.remove(cache)


# Document caching functions
#


def hash_text(text):
    """Return MD5 hash of a string."""
    md5 = hashlib.md5()
    md5.update(text)
    return md5.hexdigest()


def cache_page(page_cache, page_hash, cache_size):
    """Add a page to the page cache."""
    page_cache.append(page_hash)
    if len(page_cache) > cache_size:
        page_cache.pop(0)


# Text processing functions
#


def re_filter(text, regexps):
    """Filter text using regular expressions."""
    if not regexps:
        return text

    matched_text = []
    compiled_regexps = [re.compile(x) for x in regexps]
    for line in text:
        if line in matched_text:
            continue

        for regexp in compiled_regexps:
            found = regexp.search(line)
            if found and found.group():
                matched_text.append(line)

    return matched_text or text


def remove_whitespace(text):
    """Remove unnecessary whitespace while keeping logical structure.

    Keyword arguments:
    text -- text to remove whitespace from (list)

    Retain paragraph structure but remove other whitespace,
    such as between words on a line and at the start and end of the text.
    """
    clean_text = []
    curr_line = ""
    # Remove any newlines that follow two lines of whitespace consecutively
    # Also remove whitespace at start and end of text
    while text:
        if not curr_line:
            # Find the first line that is not whitespace and add it
            curr_line = text.pop(0)
            while not curr_line.strip() and text:
                curr_line = text.pop(0)
            if curr_line.strip():
                clean_text.append(curr_line)
        else:
            # Filter the rest of the lines
            curr_line = text.pop(0)
            if not text:
                # Add the final line if it is not whitespace
                if curr_line.strip():
                    clean_text.append(curr_line)
                continue

            if curr_line.strip():
                clean_text.append(curr_line)
            else:
                # If the current line is whitespace then make sure there is
                # no more than one consecutive line of whitespace following
                if not text[0].strip():
                    if len(text) > 1 and text[1].strip():
                        clean_text.append(curr_line)
                else:
                    clean_text.append(curr_line)

    # Now filter each individual line for extraneous whitespace
    cleaner_text = []
    for line in clean_text:
        clean_line = " ".join(line.split())
        if not clean_line.strip():
            clean_line += "\n"
        cleaner_text.append(clean_line)
    return cleaner_text


def parse_text(infile, xpath=None, filter_words=None, attributes=None):
    """Filter text using XPath, regex keywords, and tag attributes.

    Keyword arguments:
    infile -- HTML or text content to parse (list)
    xpath -- an XPath expression (str)
    filter_words -- regex keywords (list)
    attributes -- HTML tag attributes (list)

    Return a list of strings of text.
    """
    infiles = []
    text = []
    if xpath is not None:
        infile = parse_html(infile, xpath)
        if isinstance(infile, list):
            if isinstance(infile[0], lh.HtmlElement):
                infiles = list(infile)
            else:
                text = [line + "\n" for line in infile]
        elif isinstance(infile, lh.HtmlElement):
            infiles = [infile]
        else:
            text = [infile]
    else:
        infiles = [infile]

    if attributes is not None:
        attributes = [clean_attr(x) for x in attributes]
        attributes = [x for x in attributes if x]
    else:
        attributes = ["text()"]

    if not text:
        text_xpath = "//*[not(self::script) and not(self::style)]"
        for attr in attributes:
            for infile in infiles:
                if isinstance(infile, lh.HtmlElement):
                    new_text = infile.xpath("{0}/{1}".format(text_xpath, attr))
                else:
                    # re.split preserves delimiters place in the list
                    new_text = [x for x in re.split("(\n)", infile) if x]
                text += new_text

    if filter_words is not None:
        text = re_filter(text, filter_words)
    return [
        "".join(x for x in line if x in string.printable)
        for line in remove_whitespace(text)
        if line
    ]


def get_parsed_text(args, infilename):
    """Parse and return text content of infiles.

    Keyword arguments:
    args -- program arguments (dict)
    infilenames -- name of user-inputted and/or downloaded file (str)

    Return a list of strings of text.
    """
    parsed_text = []
    if infilename.endswith(".html"):
        # Convert HTML to lxml object for content parsing
        html = lh.fromstring(read_files(infilename))
        text = None
    else:
        html = None
        text = read_files(infilename)

    if html is not None:
        parsed_text = parse_text(
            html, args["xpath"], args["filter"], args["attributes"]
        )
    elif text is not None:
        parsed_text = parse_text(text, args["xpath"], args["filter"])
    else:
        if not args["quiet"]:
            sys.stderr.write("Failed to parse text from {0}.\n".format(infilename))
    return parsed_text


# HTML parsing functions
#


def clean_attr(attr):
    """Append @ to attributes and resolve text -> text() for XPath."""
    if attr:
        if "text" in attr:
            return "text()"
        else:
            attr = attr.lstrip("@")
    if attr:
        return "@" + attr
    return None


def parse_html(infile, xpath):
    """Filter HTML using XPath."""
    if not isinstance(infile, lh.HtmlElement):
        infile = lh.fromstring(infile)
    infile = infile.xpath(xpath)
    if not infile:
        raise ValueError("XPath {0} returned no results.".format(xpath))
    return infile


# URL processing functions
#


def get_domain(url):
    """Get the domain of a URL using tldextract."""
    return tldextract.extract(url).domain


def add_protocol(url):
    """Add protocol to URL."""
    if not check_protocol(url):
        return "http://{0}".format(url)
    return url


def check_protocol(url):
    """Check URL for a protocol."""
    if url and (url.startswith("http://") or url.startswith("https://")):
        return True
    return False


def remove_protocol(url):
    """Remove protocol from URL."""
    if check_protocol(url):
        return url.replace("http://", "").replace("https://", "")
    return url


def clean_url(url, base_url=None):
    """Add base netloc and path to internal URLs and remove www, fragments."""
    parsed_url = urlparse(url)

    fragment = "{url.fragment}".format(url=parsed_url)
    if fragment:
        url = url.split(fragment)[0]

    # Identify internal URLs and fix their format
    netloc = "{url.netloc}".format(url=parsed_url)
    if base_url is not None and not netloc:
        parsed_base = urlparse(base_url)
        split_base = "{url.scheme}://{url.netloc}{url.path}/".format(url=parsed_base)
        url = urljoin(split_base, url)
        netloc = "{url.netloc}".format(url=urlparse(url))

    if "www." in netloc:
        url = url.replace(netloc, netloc.replace("www.", ""))
    return url.rstrip(string.punctuation)


def has_suffix(url):
    """Return whether the url has a suffix using tldextract."""
    return bool(tldextract.extract(url).suffix)


def add_url_suffix(url):
    """Add .com suffix to URL if none found."""
    url = url.rstrip("/")
    if not has_suffix(url):
        return "{0}.com".format(url)
    return url


# File processing functions
#


def get_outfilename(url, domain=None):
    """Construct the output filename from domain and end of path."""
    if domain is None:
        domain = get_domain(url)

    path = "{url.path}".format(url=urlparse(url))
    if "." in path:
        tail_url = path.split(".")[-2]
    else:
        tail_url = path

    if tail_url:
        if "/" in tail_url:
            tail_pieces = [x for x in tail_url.split("/") if x]
            tail_url = tail_pieces[-1]

        # Keep length of return string below or equal to max_len
        max_len = 24
        if domain:
            max_len -= len(domain) + 1
        if len(tail_url) > max_len:
            if "-" in tail_url:
                tail_pieces = [x for x in tail_url.split("-") if x]
                tail_url = tail_pieces.pop(0)
                if len(tail_url) > max_len:
                    tail_url = tail_url[:max_len]
                else:
                    # Add as many tail pieces that can fit
                    tail_len = 0
                    for piece in tail_pieces:
                        tail_len += len(piece)
                        if tail_len <= max_len:
                            tail_url += "-" + piece
                        else:
                            break
            else:
                tail_url = tail_url[:max_len]

        if domain:
            return "{0}-{1}".format(domain, tail_url).lower()
        return tail_url
    return domain.lower()


def get_single_outfilename(args):
    """Use first possible entry in query as filename."""
    for arg in args["query"]:
        if arg in args["files"]:
            return (".".join(arg.split(".")[:-1])).lower()
        for url in args["urls"]:
            if arg.strip("/") in url:
                domain = get_domain(url)
                return get_outfilename(url, domain)
    sys.stderr.write("Failed to construct a single out filename.\n")
    return ""


def remove_file(filename):
    """Remove a file from disk."""
    try:
        os.remove(filename)
        return True
    except (OSError, IOError):
        return False


def modify_filename_id(filename):
    """Modify filename to have a unique numerical identifier."""
    split_filename = os.path.splitext(filename)
    id_num_re = re.compile("(\(\d\))")
    id_num = re.findall(id_num_re, split_filename[-2])
    if id_num:
        new_id_num = int(id_num[-1].lstrip("(").rstrip(")")) + 1

        # Reconstruct filename with incremented id and its extension
        filename = "".join(
            (
                re.sub(id_num_re, "({0})".format(new_id_num), split_filename[-2]),
                split_filename[-1],
            )
        )
    else:
        split_filename = os.path.splitext(filename)

        # Reconstruct filename with new id and its extension
        filename = "".join(("{0} (2)".format(split_filename[-2]), split_filename[-1]))
    return filename


def overwrite_file_check(args, filename):
    """If filename exists, overwrite or modify it to be unique."""
    if not args["overwrite"] and os.path.exists(filename):
        # Confirm overwriting of the file, or modify filename
        if args["no_overwrite"]:
            overwrite = False
        else:
            try:
                overwrite = confirm_input(
                    input("Overwrite {0}? (yes/no): ".format(filename))
                )
            except (KeyboardInterrupt, EOFError):
                sys.exit()
        if not overwrite:
            new_filename = modify_filename_id(filename)
            while os.path.exists(new_filename):
                new_filename = modify_filename_id(new_filename)
            return new_filename
    return filename


def print_text(args, infilenames, outfilename=None):
    """Print text content of infiles to stdout.

    Keyword arguments:
    args -- program arguments (dict)
    infilenames -- names of user-inputted and/or downloaded files (list)
    outfilename -- only used for interface purposes (None)
    """
    for infilename in infilenames:
        parsed_text = get_parsed_text(args, infilename)
        if parsed_text:
            for line in parsed_text:
                print(line)
            print("")


def write_pdf_files(args, infilenames, outfilename):
    """Write pdf file(s) to disk using pdfkit.

    Keyword arguments:
    args -- program arguments (dict)
    infilenames -- names of user-inputted and/or downloaded files (list)
    outfilename -- name of output pdf file (str)
    """
    if not outfilename.endswith(".pdf"):
        outfilename = outfilename + ".pdf"
    outfilename = overwrite_file_check(args, outfilename)

    options = {}
    try:
        if args["multiple"]:
            # Multiple files are written one at a time, so infilenames will
            # never contain more than one file here
            infilename = infilenames[0]
            if not args["quiet"]:
                print("Attempting to write to {0}.".format(outfilename))
            else:
                options["quiet"] = None

            if args["xpath"]:
                # Process HTML with XPath before writing
                html = parse_html(read_files(infilename), args["xpath"])
                if isinstance(html, list):
                    if isinstance(html[0], str):
                        pk.from_string("\n".join(html), outfilename, options=options)
                    else:
                        pk.from_string(
                            "\n".join(lh.tostring(x) for x in html),
                            outfilename,
                            options=options,
                        )
                elif isinstance(html, str):
                    pk.from_string(html, outfilename, options=options)
                else:
                    pk.from_string(lh.tostring(html), outfilename, options=options)
            else:
                pk.from_file(infilename, outfilename, options=options)
        elif args["single"]:
            if not args["quiet"]:
                print(
                    "Attempting to write {0} page(s) to {1}.".format(
                        len(infilenames), outfilename
                    )
                )
            else:
                options["quiet"] = None

            if args["xpath"]:
                # Process HTML with XPath before writing
                html = parse_html(read_files(infilenames), args["xpath"])
                if isinstance(html, list):
                    if isinstance(html[0], str):
                        pk.from_string("\n".join(html), outfilename, options=options)
                    else:
                        pk.from_string(
                            "\n".join(lh.tostring(x) for x in html),
                            outfilename,
                            options=options,
                        )
                elif isinstance(html, str):
                    pk.from_string(html, outfilename, options=options)
                else:
                    pk.from_string(lh.tostring(html), outfilename, options=options)
            else:
                pk.from_file(infilenames, outfilename, options=options)
        return True
    except (OSError, IOError) as err:
        sys.stderr.write(
            "An error occurred while writing {0}:\n{1}".format(outfilename, str(err))
        )
        return False


def write_csv_files(args, infilenames, outfilename):
    """Write csv file(s) to disk.

    Keyword arguments:
    args -- program arguments (dict)
    infilenames -- names of user-inputted and/or downloaded files (list)
    outfilename -- name of output text file (str)
    """

    def csv_convert(line):
        """Strip punctuation and insert commas"""
        clean_line = []
        for word in line.split(" "):
            clean_line.append(word.strip(string.punctuation))
        return ", ".join(clean_line)

    if not outfilename.endswith(".csv"):
        outfilename = outfilename + ".csv"
    outfilename = overwrite_file_check(args, outfilename)

    all_text = []  # Text must be aggregated if writing to a single output file
    for i, infilename in enumerate(infilenames):
        parsed_text = get_parsed_text(args, infilename)
        if parsed_text:
            if args["multiple"]:
                if not args["quiet"]:
                    print("Attempting to write to {0}.".format(outfilename))

                csv_text = [csv_convert(x) for x in parsed_text]
                print(csv_text)
                write_file(csv_text, outfilename)
            elif args["single"]:
                all_text += parsed_text
                # Newline added between multiple files being aggregated
                if len(infilenames) > 1 and i < len(infilenames) - 1:
                    all_text.append("\n")

    # Write all text to a single output file
    if args["single"] and all_text:
        if not args["quiet"]:
            print(
                "Attempting to write {0} page(s) to {1}.".format(
                    len(infilenames), outfilename
                )
            )

        csv_text = [csv_convert(x) for x in all_text]
        print(csv_text)
        write_file(csv_text, outfilename)


def write_text_files(args, infilenames, outfilename):
    """Write text file(s) to disk.

    Keyword arguments:
    args -- program arguments (dict)
    infilenames -- names of user-inputted and/or downloaded files (list)
    outfilename -- name of output text file (str)
    """
    if not outfilename.endswith(".txt"):
        outfilename = outfilename + ".txt"
    outfilename = overwrite_file_check(args, outfilename)

    all_text = []  # Text must be aggregated if writing to a single output file
    for i, infilename in enumerate(infilenames):
        parsed_text = get_parsed_text(args, infilename)
        if parsed_text:
            if args["multiple"]:
                if not args["quiet"]:
                    print("Attempting to write to {0}.".format(outfilename))
                write_file(parsed_text, outfilename)
            elif args["single"]:
                all_text += parsed_text
                # Newline added between multiple files being aggregated
                if len(infilenames) > 1 and i < len(infilenames) - 1:
                    all_text.append("\n")

    # Write all text to a single output file
    if args["single"] and all_text:
        if not args["quiet"]:
            print(
                "Attempting to write {0} page(s) to {1}.".format(
                    len(infilenames), outfilename
                )
            )
        write_file(all_text, outfilename)


def write_file(data, outfilename):
    """Write a single file to disk."""
    if not data:
        return False
    try:
        with open(outfilename, "w") as outfile:
            for line in data:
                if line:
                    outfile.write(line)
        return True
    except (OSError, IOError) as err:
        sys.stderr.write(
            "An error occurred while writing {0}:\n{1}".format(outfilename, str(err))
        )
        return False


def get_num_part_files():
    """Get the number of PART.html files currently saved to disk."""
    num_parts = 0
    for filename in os.listdir(os.getcwd()):
        if filename.startswith("PART") and filename.endswith(".html"):
            num_parts += 1
    return num_parts


def write_part_images(url, raw_html, html, filename):
    """Write image file(s) associated with HTML to disk, substituting filenames.

    Keywords arguments:
    url -- the URL from which the HTML has been extracted from (str)
    raw_html -- unparsed HTML file content (list)
    html -- parsed HTML file content (lxml.html.HtmlElement) (default: None)
    filename -- the PART.html filename (str)

    Return raw HTML with image names replaced with local image filenames.
    """
    save_dirname = "{0}_files".format(os.path.splitext(filename)[0])
    if not os.path.exists(save_dirname):
        os.makedirs(save_dirname)
    images = html.xpath("//img/@src")
    internal_image_urls = [x for x in images if x.startswith("/")]

    headers = {"User-Agent": random.choice(USER_AGENTS)}
    for img_url in images:
        img_name = img_url.split("/")[-1]
        if "?" in img_name:
            img_name = img_name.split("?")[0]
        if not os.path.splitext(img_name)[1]:
            img_name = "{0}.jpeg".format(img_name)

        try:
            full_img_name = os.path.join(save_dirname, img_name)
            with open(full_img_name, "wb") as img:
                if img_url in internal_image_urls:
                    # Internal images need base url added
                    full_img_url = "{0}{1}".format(url.rstrip("/"), img_url)
                else:
                    # External image
                    full_img_url = img_url
                img_content = requests.get(
                    full_img_url, headers=headers, proxies=get_proxies()
                ).content
                img.write(img_content)
                raw_html = raw_html.replace(escape(img_url), full_img_name)
        except (OSError, IOError):
            pass
        time.sleep(random.uniform(0, 0.5))  # Slight delay between downloads
    return raw_html


def write_part_file(args, url, raw_html, html=None, part_num=None):
    """Write PART.html file(s) to disk, images in PART_files directory.

    Keyword arguments:
    args -- program arguments (dict)
    raw_html -- unparsed HTML file content (list)
    html -- parsed HTML file content (lxml.html.HtmlElement) (default: None)
    part_num -- PART(#).html file number (int) (default: None)
    """
    if part_num is None:
        part_num = get_num_part_files() + 1
    filename = "PART{0}.html".format(part_num)

    # Decode bytes to string in Python 3 versions
    if not PY2 and isinstance(raw_html, bytes):
        raw_html = raw_html.encode("ascii", "ignore")

    # Convert html to an lh.HtmlElement object for parsing/saving images
    if html is None:
        html = lh.fromstring(raw_html)

    # Parse HTML if XPath entered
    if args["xpath"]:
        raw_html = parse_html(html, args["xpath"])
        if isinstance(raw_html, list):
            if not isinstance(raw_html[0], lh.HtmlElement):
                raise ValueError("XPath should return an HtmlElement object.")
        else:
            if not isinstance(raw_html, lh.HtmlElement):
                raise ValueError("XPath should return an HtmlElement object.")

    # Write HTML and possibly images to disk
    if raw_html:
        if not args["no_images"] and (args["pdf"] or args["html"]):
            raw_html = write_part_images(url, raw_html, html, filename)
        with open(filename, "w") as part:
            if not isinstance(raw_html, list):
                raw_html = [raw_html]
                if isinstance(raw_html[0], lh.HtmlElement):
                    for elem in raw_html:
                        part.write(lh.tostring(elem))
                else:
                    for line in raw_html:
                        part.write(line)


def get_part_filenames(num_parts=None, start_num=0):
    """Get numbered PART.html filenames."""
    if num_parts is None:
        num_parts = get_num_part_files()
    return ["PART{0}.html".format(i) for i in range(start_num + 1, num_parts + 1)]


def read_files(filenames):
    """Read a file into memory."""
    if isinstance(filenames, list):
        for filename in filenames:
            with open(filename, "r") as infile:
                return infile.read()
    else:
        with open(filenames, "r") as infile:
            return infile.read()


def remove_part_images(filename):
    """Remove PART(#)_files directory containing images from disk."""
    dirname = "{0}_files".format(os.path.splitext(filename)[0])
    if os.path.exists(dirname):
        shutil.rmtree(dirname)


def remove_part_files(num_parts=None):
    """Remove PART(#).html files and image directories from disk."""
    filenames = get_part_filenames(num_parts)
    for filename in filenames:
        remove_part_images(filename)
        remove_file(filename)


# User input and sanitation functions
#


def confirm_input(user_input):
    """Check user input for yes, no, or an exit signal."""
    if isinstance(user_input, list):
        user_input = "".join(user_input)

    try:
        u_inp = user_input.lower().strip()
    except AttributeError:
        u_inp = user_input

    # Check for exit signal
    if u_inp in ("q", "quit", "exit"):
        sys.exit()
    if u_inp in ("y", "yes"):
        return True
    return False


# Miscellaneous functions
#


def mkdir_and_cd(dirname):
    """Change directory and/or create it if necessary."""
    if not os.path.exists(dirname):
        os.makedirs(dirname)
        os.chdir(dirname)
    else:
        os.chdir(dirname)