"""scrape utility functions.

   Functions include:
   Web requests and requests caching
   Document caching
   Text processing
   HTML parsing
   URL processing
   File processing
   User input and sanitation
   Miscellaneous
"""

from __future__ import print_function
import glob
import hashlib
import os
import random
import re
import shutil
import string
import sys
import time

import lxml.html as lh

try:
    import pdfkit as pk
except ImportError:
    # pdfkit is optional; it is only needed when writing PDF output.
    pass
import requests
from requests.exceptions import MissingSchema
from six import PY2
from six.moves import input, xrange as range
from six.moves.urllib.parse import urlparse, urljoin
from six.moves.urllib.request import getproxies
import tldextract

if PY2:
    from cgi import escape
else:
    from html import escape

# Browser User-Agent strings; request helpers pick one at random.
USER_AGENTS = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) "
    "Gecko/20100101 Firefox/11.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) "
    "Gecko/20100101 Firefox/22.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:11.0) " "Gecko/20100101 Firefox/11.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) "
    "AppleWebKit/536.5 (KHTML, like Gecko) "
    "Chrome/19.0.1084.46 Safari/536.5",
    "Mozilla/5.0 (Windows; Windows NT 6.1) "
    "AppleWebKit/536.5 (KHTML, like Gecko) "
    "Chrome/19.0.1084.46 Safari/536.5",
)

# Cache lives under $XDG_CACHE_HOME when set, otherwise under ~/.cache.
XDG_CACHE_DIR = os.environ.get(
    "XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")
)
CACHE_DIR = os.path.join(XDG_CACHE_DIR, "scrape")
# Python 2 and Python 3 use differently named cache files.
CACHE_FILE = os.path.join(CACHE_DIR, "cache{0}".format("" if PY2 else "3"))

# Web requests and requests caching functions

def get_proxies():
    """Get available proxies to use with requests library.

    Only entries whose key starts with "http" are kept. A proxy address
    that carries no scheme is prefixed with "http://".

    Return a dict mapping proxy keys (e.g. "http", "https") to proxy URLs.
    """
    filtered_proxies = {}
    for key, value in getproxies().items():
        # getproxies() keys are bare scheme names such as "http" or "https",
        # so test for the scheme name rather than for "http://".
        if not key.startswith("http"):
            continue
        if "://" not in value:
            value = "http://{0}".format(value)
        filtered_proxies[key] = value
    return filtered_proxies

def get_resp(url):
    """Get webpage response as an lxml.html.HtmlElement object.

    Keyword arguments:
    url -- the URL to request (str)

    If requests reports a missing schema, the request is retried once with
    a protocol added by add_protocol. Any failure is reported on stderr and
    then re-raised, so the caller never receives a silent None.
    """
    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            request = requests.get(url, headers=headers, proxies=get_proxies())
        except MissingSchema:
            url = add_protocol(url)
            request = requests.get(url, headers=headers, proxies=get_proxies())
        return lh.fromstring(request.text.encode("utf-8") if PY2 else request.text)
    except Exception:
        sys.stderr.write("Failed to retrieve {0}.\n".format(url))
        raise

def get_raw_resp(url):
    """Get webpage response as a unicode string.

    Keyword arguments:
    url -- the URL to request (str)

    If requests reports a missing schema, the request is retried once with
    a protocol added by add_protocol. Any failure is reported on stderr and
    then re-raised.
    """
    try:
        headers = {"User-Agent": random.choice(USER_AGENTS)}
        try:
            request = requests.get(url, headers=headers, proxies=get_proxies())
        except MissingSchema:
            url = add_protocol(url)
            request = requests.get(url, headers=headers, proxies=get_proxies())
        return request.text.encode("utf-8") if PY2 else request.text
    except Exception:
        sys.stderr.write("Failed to retrieve {0} as str.\n".format(url))
        raise

def enable_cache():
    """Enable requests library cache.

    When requests_cache is not installed, the failure is reported on stderr
    and caching stays disabled. Otherwise CACHE_DIR is created if needed and
    the cache is installed at CACHE_FILE.
    """
    try:
        import requests_cache
    except ImportError as err:
        sys.stderr.write("Failed to enable cache: {0}\n".format(str(err)))
        return
    if not os.path.exists(CACHE_DIR):
        os.makedirs(CACHE_DIR)
    requests_cache.install_cache(CACHE_FILE)

def clear_cache():
    """Clear requests library cache.

    Removes every file whose path starts with CACHE_FILE.
    """
    for cache in glob.glob("{0}*".format(CACHE_FILE)):
        os.remove(cache)

# Document caching functions

def hash_text(text):
    """Return MD5 hash of a string.

    Keyword arguments:
    text -- the text to hash (bytes, or str which is encoded as UTF-8)

    Return the hexadecimal digest (str).
    """
    if not isinstance(text, bytes):
        # hashlib only accepts bytes, so encode unicode text first
        text = text.encode("utf-8")
    md5 = hashlib.md5()
    md5.update(text)
    return md5.hexdigest()

def cache_page(page_cache, page_hash, cache_size):
    """Add a page to the page cache.

    Keyword arguments:
    page_cache -- hashes of cached pages, oldest first (list, modified in place)
    page_hash -- hash of the page to add (str)
    cache_size -- maximum number of hashes to keep (int)

    The oldest hash is evicted once the cache grows beyond cache_size.
    """
    page_cache.append(page_hash)
    if len(page_cache) > cache_size:
        page_cache.pop(0)

# Text processing functions

def re_filter(text, regexps):
    """Filter text using regular expressions.

    Keyword arguments:
    text -- lines of text to filter (list)
    regexps -- regular expression patterns (list)

    Return the distinct lines with a non-empty match for at least one
    pattern, in their original order. The unfiltered text is returned when
    no patterns are given or when nothing matches.
    """
    if not regexps:
        return text

    compiled_regexps = [re.compile(x) for x in regexps]
    matched_text = []
    seen = set()  # O(1) duplicate check instead of scanning matched_text
    for line in text:
        if line in seen:
            continue

        for regexp in compiled_regexps:
            found = regexp.search(line)
            if found and found.group():
                matched_text.append(line)
                seen.add(line)
                # One match is enough; do not add the line once per regexp
                break

    return matched_text or text

def remove_whitespace(text):
    """Remove unnecessary whitespace while keeping logical structure.

    Keyword arguments:
    text -- text to remove whitespace from (list)

    Retain paragraph structure but remove other whitespace,
    such as between words on a line and at the start and end of the text.

    Note that the input list is consumed (emptied) while it is processed.
    Return a new list of cleaned lines; whitespace-only lines become "\\n".
    """
    clean_text = []
    curr_line = ""
    # Remove any newlines that follow two lines of whitespace consecutively
    # Also remove whitespace at start and end of text
    while text:
        if not curr_line:
            # Find the first line that is not whitespace and add it
            curr_line = text.pop(0)
            while not curr_line.strip() and text:
                curr_line = text.pop(0)
            if curr_line.strip():
                clean_text.append(curr_line)
        else:
            # Filter the rest of the lines
            curr_line = text.pop(0)
            if not text:
                # Add the final line if it is not whitespace
                if curr_line.strip():
                    clean_text.append(curr_line)
                continue

            if curr_line.strip():
                clean_text.append(curr_line)
            else:
                # If the current line is whitespace then make sure there is
                # no more than one consecutive line of whitespace following
                if not text[0].strip():
                    if len(text) > 1 and text[1].strip():
                        clean_text.append(curr_line)
                else:
                    clean_text.append(curr_line)

    # Now filter each individual line for extraneous whitespace
    cleaner_text = []
    for line in clean_text:
        clean_line = " ".join(line.split())
        if not clean_line.strip():
            clean_line += "\n"
        cleaner_text.append(clean_line)
    return cleaner_text

def parse_text(infile, xpath=None, filter_words=None, attributes=None):
    """Filter text using XPath, regex keywords, and tag attributes.

    Keyword arguments:
    infile -- HTML or text content to parse (list)
    xpath -- an XPath expression (str)
    filter_words -- regex keywords (list)
    attributes -- HTML tag attributes (list)

    Return a list of strings of text.
    """
    infiles = []
    text = []
    if xpath is not None:
        infile = parse_html(infile, xpath)
        if isinstance(infile, list):
            if isinstance(infile[0], lh.HtmlElement):
                infiles = list(infile)
            else:
                # XPath selected strings directly; use them as the text
                text = [line + "\n" for line in infile]
        elif isinstance(infile, lh.HtmlElement):
            infiles = [infile]
        else:
            text = [infile]
    else:
        infiles = [infile]

    if attributes is not None:
        attributes = [clean_attr(x) for x in attributes]
        attributes = [x for x in attributes if x]
    else:
        attributes = ["text()"]

    if not text:
        text_xpath = "//*[not(self::script) and not(self::style)]"
        for attr in attributes:
            for infile in infiles:
                if isinstance(infile, lh.HtmlElement):
                    new_text = infile.xpath("{0}/{1}".format(text_xpath, attr))
                else:
                    # re.split preserves delimiters place in the list
                    new_text = [x for x in re.split("(\n)", infile) if x]
                text += new_text

    if filter_words is not None:
        text = re_filter(text, filter_words)

    # Set lookup avoids rescanning string.printable for every character
    printable = set(string.printable)
    return [
        "".join(x for x in line if x in printable)
        for line in remove_whitespace(text)
        if line
    ]

def get_parsed_text(args, infilename):
    """Parse and return text content of an infile.

    Keyword arguments:
    args -- program arguments (dict)
    infilename -- name of user-inputted and/or downloaded file (str)

    Return a list of strings of text.
    """
    parsed_text = []
    if infilename.endswith(".html"):
        # Convert HTML to lxml object for content parsing
        html = lh.fromstring(read_files(infilename))
        text = None
    else:
        html = None
        text = read_files(infilename)

    if html is not None:
        parsed_text = parse_text(
            html, args["xpath"], args["filter"], args["attributes"]
        )
    elif text is not None:
        # Tag attributes only apply to HTML, so they are not passed here
        parsed_text = parse_text(text, args["xpath"], args["filter"])
    else:
        if not args["quiet"]:
            sys.stderr.write("Failed to parse text from {0}.\n".format(infilename))
    return parsed_text

# HTML parsing functions

def clean_attr(attr):
    """Append @ to attributes and resolve text -> text() for XPath.

    Keyword arguments:
    attr -- an attribute name, with or without a leading "@" (str)

    Return "text()" for any attribute containing "text", "@name" for other
    attributes, or None when nothing usable remains.
    """
    if not attr:
        return None
    if "text" in attr:
        return "text()"
    # Drop any "@" the user already typed so it is not doubled below
    attr = attr.lstrip("@")
    if attr:
        return "@" + attr
    return None

def parse_html(infile, xpath):
    """Apply an XPath expression to HTML and return what it selects.

    Keyword arguments:
    infile -- HTML content, parsed (lxml.html.HtmlElement) or raw
    xpath -- an XPath expression (str)

    Raise ValueError when the expression yields an empty/falsy result.
    """
    tree = infile if isinstance(infile, lh.HtmlElement) else lh.fromstring(infile)
    selected = tree.xpath(xpath)
    if selected:
        return selected
    raise ValueError("XPath {0} returned no results.".format(xpath))

# URL processing functions

def get_domain(url):
    """Return the domain component that tldextract finds in a URL."""
    extracted = tldextract.extract(url)
    return extracted.domain

def add_protocol(url):
    """Prefix the URL with http:// unless it already has a protocol."""
    return url if check_protocol(url) else "http://{0}".format(url)

def check_protocol(url):
    """Return True when the URL starts with http:// or https://."""
    if not url:
        return False
    return url.startswith(("http://", "https://"))

def remove_protocol(url):
    """Remove protocol from URL.

    Only a leading http:// or https:// is removed; the same text appearing
    later in the URL (e.g. inside a query string) is left untouched.
    """
    if url:
        for prefix in ("http://", "https://"):
            if url.startswith(prefix):
                return url[len(prefix):]
    return url

def clean_url(url, base_url=None):
    """Add base netloc and path to internal URLs and remove www, fragments.

    Keyword arguments:
    url -- the URL to clean (str)
    base_url -- URL used to resolve internal (netloc-less) URLs (str)

    Return the cleaned URL with trailing punctuation stripped.
    """
    # Cut at the "#" itself: splitting on the fragment text would truncate
    # URLs whose path contains the same text (e.g. "/top#top").
    url = url.split("#", 1)[0]

    # Identify internal URLs and fix their format
    netloc = urlparse(url).netloc
    if base_url is not None and not netloc:
        parsed_base = urlparse(base_url)
        split_base = "{url.scheme}://{url.netloc}{url.path}/".format(url=parsed_base)
        url = urljoin(split_base, url)
        netloc = urlparse(url).netloc

    # Only a leading "www." is a prefix to drop; replace the netloc once
    if netloc.startswith("www."):
        url = url.replace(netloc, netloc[len("www."):], 1)
    return url.rstrip(string.punctuation)

def has_suffix(url):
    """Tell whether tldextract finds a suffix in the URL."""
    suffix = tldextract.extract(url).suffix
    return True if suffix else False

def add_url_suffix(url):
    """Strip trailing slashes and append .com when the URL has no suffix."""
    trimmed = url.rstrip("/")
    if has_suffix(trimmed):
        return trimmed
    return "{0}.com".format(trimmed)

# File processing functions

def get_outfilename(url, domain=None):
    """Construct the output filename from domain and end of path.

    Keyword arguments:
    url -- the URL to build a filename for (str)
    domain -- domain of the URL; looked up with get_domain when None (str)

    The last non-empty path segment, without its extension, is appended to
    the domain. The tail is shortened on "-" boundaries where possible so
    that "domain-tail" stays within 24 characters. Return only the domain
    when the path yields no usable tail.
    """
    if domain is None:
        domain = get_domain(url)

    # Last non-empty path segment; a bare "/" path yields no segments
    segments = [x for x in urlparse(url).path.split("/") if x]
    tail_url = os.path.splitext(segments[-1])[0] if segments else ""

    if tail_url:
        # Keep length of return string below or equal to max_len
        max_len = 24
        if domain:
            max_len -= len(domain) + 1
        max_len = max(max_len, 0)  # a long domain must not go negative
        if len(tail_url) > max_len:
            tail_pieces = [x for x in tail_url.split("-") if x]
            tail_url = tail_pieces[0][:max_len] if tail_pieces else ""
            # Add as many tail pieces that can fit, counting the hyphens
            for piece in tail_pieces[1:]:
                if len(tail_url) + 1 + len(piece) > max_len:
                    break
                tail_url += "-" + piece

    if tail_url:
        if domain:
            return "{0}-{1}".format(domain, tail_url).lower()
        return tail_url
    return domain.lower()

def get_single_outfilename(args):
    """Use first possible entry in query as filename.

    Keyword arguments:
    args -- program arguments (dict); reads "query", "files" and "urls"

    A query entry that is a file yields its lowercased name without the
    extension; one that matches a URL yields get_outfilename for that URL.
    Return "" (after a message on stderr) when no entry can be used.
    """
    for arg in args["query"]:
        if arg in args["files"]:
            # splitext keeps names without an extension intact
            return os.path.splitext(arg)[0].lower()
        stripped_arg = arg.strip("/")
        for url in args["urls"]:
            if stripped_arg in url:
                domain = get_domain(url)
                return get_outfilename(url, domain)
    sys.stderr.write("Failed to construct a single out filename.\n")
    return ""

def remove_file(filename):
    """Remove a file from disk.

    Return True when the file was removed, False when removal failed.
    """
    try:
        os.remove(filename)
        return True
    except (OSError, IOError):
        return False

def modify_filename_id(filename):
    """Modify filename to have a unique numerical identifier.

    Keyword arguments:
    filename -- the filename to modify (str)

    If the name (without extension) contains a parenthesised number such as
    "(2)", the last one is incremented; otherwise " (2)" is appended.
    Return the new filename with its original extension.
    """
    stem, extension = os.path.splitext(filename)
    # \d+ so that ids of 10 and above keep incrementing
    id_matches = list(re.finditer(r"\((\d+)\)", stem))
    if id_matches:
        last_id = id_matches[-1]
        new_id_num = int(last_id.group(1)) + 1

        # Reconstruct filename with incremented id and its extension;
        # only the last id is touched, earlier ones are left alone
        stem = "{0}({1}){2}".format(
            stem[: last_id.start()], new_id_num, stem[last_id.end() :]
        )
    else:
        # Reconstruct filename with new id and its extension
        stem = "{0} (2)".format(stem)
    return stem + extension

def overwrite_file_check(args, filename):
    """If filename exists, overwrite or modify it to be unique.

    Keyword arguments:
    args -- program arguments (dict); reads "overwrite" and "no_overwrite"
    filename -- the intended output filename (str)

    Return filename itself when it may be written, otherwise a variant with
    a numerical id that does not exist on disk yet. The user is prompted
    unless one of the two flags already decides; interrupting the prompt
    exits the program.
    """
    if not args["overwrite"] and os.path.exists(filename):
        # Confirm overwriting of the file, or modify filename
        if args["no_overwrite"]:
            overwrite = False
        else:
            try:
                overwrite = confirm_input(
                    input("Overwrite {0}? (yes/no): ".format(filename))
                )
            except (KeyboardInterrupt, EOFError):
                sys.exit()
        if not overwrite:
            new_filename = modify_filename_id(filename)
            while os.path.exists(new_filename):
                new_filename = modify_filename_id(new_filename)
            return new_filename
    return filename

def print_text(args, infilenames, outfilename=None):
    """Print text content of infiles to stdout.

    Keyword arguments:
    args -- program arguments (dict)
    infilenames -- names of user-inputted and/or downloaded files (list)
    outfilename -- only used for interface purposes (None)
    """
    for infilename in infilenames:
        parsed_text = get_parsed_text(args, infilename)
        if parsed_text:
            for line in parsed_text:
                print(line)

def _write_pdf(source, outfilename, xpath, options):
    """Render source file(s) to a PDF, filtering the HTML by XPath if given."""
    if not xpath:
        pk.from_file(source, outfilename, options=options)
        return

    # Process HTML with XPath before writing
    html = parse_html(read_files(source), xpath)
    if not isinstance(html, list):
        html = [html]
    # Serialise elements as text: lh.tostring returns bytes by default,
    # which cannot be joined with str on Python 3.
    parts = [
        x if isinstance(x, str) else lh.tostring(x, encoding="unicode")
        for x in html
    ]
    pk.from_string("\n".join(parts), outfilename, options=options)


def write_pdf_files(args, infilenames, outfilename):
    """Write pdf file(s) to disk using pdfkit.

    Keyword arguments:
    args -- program arguments (dict)
    infilenames -- names of user-inputted and/or downloaded files (list)
    outfilename -- name of output pdf file (str)

    Return True on success, False when an OS/IO error occurred (the error
    is reported on stderr).
    """
    if not outfilename.endswith(".pdf"):
        outfilename = outfilename + ".pdf"
    outfilename = overwrite_file_check(args, outfilename)

    options = {}
    try:
        if args["multiple"]:
            # Multiple files are written one at a time, so infilenames will
            # never contain more than one file here
            infilename = infilenames[0]
            if not args["quiet"]:
                print("Attempting to write to {0}.".format(outfilename))
            else:
                options["quiet"] = None
            _write_pdf(infilename, outfilename, args["xpath"], options)
        elif args["single"]:
            if not args["quiet"]:
                print(
                    "Attempting to write {0} page(s) to {1}.".format(
                        len(infilenames), outfilename
                    )
                )
            else:
                options["quiet"] = None
            _write_pdf(infilenames, outfilename, args["xpath"], options)
        return True
    except (OSError, IOError) as err:
        sys.stderr.write(
            "An error occurred while writing {0}:\n{1}".format(outfilename, str(err))
        )
        return False

def write_csv_files(args, infilenames, outfilename):
    """Write csv file(s) to disk.

    Keyword arguments:
    args -- program arguments (dict)
    infilenames -- names of user-inputted and/or downloaded files (list)
    outfilename -- name of output text file (str)
    """

    def csv_convert(line):
        """Strip punctuation and insert commas"""
        clean_line = [word.strip(string.punctuation) for word in line.split(" ")]
        return ", ".join(clean_line)

    if not outfilename.endswith(".csv"):
        outfilename = outfilename + ".csv"
    outfilename = overwrite_file_check(args, outfilename)

    all_text = []  # Text must be aggregated if writing to a single output file
    for i, infilename in enumerate(infilenames):
        parsed_text = get_parsed_text(args, infilename)
        if parsed_text:
            if args["multiple"]:
                if not args["quiet"]:
                    print("Attempting to write to {0}.".format(outfilename))

                csv_text = [csv_convert(x) for x in parsed_text]
                write_file(csv_text, outfilename)
            elif args["single"]:
                all_text += parsed_text
                # Newline added between multiple files being aggregated
                if len(infilenames) > 1 and i < len(infilenames) - 1:
                    all_text.append("\n")

    # Write all text to a single output file
    if args["single"] and all_text:
        if not args["quiet"]:
            print(
                "Attempting to write {0} page(s) to {1}.".format(
                    len(infilenames), outfilename
                )
            )

        csv_text = [csv_convert(x) for x in all_text]
        write_file(csv_text, outfilename)

def write_text_files(args, infilenames, outfilename):
    """Write text file(s) to disk.

    Keyword arguments:
    args -- program arguments (dict)
    infilenames -- names of user-inputted and/or downloaded files (list)
    outfilename -- name of output text file (str)
    """
    if not outfilename.endswith(".txt"):
        outfilename = outfilename + ".txt"
    outfilename = overwrite_file_check(args, outfilename)

    all_text = []  # Text must be aggregated if writing to a single output file
    for i, infilename in enumerate(infilenames):
        parsed_text = get_parsed_text(args, infilename)
        if parsed_text:
            if args["multiple"]:
                if not args["quiet"]:
                    print("Attempting to write to {0}.".format(outfilename))
                write_file(parsed_text, outfilename)
            elif args["single"]:
                all_text += parsed_text
                # Newline added between multiple files being aggregated
                if len(infilenames) > 1 and i < len(infilenames) - 1:
                    all_text.append("\n")

    # Write all text to a single output file
    if args["single"] and all_text:
        if not args["quiet"]:
            print(
                "Attempting to write {0} page(s) to {1}.".format(
                    len(infilenames), outfilename
                )
            )
        write_file(all_text, outfilename)

def write_file(data, outfilename):
    """Write a single file to disk.

    Keyword arguments:
    data -- lines to write (list); empty entries are skipped
    outfilename -- name of the file to write (str)

    Return True when the file was written, False when there was no data or
    an OS/IO error occurred (the error is reported on stderr).
    """
    if not data:
        return False
    try:
        with open(outfilename, "w") as outfile:
            for line in data:
                if line:
                    # NOTE(review): lines are written verbatim; confirm that
                    # callers supply their own line terminators.
                    outfile.write(line)
        return True
    except (OSError, IOError) as err:
        sys.stderr.write(
            "An error occurred while writing {0}:\n{1}".format(outfilename, str(err))
        )
        return False

def get_num_part_files():
    """Count the PART*.html files in the current working directory."""
    entries = os.listdir(os.getcwd())
    return sum(
        1
        for entry in entries
        if entry.startswith("PART") and entry.endswith(".html")
    )

def write_part_images(url, raw_html, html, filename):
    """Write image file(s) associated with HTML to disk, substituting filenames.

    Keywords arguments:
    url -- the URL from which the HTML has been extracted from (str)
    raw_html -- unparsed HTML file content (str)
    html -- parsed HTML file content (lxml.html.HtmlElement)
    filename -- the PART.html filename (str)

    Images are saved in a "<PART name>_files" directory next to the PART
    file. An image that cannot be downloaded or written is skipped.

    Return raw HTML with image names replaced with local image filenames.
    """
    save_dirname = "{0}_files".format(os.path.splitext(filename)[0])
    if not os.path.exists(save_dirname):
        os.makedirs(save_dirname)
    images = html.xpath("//img/@src")

    headers = {"User-Agent": random.choice(USER_AGENTS)}
    proxies = get_proxies()  # same for every image, so look it up once
    for img_url in images:
        img_name = img_url.split("/")[-1]
        if "?" in img_name:
            img_name = img_name.split("?")[0]
        if not os.path.splitext(img_name)[1]:
            img_name = "{0}.jpeg".format(img_name)

        full_img_name = os.path.join(save_dirname, img_name)
        # urljoin resolves root-relative, relative and protocol-relative
        # sources against the page URL and leaves absolute URLs unchanged
        full_img_url = urljoin(url, img_url)
        try:
            # Download before opening the file so that a failed request
            # does not leave an empty image file behind
            img_content = requests.get(
                full_img_url, headers=headers, proxies=proxies
            ).content
            with open(full_img_name, "wb") as img:
                img.write(img_content)
            raw_html = raw_html.replace(escape(img_url), full_img_name)
        except (OSError, IOError):
            # Best effort: keep the original image reference on failure
            pass
        time.sleep(random.uniform(0, 0.5))  # Slight delay between downloads
    return raw_html

def write_part_file(args, url, raw_html, html=None, part_num=None):
    """Write PART.html file(s) to disk, images in PART_files directory.

    Keyword arguments:
    args -- program arguments (dict)
    url -- the URL the HTML was retrieved from (str)
    raw_html -- unparsed HTML file content (str, bytes or list)
    html -- parsed HTML file content (lxml.html.HtmlElement) (default: None)
    part_num -- PART(#).html file number (int) (default: None)

    Raise ValueError when an entered XPath does not select HTML elements.
    """
    if part_num is None:
        part_num = get_num_part_files() + 1
    filename = "PART{0}.html".format(part_num)

    # Decode bytes to string in Python 3 versions
    if not PY2 and isinstance(raw_html, bytes):
        raw_html = raw_html.decode("ascii", "ignore")

    # Convert html to an lh.HtmlElement object for parsing/saving images
    if html is None:
        html = lh.fromstring(raw_html)

    # Parse HTML if XPath entered
    if args["xpath"]:
        selected = parse_html(html, args["xpath"])
        if not isinstance(selected, list):
            selected = [selected]
        if not isinstance(selected[0], lh.HtmlElement):
            raise ValueError("XPath should return an HtmlElement object.")
        # Serialise to text so that image substitution and the text-mode
        # write below both receive a string (lh.tostring yields bytes by
        # default on Python 3).
        raw_html = "".join(
            lh.tostring(elem) if PY2 else lh.tostring(elem, encoding="unicode")
            for elem in selected
        )

    # Write HTML and possibly images to disk
    if raw_html:
        if not args["no_images"] and (args["pdf"] or args["html"]):
            raw_html = write_part_images(url, raw_html, html, filename)
        with open(filename, "w") as part:
            if not isinstance(raw_html, list):
                raw_html = [raw_html]
            for line in raw_html:
                part.write(line)

def get_part_filenames(num_parts=None, start_num=0):
    """Build the list of numbered PART.html filenames.

    Keyword arguments:
    num_parts -- number of the last PART file; counted from disk when None
    start_num -- number after which numbering starts (default: 0)
    """
    last_num = get_num_part_files() if num_parts is None else num_parts
    filenames = []
    for part_num in range(start_num + 1, last_num + 1):
        filenames.append("PART{0}.html".format(part_num))
    return filenames

def read_files(filenames):
    """Read one or more files into memory.

    Keyword arguments:
    filenames -- a single filename (str) or several filenames (list)

    Return the file content as a string; for a list, the contents of all
    files are concatenated in order.
    """
    if not isinstance(filenames, list):
        filenames = [filenames]

    contents = []
    for filename in filenames:
        with open(filename, "r") as infile:
            contents.append(infile.read())
    return "".join(contents)

def remove_part_images(filename):
    """Remove PART(#)_files directory containing images from disk.

    Keyword arguments:
    filename -- the PART.html filename the directory belongs to (str)

    Nothing happens when the directory does not exist.
    """
    dirname = "{0}_files".format(os.path.splitext(filename)[0])
    if os.path.exists(dirname):
        shutil.rmtree(dirname)

def remove_part_files(num_parts=None):
    """Remove PART(#).html files and image directories from disk.

    Keyword arguments:
    num_parts -- number of PART files to remove; counted from disk when None
    """
    filenames = get_part_filenames(num_parts)
    for filename in filenames:
        remove_part_images(filename)
        remove_file(filename)

# User input and sanitation functions

def confirm_input(user_input):
    """Check user input for yes, no, or an exit signal.

    Keyword arguments:
    user_input -- the entered text (str, or list of str which is joined)

    Return True for "y"/"yes" (case-insensitive, surrounding whitespace
    ignored) and False for anything else. "q", "quit" or "exit" terminates
    the program through sys.exit.
    """
    if isinstance(user_input, list):
        user_input = "".join(user_input)

    try:
        u_inp = user_input.lower().strip()
    except AttributeError:
        # Non-string input (e.g. None) is compared as-is and ends up False
        u_inp = user_input

    # Check for exit signal
    if u_inp in ("q", "quit", "exit"):
        sys.exit()
    if u_inp in ("y", "yes"):
        return True
    return False

# Miscellaneous functions

def mkdir_and_cd(dirname):
    """Change directory and/or create it if necessary."""
    if not os.path.exists(dirname):