#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
import re

from HTMLParser import HTMLParseError
from time import time
from urlparse import urlparse

import requests
from bs4 import BeautifulSoup

from app import logger
from http_cache import http_get
from http_cache import is_response_too_large
from oa_local import find_normalized_license
from open_location import OpenLocation
from util import NoDoiException
from util import elapsed
from util import get_link_target
from util import get_tree
from util import is_same_publisher

DEBUG_SCRAPING = os.getenv('DEBUG_SCRAPING', False)


# it matters that this looks only at the headers, because we call it even if the
# content is too large. if we start looking in content, we need to break the pieces apart.
def is_pdf_from_header(response):
    looks_good = False
    for k, v in response.headers.iteritems():
        if v:
            key = k.lower()
            val = v.lower()

            if key == "content-type" and "application/pdf" in val:
                looks_good = True

            if key == 'content-disposition' and "pdf" in val:
                looks_good = True
            try:
                if key == 'content-length' and int(val) < 128:
                    looks_good = False
                    break
            except ValueError:
                logger.error(u'got a nonnumeric content-length header: {}'.format(val))
                looks_good = False
                break

    return looks_good


def is_a_pdf_page(response, page_publisher):
    if is_pdf_from_header(response):
        if DEBUG_SCRAPING:
            logger.info(u"http header says this is a PDF {}".format(response.request.url))
        return True

    # everything below here needs to look at the content,
    # so bail here if the page is too big
    if is_response_too_large(response):
        if DEBUG_SCRAPING:
            logger.info(u"response is too big for more checks in is_a_pdf_page")
        return False

    content = response.content_big()

    # PDF files start with this magic string
    if re.match(u"%PDF", content):
        return True

    if page_publisher:
        says_free_publisher_patterns = [
            ("Wiley-Blackwell", u'<span class="freeAccess" title="You have free access to this content">'),
            ("Wiley-Blackwell", u'<iframe id="pdfDocument"'),
            ("JSTOR", ur'<li class="download-pdf-button">.*Download PDF.*</li>'),
            ("Institute of Electrical and Electronics Engineers (IEEE)",
             ur'<frame src="http://ieeexplore.ieee.org/.*?pdf.*?</frameset>'),
            ("IOP Publishing", ur'Full Refereed Journal Article'),
        ]
        for (publisher, pattern) in says_free_publisher_patterns:
            matches = re.findall(pattern, content, re.IGNORECASE | re.DOTALL)
            if is_same_publisher(page_publisher, publisher) and matches:
                return True

    return False


def is_a_word_doc_from_header(response):
    looks_good = False
    for k, v in response.headers.iteritems():
        if v:
            key = k.lower()
            val = v.lower()

            if key == "content-type" and (
                "application/msword" in val or
                "application/doc" in val or
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document" in val
            ):
                looks_good = True
            try:
                if key == 'content-length' and int(val) < 512:
                    looks_good = False
                    break
            except ValueError:
                logger.error(u'got a nonnumeric content-length header: {}'.format(val))
                looks_good = False
                break

    return looks_good


def is_a_word_doc(response):
    if is_a_word_doc_from_header(response):
        if DEBUG_SCRAPING:
            logger.info(u"http header says this is a word doc {}".format(response.request.url))
        return True

    # everything below here needs to look at the content,
    # so bail here if the page is too big
    if is_response_too_large(response):
        if DEBUG_SCRAPING:
            logger.info(u"response is too big for more checks in is_a_word_doc")
        return False

    content = response.content_big()

    # .docx is a ZIP archive; every ZIP ends with a 22-byte end-of-central-directory
    # record that starts with "PK"
    if content[-22:].startswith('PK'):
        return True

    # legacy .doc is an OLE2 compound document; this is its magic number
    if content.startswith('\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1'):
        return True

    return False
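
# A minimal usage sketch for the sniffing helpers above, assuming `http_get`
# (from http_cache) returns a response object exposing `.headers` and
# `.content_big()` as used throughout this module; the URL is hypothetical:
#
#     response = http_get(u"https://example.org/article/download", stream=True)
#     if is_a_pdf_page(response, page_publisher=None):
#         # safe to record this URL as a direct PDF link
#         pass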
class Webpage(object):
    def __init__(self, **kwargs):
        self.url = None
        self.scraped_pdf_url = None
        self.scraped_open_metadata_url = None
        self.scraped_license = None
        self.error = ""
        self.related_pub_doi = None
        self.related_pub_publisher = None
        self.match_type = None
        self.session_id = None
        self.endpoint_id = None
        self.base_id = None
        self.base_doc = None
        self.resolved_url = None
        self.r = None

        for (k, v) in kwargs.iteritems():
            self.__setattr__(k, v)

        if not self.url:
            self.url = u"http://doi.org/{}".format(self.doi)

    # from https://stackoverflow.com/a/865272/596939
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        pass

    @property
    def doi(self):
        return self.related_pub_doi

    # sometimes overridden, for PublisherWebpage
    @property
    def ask_slowly(self):
        return False

    @property
    def publisher(self):
        return self.related_pub_publisher

    def is_same_publisher(self, publisher):
        return is_same_publisher(self.related_pub_publisher, publisher)

    @property
    def fulltext_url(self):
        if self.scraped_pdf_url:
            return self.scraped_pdf_url
        if self.scraped_open_metadata_url:
            return self.scraped_open_metadata_url
        if self.is_open:
            return self.url
        return None

    @property
    def has_fulltext_url(self):
        if self.scraped_pdf_url or self.scraped_open_metadata_url:
            return True
        return False

    @property
    def is_open(self):
        # just having the license isn't good enough
        if self.scraped_pdf_url or self.scraped_open_metadata_url:
            return True
        return False

    def mint_open_location(self):
        my_location = OpenLocation()
        my_location.pdf_url = self.scraped_pdf_url
        my_location.metadata_url = self.scraped_open_metadata_url
        my_location.license = self.scraped_license
        my_location.doi = self.related_pub_doi
        my_location.evidence = self.open_version_source_string
        my_location.match_type = self.match_type
        my_location.pmh_id = self.base_id
        my_location.endpoint_id = self.endpoint_id
        my_location.base_doc = self.base_doc
        my_location.error = ""

        if self.is_open and not my_location.best_url:
            my_location.metadata_url = self.url

        return my_location

    def set_r_for_pdf(self):
        self.r = None
        try:
            self.r = http_get(url=self.scraped_pdf_url, stream=False,
                              publisher=self.publisher, session_id=self.session_id,
                              ask_slowly=self.ask_slowly)
        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in set_r_for_pdf: {}".format(self.scraped_pdf_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in set_r_for_pdf: {}".format(self.scraped_pdf_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in set_r_for_pdf: {}".format(self.scraped_pdf_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        # ChunkedEncodingError subclasses RequestException, so it must be caught first
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in set_r_for_pdf: {}".format(self.scraped_pdf_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException in set_r_for_pdf"
            logger.info(self.error)
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in set_r_for_pdf: {}".format(self.scraped_pdf_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except Exception as e:
            self.error += u"ERROR: Exception error in set_r_for_pdf"
            logger.exception(self.error)

    def is_a_pdf_page(self):
        return is_a_pdf_page(self.r, self.publisher)

    def gets_a_pdf(self, link, base_url):
        if is_purchase_link(link):
            return False

        absolute_url = get_link_target(link.href, base_url)
        if DEBUG_SCRAPING:
            logger.info(u"checking to see if {} is a pdf".format(absolute_url))

        start = time()
        try:
            self.r = http_get(absolute_url, stream=True, publisher=self.publisher,
                              session_id=self.session_id, ask_slowly=self.ask_slowly)

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # is unauthorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in gets_a_pdf".format(self.r.status_code, absolute_url)
                return False

            if self.is_a_pdf_page():
                return True

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        # ChunkedEncodingError subclasses RequestException, so it must be caught first
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException error in gets_a_pdf"
            logger.info(self.error)
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error in gets_a_pdf for {}: {}".format(absolute_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
        except Exception as e:
            self.error += u"ERROR: Exception error in gets_a_pdf"
            logger.exception(self.error)

        if DEBUG_SCRAPING:
            logger.info(u"we've decided this ain't a PDF. took {} seconds [{}]".format(
                elapsed(start), absolute_url))
        return False

    def gets_a_word_doc(self, link, base_url):
        if is_purchase_link(link):
            return False

        absolute_url = get_link_target(link.href, base_url)
        if DEBUG_SCRAPING:
            logger.info(u"checking to see if {} is a word doc".format(absolute_url))

        start = time()
        try:
            r = http_get(absolute_url, stream=True, publisher=self.publisher,
                         session_id=self.session_id, ask_slowly=self.ask_slowly)

            if r.status_code != 200:
                return False

            if is_a_word_doc(r):
                return True

        except Exception as e:
            logger.exception(u'error in gets_a_word_doc: {}'.format(e))

        return False

    def is_known_bad_link(self, link):
        if re.search(ur'^https?://repositorio\.uchile\.cl/handle', self.url):
            # these are abstracts
            return re.search(ur'item_\d+\.pdf', link.href or u'')

        if re.search(ur'^https?://dial\.uclouvain\.be', self.r.url):
            # disclaimer parameter is an unstable key
            return re.search(ur'downloader\.php\?.*disclaimer=', link.href or u'')

        if re.search(ur'^https?://(?:www\.)?goodfellowpublishers\.com', self.r.url):
            return re.search(ur'free_files/', link.href or u'', re.IGNORECASE)

        if re.search(ur'^https?://(?:www\.)?intellectbooks\.com', self.r.url):
            return re.search(ur'_nfc', link.href or u'', re.IGNORECASE)

        if re.search(ur'^https?://philpapers.org/rec/FISBAI', self.r.url):
            return link.href and link.href.endswith(u'FISBAI.pdf')

        bad_meta_pdf_links = [
            ur'^https?://cora\.ucc\.ie/bitstream/',  # https://cora.ucc.ie/handle/10468/3838
            ur'^https?://zefq-journal\.com/',  # https://zefq-journal.com/article/S1865-9217(09)00200-1/pdf
            ur'^https?://www\.nowpublishers\.com/',  # https://www.nowpublishers.com/article/Details/ENT-062
        ]

        if link.anchor == '<meta citation_pdf_url>':
            for url_pattern in bad_meta_pdf_links:
                if re.search(url_pattern, link.href or u''):
                    return True

        bad_meta_pdf_sites = [
            # https://researchonline.federation.edu.au/vital/access/manager/Repository/vital:11142
            ur'^https?://researchonline\.federation\.edu\.au/vital/access/manager/Repository/',
            ur'^https?://www.dora.lib4ri.ch/[^/]*/islandora/object/',
            ur'^https?://ifs\.org\.uk/publications/',  # https://ifs.org.uk/publications/14795
        ]

        if link.anchor == '<meta citation_pdf_url>':
            for url_pattern in bad_meta_pdf_sites:
                if re.search(url_pattern, self.r.url or u''):
                    return True

        return False

    def filter_link(self, link):
        return None if not link or self.is_known_bad_link(link) else link

    def find_pdf_link(self, page):
        if DEBUG_SCRAPING:
            logger.info(u"in find_pdf_link in {}".format(self.url))

        # before looking in links, look in meta for the pdf link
        # = open journal http://onlinelibrary.wiley.com/doi/10.1111/j.1461-0248.2011.01645.x/abstract
        # = open journal http://doi.org/10.1002/meet.2011.14504801327
        # = open repo http://hdl.handle.net/10088/17542
        # = open http://handle.unsw.edu.au/1959.4/unsworks_38708 cc-by

        # logger.info(page)

        links = [get_pdf_in_meta(page)] + [get_pdf_from_javascript(page)] + get_useful_links(page)

        for link in [x for x in links if x is not None]:
            if DEBUG_SCRAPING:
                logger.info(u"trying {}, {} in find_pdf_link".format(link.href, link.anchor))

            if self.is_known_bad_link(link):
                continue

            # there are some links that are SURELY NOT the pdf for this article
            if has_bad_anchor_word(link.anchor):
                continue

            # there are some links that are SURELY NOT the pdf for this article
            if has_bad_href_word(link.href):
                continue

            # don't include links with newlines
            if link.href and u"\n" in link.href:
                continue

            if link.href.startswith(u'#'):
                continue

            # download link ANCHOR text is something like "manuscript.pdf" or like "PDF (1 MB)"
            # = open repo http://hdl.handle.net/1893/372
            # = open repo https://research-repository.st-andrews.ac.uk/handle/10023/7421
            # = open repo http://dro.dur.ac.uk/1241/
            if link.anchor and "pdf" in link.anchor.lower():
                return link

            # button says download
            # = open repo https://works.bepress.com/ethan_white/45/
            # = open repo http://ro.uow.edu.au/aiimpapers/269/
            # = open repo http://eprints.whiterose.ac.uk/77866/
            if "download" in link.anchor:
                if "citation" in link.anchor:
                    pass
                else:
                    return link

            # want it to match for this one https://doi.org/10.2298/SGS0603181L
            # but not this one: 10.1097/00003643-201406001-00238
            if self.publisher and not self.is_same_publisher("Ovid Technologies (Wolters Kluwer Health)"):
                if link.anchor and "full text" in link.anchor.lower():
                    return link

            # download link is identified with an image
            for img in link.findall(".//img"):
                try:
                    if "pdf" in img.attrib["src"].lower() or "pdf" in img.attrib["class"].lower():
                        return link
                except KeyError:
                    pass

            try:
                if "pdf" in link.attrib["title"].lower():
                    return link
                if "download/pdf" in link.href:
                    return link
            except KeyError:
                pass

            anchor = link.anchor or ''
            href = link.href or ''
            version_labels = ['submitted version', 'accepted version', 'published version']
            if anchor.lower() in version_labels and href.lower().endswith('.pdf'):
                return link

        return None

    def __repr__(self):
        return u"<{} ({}) {}>".format(self.__class__.__name__, self.url, self.is_open)


class PublisherWebpage(Webpage):
    open_version_source_string = u"publisher landing page"

    @property
    def ask_slowly(self):
        return True

    @staticmethod
    def use_resolved_landing_url(resolved_url):
        resolved_hostname = urlparse(resolved_url).hostname
        return resolved_hostname and resolved_hostname.endswith('journals.lww.com')

    def is_known_bad_link(self, link):
        if super(PublisherWebpage, self).is_known_bad_link(link):
            return True

        if re.search(ur'^https?://www.reabic.net/journals/bir/', self.r.url):
            # doi.org urls go to an issue page with links for all articles, e.g. https://doi.org/10.3391/bir.2019.8.1.08
            return True

        if re.search(ur'^https?://nnw.cz', self.r.url):
            # doi.org urls go to an issue page with links for all articles, e.g. http://nnw.cz/obsahy15.html#25.033
            return True

        return False

    def _trust_pdf_landing_pages(self):
        if is_same_publisher(self.publisher, 'Oxford University Press (OUP)'):
            return False

        return True

    def scrape_for_fulltext_link(self, find_pdf_link=True):
        landing_url = self.url

        if DEBUG_SCRAPING:
            logger.info(u"checking to see if {} says it is open".format(landing_url))

        start = time()
        try:
            self.r = http_get(landing_url, stream=True, publisher=self.publisher,
                              session_id=self.session_id, ask_slowly=self.ask_slowly)
            self.resolved_url = self.r.url
            resolved_host = urlparse(self.resolved_url).hostname or u''
            metadata_url = self.resolved_url if self.use_resolved_landing_url(self.resolved_url) else landing_url

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # is unauthorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link, skipping.".format(self.r.status_code, self.r.url)
                logger.info(u"DIDN'T GET THE PAGE: {}".format(self.error))
                # logger.debug(self.r.request.headers)
                return

            # example 10.1007/978-3-642-01445-1
            if u"crossref.org/_deleted-doi/" in self.resolved_url:
                logger.info(u"this is a deleted doi")
                return

            # if our landing_url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if self._trust_pdf_landing_pages():
                    if DEBUG_SCRAPING:
                        logger.info(u"this is a PDF. success! [{}]".format(landing_url))
                    self.scraped_pdf_url = landing_url
                    self.open_version_source_string = "open (via free pdf)"
                elif DEBUG_SCRAPING:
                    logger.info(u"landing page is an untrustworthy PDF {}".format(landing_url))

                # don't bother looking for open access lingo because it is a PDF (or PDF wannabe)
                return
            else:
                if DEBUG_SCRAPING:
                    logger.info(u"landing page is not a PDF for {}. continuing more checks".format(landing_url))

            # get the HTML tree
            page = self.r.content_small()

            # get IEEE PDF from script. we might need it later.
            ieee_pdf = resolved_host.endswith(u'ieeexplore.ieee.org') and re.search(ur'"pdfPath":\s*"(/ielx?7/[\d/]*\.pdf)"', page)

            try:
                soup = BeautifulSoup(page, 'html.parser')
                [script.extract() for script in soup('script')]
                [div.extract() for div in soup.find_all("div", {'class': 'table-of-content'})]

                if self.is_same_publisher('Wiley'):
                    [div.extract() for div in soup.find_all('div', {'class': 'hubpage-menu'})]

                page = str(soup)
            except HTMLParseError as e:
                logger.error(u'error parsing html, skipped script removal: {}'.format(e))

            # Look for a pdf link. If we find one, look for a license.

            pdf_download_link = self.find_pdf_link(page) if find_pdf_link else None

            # if we haven't found a pdf yet, try known patterns
            if pdf_download_link is None:
                if ieee_pdf:
                    pdf_download_link = DuckLink(ieee_pdf.group(1).replace('iel7', 'ielx7'), 'download')

            if pdf_download_link is not None:
                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_pdf_url = pdf_url
                    self.scraped_open_metadata_url = metadata_url
                    self.open_version_source_string = "open (via free pdf)"

                    # set the license if we can find one
                    scraped_license = _trust_publisher_license(self.resolved_url) and find_normalized_license(page)
                    if scraped_license:
                        self.scraped_license = scraped_license

            # Look for patterns that indicate availability but not necessarily openness and make this a bronze location.
            bronze_url_snippet_patterns = [
                ('sciencedirect.com/', u'<div class="OpenAccessLabel">open archive</div>'),
                ('onlinelibrary.wiley.com', u'<div[^>]*class="doi-access"[^>]*>Free Access</div>'),
                ('openedition.org', ur'<span[^>]*id="img-freemium"[^>]*></span>'),
                ('openedition.org', ur'<span[^>]*id="img-openaccess"[^>]*></span>'),
                # landing page html is invalid: <span class="accesstext"></span>Free</span>
                ('microbiologyresearch.org', ur'<span class="accesstext">(?:</span>)?Free'),
                ('journals.lww.com', ur'<li[^>]*id="[^"]*-article-indicators-free"[^>]*>'),
                ('ashpublications.org', ur'<i[^>]*class="[^"]*icon-availability_free'),
            ]

            for (url_snippet, pattern) in bronze_url_snippet_patterns:
                if url_snippet in self.resolved_url.lower() and re.findall(pattern, page, re.IGNORECASE | re.DOTALL):
                    self.scraped_open_metadata_url = metadata_url
                    self.open_version_source_string = "open (via free article)"

            bronze_publisher_patterns = [
                ("New England Journal of Medicine (NEJM/MMS)", u'<meta content="yes" name="evt-free"'),
                ("Massachusetts Medical Society", u'<meta content="yes" name="evt-free"'),
            ]

            for (publisher, pattern) in bronze_publisher_patterns:
                if self.is_same_publisher(publisher) and re.findall(pattern, page, re.IGNORECASE | re.DOTALL):
                    self.scraped_open_metadata_url = metadata_url
                    self.open_version_source_string = "open (via free article)"

            bronze_citation_pdf_patterns = [
                r'^https?://www\.sciencedirect\.com/science/article/pii/S[0-9X]+/pdf(?:ft)?\?md5=[0-9a-f]+.*[0-9x]+-main.pdf$'
            ]

            citation_pdf_link = get_pdf_in_meta(page)

            if citation_pdf_link and citation_pdf_link.href:
                for pattern in bronze_citation_pdf_patterns:
                    if re.findall(pattern, citation_pdf_link.href, re.IGNORECASE | re.DOTALL):
                        logger.info(u'found bronzish citation_pdf_url {}'.format(citation_pdf_link.href))
                        self.scraped_open_metadata_url = metadata_url
                        self.open_version_source_string = "open (via free article)"

            # Look for some license-like patterns that make this a hybrid location.
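            # ("Hybrid" here means the page affirmatively claims Open Access or
            # carries a license, so scraped_license is set as well.)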
            hybrid_url_snippet_patterns = [
                ('projecteuclid.org/', u'<strong>Full-text: Open access</strong>'),
                ('sciencedirect.com/', u'<div class="OpenAccessLabel">open access</div>'),
                ('journals.ametsoc.org/', ur'src="/templates/jsp/_style2/_ams/images/access_free\.gif"'),
                ('apsjournals.apsnet.org', ur'src="/products/aps/releasedAssets/images/open-access-icon\.png"'),
                ('psychiatriapolska.pl', u'is an Open Access journal:'),
                ('journals.lww.com', u'<span class="[^>]*ejp-indicator--free'),
                ('journals.lww.com', ur'<img[^>]*src="[^"]*/icon-access-open\.gif"[^>]*>'),
                ('iospress.com', ur'<img[^>]*src="[^"]*/img/openaccess_icon.png[^"]*"[^>]*>'),
                ('rti.org/', ur'</svg>[^<]*Open Access[^<]*</span>'),
            ]

            for (url_snippet, pattern) in hybrid_url_snippet_patterns:
                if url_snippet in self.resolved_url.lower() and re.findall(pattern, page, re.IGNORECASE | re.DOTALL):
                    self.scraped_open_metadata_url = metadata_url
                    self.open_version_source_string = "open (via page says Open Access)"
                    self.scraped_license = "implied-oa"

            hybrid_publisher_patterns = [
                ("Informa UK Limited", u"/accessOA.png"),
                ("Oxford University Press (OUP)", u"<i class='icon-availability_open'"),
                ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"isOpenAccess":true'),
                ("Institute of Electrical and Electronics Engineers (IEEE)", ur'"openAccessFlag":"yes"'),
                ("Royal Society of Chemistry (RSC)", u"/open_access_blue.png"),
                ("Cambridge University Press (CUP)", u'<span class="icon access open-access cursorDefault">'),
                ("Wiley", ur'<div[^>]*class="doi-access"[^>]*>Open Access</div>'),
            ]

            for (publisher, pattern) in hybrid_publisher_patterns:
                if self.is_same_publisher(publisher) and re.findall(pattern, page, re.IGNORECASE | re.DOTALL):
                    self.scraped_open_metadata_url = metadata_url
                    self.open_version_source_string = "open (via page says Open Access)"
                    self.scraped_license = "implied-oa"

            # Look for more license-like patterns that make this a hybrid location.
            # Extract the specific license if present.

            license_patterns = [
                ur"(creativecommons.org/licenses/[a-z\-]+)",
                u"distributed under the terms (.*) which permits",
                u"This is an open access article under the terms (.*) which permits",
                u"This is an open-access article distributed under the terms (.*), where it is permissible",
                u"This is an open access article published under (.*) which permits",
                u'<div class="openAccess-articleHeaderContainer(.*?)</div>',
                ur'this article is published under the creative commons (.*) licence',
            ]

            if _trust_publisher_license(self.resolved_url):
                for pattern in license_patterns:
                    matches = re.findall(pattern, page, re.IGNORECASE)
                    if matches:
                        self.scraped_open_metadata_url = metadata_url
                        normalized_license = find_normalized_license(matches[0])
                        self.scraped_license = normalized_license or 'implied-oa'
                        if normalized_license:
                            self.open_version_source_string = 'open (via page says license)'
                        else:
                            self.open_version_source_string = 'open (via page says Open Access)'

            if self.is_open:
                if DEBUG_SCRAPING:
                    logger.info(u"we've decided this is open! took {} seconds [{}]".format(
                        elapsed(start), landing_url))
                return True
            else:
                if DEBUG_SCRAPING:
                    logger.info(u"we've decided this doesn't say open. took {} seconds [{}]".format(
                        elapsed(start), landing_url))
                return False

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        # ChunkedEncodingError subclasses RequestException, so it must be caught first
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException error in scrape_for_fulltext_link"
            logger.info(self.error)
            return False
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error in scrape_for_fulltext_link on {}: {}".format(landing_url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return False
        except Exception as e:
            self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
            logger.exception(self.error)
            return False


def _trust_repo_license(resolved_url):
    hostname = urlparse(resolved_url).hostname
    if not hostname:
        return False

    trusted_hosts = ['babel.hathitrust.org']

    for host in trusted_hosts:
        if hostname.endswith(host):
            return True

    return False


def _try_pdf_link_as_doc(resolved_url):
    hostname = urlparse(resolved_url).hostname
    if not hostname:
        return False

    doc_hosts = ['paleorxiv.org']

    for host in doc_hosts:
        if hostname.endswith(host):
            return True

    return False


def _trust_publisher_license(resolved_url):
    hostname = urlparse(resolved_url).hostname
    if not hostname:
        return True

    untrusted_hosts = [
        'indianjournalofmarketing.com',
        'rupress.org',
        'rnajournal.cshlp.org',
        'press.umich.edu',
        'genome.cshlp.org',
    ]

    for host in untrusted_hosts:
        if hostname.endswith(host):
            logger.info(u'not trusting license from {}'.format(host))
            return False

    return True


# abstract. inherited by PmhRepoWebpage
class RepoWebpage(Webpage):
    @property
    def open_version_source_string(self):
        return self.base_open_version_source_string

    def scrape_for_fulltext_link(self, find_pdf_link=True):
        url = self.url

        dont_scrape_list = [
            u"ncbi.nlm.nih.gov",
            u"europepmc.org",
            u"/europepmc/",
            u"pubmed",
            u"elar.rsvpu.ru",  # these ones based on complaint in email
            u"elib.uraic.ru",
            u"elar.usfeu.ru",
            u"elar.urfu.ru",
            u"elar.uspu.ru",
        ]
        for url_fragment in dont_scrape_list:
            if url_fragment in url:
                logger.info(u"not scraping {} because it is on our do-not-scrape list.".format(url))
                return

        try:
            self.r = http_get(url, stream=True, publisher=self.publisher,
                              session_id=self.session_id, ask_slowly=self.ask_slowly)
            self.resolved_url = self.r.url

            if self.r.status_code != 200:
                if self.r.status_code in [401]:
                    # not authorized, so not open
                    pass
                else:
                    self.error += u"ERROR: status_code={} on {} in scrape_for_fulltext_link".format(self.r.status_code, url)
                return

            # if our url redirects to a pdf, we're done.
            # = open repo http://hdl.handle.net/2060/20140010374
            if self.is_a_pdf_page():
                if accept_direct_pdf_links(self.resolved_url):
                    if DEBUG_SCRAPING:
                        logger.info(u"this is a PDF. success! [{}]".format(self.resolved_url))
                    self.scraped_pdf_url = url
                else:
                    if DEBUG_SCRAPING:
                        logger.info(u"ignoring direct pdf link {}".format(self.resolved_url))
                return
            else:
                if DEBUG_SCRAPING:
                    logger.info(u"is not a PDF for {}. continuing more checks".format(url))

            if is_a_word_doc(self.r):
                if DEBUG_SCRAPING:
                    logger.info(u"this is a word doc. success! [{}]".format(url))
                self.scraped_open_metadata_url = url
                return

            # now before reading the content, bail if it's too large
            if is_response_too_large(self.r):
                logger.info(u"landing page is too large, skipping")
                return

            # get the HTML tree
            page = self.r.content_small()

            # remove script tags
            try:
                soup = BeautifulSoup(page, 'html.parser')
                [script.extract() for script in soup('script')]
                page = str(soup)
            except HTMLParseError as e:
                logger.error(u'error parsing html, skipped script removal: {}'.format(e))

            # set the license if we can find one
            scraped_license = find_normalized_license(page)
            if scraped_license:
                self.scraped_license = scraped_license

            pdf_download_link = None
            # special exception for citeseer because we want the pdf link where
            # the copy is on the third-party repo, not the cached link, if we can get it
            if url and u"citeseerx.ist.psu.edu/" in url:
                matches = re.findall(u'<h3>Download Links</h3>.*?href="(.*?)"', page, re.DOTALL)
                if matches:
                    pdf_download_link = DuckLink(unicode(matches[0], "utf-8"), "download")

            # osf doesn't have their download link in their pages,
            # so look at the page contents to see if it is osf-hosted;
            # if so, compute the url. example: http://osf.io/tyhqm
            elif page and u"osf-cookie" in unicode(page, "utf-8", errors='replace'):
                pdf_download_link = DuckLink(u"{}/download".format(url), "download")

            # otherwise look for it the normal way
            else:
                pdf_download_link = self.find_pdf_link(page)

            if pdf_download_link is None:
                if re.search(ur'https?://cdm21054\.contentdm\.oclc\.org/digital/collection/IR/id/(\d+)', self.resolved_url):
                    pdf_download_link = DuckLink(
                        '/digital/api/collection/IR/id/{}/download'.format(
                            re.search(
                                ur'https?://cdm21054\.contentdm\.oclc\.org/digital/collection/IR/id/(\d+)',
                                self.resolved_url
                            ).group(1)
                        ),
                        'download'
                    )

            if pdf_download_link is not None:
                if DEBUG_SCRAPING:
                    logger.info(u"found a PDF download link: {} {} [{}]".format(
                        pdf_download_link.href, pdf_download_link.anchor, url))

                pdf_url = get_link_target(pdf_download_link.href, self.r.url)
                # if they are linking to a PDF, we need to follow the link to make sure it's legit
                if DEBUG_SCRAPING:
                    logger.info(u"checking to see the PDF link actually gets a PDF [{}]".format(url))

                if (pdf_download_link.anchor == u'<meta citation_pdf_url>' and
                        re.match(r'https?://(www\.)?osti\.gov/servlets/purl/[0-9]+', pdf_url)):
                    # try the pdf URL with cookies
                    osti_pdf_response = http_get(
                        pdf_url, stream=True, publisher=self.publisher,
                        session_id=self.session_id, ask_slowly=self.ask_slowly,
                        cookies=self.r.cookies
                    )

                    if is_a_pdf_page(osti_pdf_response, self.publisher):
                        self.scraped_open_metadata_url = url
                        direct_pdf_url = osti_pdf_response.url

                        # make sure the resolved PDF URL works without cookies before saving it
                        direct_pdf_response = http_get(
                            direct_pdf_url, stream=True, publisher=self.publisher,
                            session_id=self.session_id, ask_slowly=self.ask_slowly
                        )

                        if is_a_pdf_page(direct_pdf_response, self.publisher):
                            self.scraped_pdf_url = osti_pdf_response.url
                            self.r = direct_pdf_response

                    return

                if self.gets_a_pdf(pdf_download_link, self.r.url):
                    self.scraped_open_metadata_url = url
                    if not _discard_pdf_url(pdf_url):
                        self.scraped_pdf_url = pdf_url
                    return

            # try this later because we would rather get a pdf.
            # if they are linking to a .docx or similar, this is open.
            doc_link = find_doc_download_link(page)

            if doc_link is None and _try_pdf_link_as_doc(self.resolved_url):
                doc_link = pdf_download_link

            if doc_link is not None:
                absolute_doc_url = get_link_target(doc_link.href, self.resolved_url)
                if DEBUG_SCRAPING:
                    logger.info(u"found a possible .doc download link [{}]".format(absolute_doc_url))

                if self.gets_a_word_doc(doc_link, self.r.url):
                    if DEBUG_SCRAPING:
                        logger.info(u"we've decided this is a word doc. [{}]".format(absolute_doc_url))
                    self.scraped_open_metadata_url = url
                    return
                else:
                    if DEBUG_SCRAPING:
                        logger.info(u"we've decided this ain't a word doc. [{}]".format(absolute_doc_url))

            bhl_link = find_bhl_view_link(self.resolved_url, page)
            if bhl_link is not None:
                logger.info('found a BHL document link: {}'.format(get_link_target(bhl_link.href, self.resolved_url)))
                self.scraped_open_metadata_url = url
                return

            if _trust_repo_license(self.resolved_url) and self.scraped_license:
                logger.info(u'trusting license {}'.format(self.scraped_license))
                self.scraped_open_metadata_url = self.url

        except requests.exceptions.ConnectionError as e:
            self.error += u"ERROR: connection error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.Timeout as e:
            self.error += u"ERROR: timeout error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.InvalidSchema as e:
            self.error += u"ERROR: InvalidSchema error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        # ChunkedEncodingError subclasses RequestException, so it must be caught first
        except requests.exceptions.ChunkedEncodingError as e:
            self.error += u"ERROR: ChunkedEncodingError error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except requests.exceptions.RequestException as e:
            self.error += u"ERROR: RequestException in scrape_for_fulltext_link"
            logger.info(self.error)
            return
        except NoDoiException as e:
            self.error += u"ERROR: NoDoiException error on {} in scrape_for_fulltext_link: {}".format(url, unicode(e.message).encode("utf-8"))
            logger.info(self.error)
            return
        except Exception as e:
            self.error += u"ERROR: Exception error in scrape_for_fulltext_link"
            logger.exception(self.error)
            return

        if DEBUG_SCRAPING:
            logger.info(u"found no PDF download link. end of the line. [{}]".format(url))

        return self


def accept_direct_pdf_links(url):
    if re.match(ur'^https?://pure\.mpg\.de', url):
        # direct pdf links go to supplementary materials
        return False

    return True


class PmhRepoWebpage(RepoWebpage):
    @property
    def base_open_version_source_string(self):
        if self.match_type:
            return u"oa repository (via OAI-PMH {} match)".format(self.match_type)
        return u"oa repository (via OAI-PMH)"


def find_doc_download_link(page):
    for link in get_useful_links(page):
        # there are some links that are FOR SURE not the download for this article
        if has_bad_href_word(link.href):
            continue

        if has_bad_anchor_word(link.anchor):
            continue

        # = open repo https://lirias.kuleuven.be/handle/123456789/372010
        if ".doc" in link.href or ".doc" in link.anchor:
            if DEBUG_SCRAPING:
                logger.info(u"link details: {} {}".format(link.href, link.anchor))
            return link

    return None


def find_bhl_view_link(url, page_content):
    hostname = urlparse(url).hostname
    if not (hostname and hostname.endswith(u'biodiversitylibrary.org')):
        return None

    view_links = [link for link in get_useful_links(page_content) if link.anchor == 'view article']
    return view_links[0] if view_links else None


class DuckLink(object):
    def __init__(self, href, anchor):
        self.href = href
        self.anchor = anchor


def get_useful_links(page):
    links = []

    tree = get_tree(page)
    if tree is None:
        return []

    # remove related content sections
    bad_section_finders = [
        # references and related content sections
        "//div[@class='relatedItem']",  # http://www.tandfonline.com/doi/abs/10.4161/auto.19496
        "//div[@class='citedBySection']",  # 10.3171/jns.1966.25.4.0458
        "//div[@class='references']",  # https://www.emeraldinsight.com/doi/full/10.1108/IJCCSM-04-2017-0089
        "//div[@class='moduletable']",  # http://vestnik.mrsu.ru/index.php/en/articles2-en/80-19-1/671-10-15507-0236-2910-029-201901-1
        "//div[contains(@class, 'ref-list')]",  # https://www.jpmph.org/journal/view.php?doi=10.3961/jpmph.16.069
        "//div[@id='supplementary-material']",  # https://www.jpmph.org/journal/view.php?doi=10.3961/jpmph.16.069
        "//div[@id='toc']",  # https://www.elgaronline.com/view/edcoll/9781781004326/9781781004326.xml
        "//div[contains(@class, 'cta-guide-authors')]",  # https://www.journals.elsevier.com/physics-of-the-dark-universe/
        "//div[contains(@class, 'footer-publication')]",  # https://www.journals.elsevier.com/physics-of-the-dark-universe/
        "//d-appendix",  # https://distill.pub/2017/aia/
        "//dt-appendix",  # https://distill.pub/2016/handwriting/
        "//div[starts-with(@id, 'dt-cite')]",  # https://distill.pub/2017/momentum/
        "//ol[contains(@class, 'ref-item')]",  # http://www.cjcrcn.org/article/html_9778.html
        "//div[contains(@class, 'NLM_back')]",  # https://pubs.acs.org/doi/10.1021/acs.est.7b05624
        "//div[contains(@class, 'NLM_citation')]",  # https://pubs.acs.org/doi/10.1021/acs.est.7b05624
        "//div[@id='relatedcontent']",  # https://pubs.acs.org/doi/10.1021/acs.est.7b05624
        "//div[@id='author-infos']",  # https://www.tandfonline.com/doi/full/10.1080/01639374.2019.1670767
        "//ul[@id='book-metrics']",  # https://link.springer.com/book/10.1007%2F978-3-319-63811-9
        "//section[@id='article_references']",  # https://www.nejm.org/doi/10.1056/NEJMms1702111
        "//section[@id='SupplementaryMaterial']",  # https://link.springer.com/article/10.1057%2Fs41267-018-0191-3
        "//div[@id='attach_additional_files']",  # https://digitalcommons.georgiasouthern.edu/ij-sotl/vol5/iss2/14/
        "//span[contains(@class, 'fa-lock')]",  # https://www.dora.lib4ri.ch/eawag/islandora/object/eawag%3A15303
        "//ul[@id='reflist']",  # https://elibrary.steiner-verlag.de/article/10.25162/sprib-2019-0002
        "//div[@class='listbibl']",  # http://sk.sagepub.com/reference/the-sage-handbook-of-television-studies
        "//div[contains(@class, 'summation-section')]",  # https://www.tandfonline.com/eprint/EHX2T4QAGTIYVPK7MJBF/full?target=10.1080/20507828.2019.1614768
        "//ul[contains(@class, 'references')]",  # https://www.tandfonline.com/eprint/EHX2T4QAGTIYVPK7MJBF/full?target=10.1080/20507828.2019.1614768
        "//p[text()='References']/following-sibling::p",  # http://researcherslinks.com/current-issues/Effect-of-Different-Temperatures-on-Colony/20/1/2208/html
        "//span[contains(@class, 'ref-lnk')]",  # https://www.tandfonline.com/doi/full/10.1080/19386389.2017.1285143
        "//div[@id='referenceContainer']",  # https://www.jbe-platform.com/content/journals/10.1075/ld.00050.kra
        "//div[contains(@class, 'table-of-content')]",  # https://onlinelibrary.wiley.com/doi/book/10.1002/9781118897126
        "//img[contains(@src, 'supplementary_material')]/following-sibling::p",  # https://pure.mpg.de/pubman/faces/ViewItemOverviewPage.jsp?itemId=item_2171702

        # can't tell what chapter/section goes with what doi
        "//div[@id='booktoc']",  # https://link.springer.com/book/10.1007%2F978-3-319-63811-9
        "//div[@id='tocWrapper']",  # https://www.elgaronline.com/view/edcoll/9781786431417/9781786431417.xml
    ]

    for section_finder in bad_section_finders:
        for bad_section in tree.xpath(section_finder):
            bad_section.clear()

    # now get the links
    link_elements = tree.xpath("//a")

    for link in link_elements:
        link_text = link.text_content().strip().lower()
        if link_text:
            link.anchor = link_text
            if "href" in link.attrib:
                link.href = link.attrib["href"]
        elif u'title' in link.attrib and u'download fulltext' in link.attrib[u'title'].lower():
            link.anchor = u'title: {}'.format(link.attrib[u'title'])
            if u'href' in link.attrib:
                link.href = link.attrib[u'href']
        else:
            # also a useful link if it has a solo image in it, and that image includes "pdf" in its filename
            link_content_elements = [l for l in link]
            if len(link_content_elements) == 1:
                link_insides = link_content_elements[0]
                if link_insides.tag == "img":
                    if "src" in link_insides.attrib and "pdf" in link_insides.attrib["src"]:
                        link.anchor = u"image: {}".format(link_insides.attrib["src"])
                        if "href" in link.attrib:
                            link.href = link.attrib["href"]

        if hasattr(link, "anchor") and hasattr(link, "href"):
            links.append(link)

    return links


def is_purchase_link(link):
    # = closed journal http://www.sciencedirect.com/science/article/pii/S0147651300920050
    if "purchase" in link.anchor:
        logger.info(u"found a purchase link! {} {}".format(link.anchor, link.href))
        return True

    return False


def has_bad_href_word(href):
    href_blacklist = [
        # = closed 10.1021/acs.jafc.6b02480
        # editorial and advisory board
        "/eab/",
        # = closed 10.1021/acs.jafc.6b02480
        "/suppl_file/",
        # https://lirias.kuleuven.be/handle/123456789/372010
        "supplementary+file",
        # http://www.jstor.org/action/showSubscriptions
        "showsubscriptions",
        # 10.7763/ijiet.2014.v4.396
        "/faq",
        # 10.1515/fabl.1988.29.1.21
        "{{",
        # 10.2174/1389450116666150126111055
        "cdt-flyer",
        # 10.1111/fpa.12048
        "figures",
        # https://www.crossref.org/iPage?doi=10.3138%2Fecf.22.1.1
        "price-lists",
        # https://aaltodoc.aalto.fi/handle/123456789/30772
        "aaltodoc_pdf_a.pdf",
        # prescribing information, see http://www.nejm.org/doi/ref/10.1056/NEJMoa1509388#t=references
        "janssenmd.com",
        # prescribing information, see http://www.nejm.org/doi/ref/10.1056/NEJMoa1509388#t=references
        "community-register",
        # prescribing information, see http://www.nejm.org/doi/ref/10.1056/NEJMoa1509388#t=references
        "quickreference",
        # 10.4158/ep.14.4.458
        "libraryrequestform",
        # http://www.nature.com/nutd/journal/v6/n7/full/nutd201620a.html
        "iporeport",
        # https://ora.ox.ac.uk/objects/uuid:06829078-f55c-4b8e-8a34-f60489041e2a
        "no_local_copy",
        ".zip",
        # https://zenodo.org/record/1238858
        ".gz",
        # https://zenodo.org/record/1238858
        ".tar.",
        # http://www.bioone.org/doi/full/10.1642/AUK-18-8.1
        "/doi/full/10.1642",
        # dating site :( 10.1137/S0036142902418680 http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.144.7627
        "hyke.org",
        # is a citation http://orbit.dtu.dk/en/publications/autonomous-multisensor-microsystem-for-measurement-of-ocean-water-salinity(1dea807b-c309-40fd-a623-b6c28999f74f).html
        "&rendering=",
        ".fmatter",
        "/samples/",
        # http://ira.lib.polyu.edu.hk/handle/10397/78907
        "letter_to_publisher",
        # https://www.sciencedirect.com/science/article/abs/pii/S1428226796700911?via%3Dihub
        'first-page',
        # https://www.mitpressjournals.org/doi/abs/10.1162/evco_a_00219
        'lib_rec_form',
        # http://www.eurekaselect.com/107875/chapter/climate-change-and-snow-cover-in-the-european-alp
        'ebook-flyer',
        # http://digital.csic.es/handle/10261/134122
        'accesoRestringido',
        # https://www.springer.com/statistics/journal/11222
        '/productFlyer/',
        # https://touroscholar.touro.edu/nymc_fac_pubs/622/
        '/author_agreement',
        # http://orca.cf.ac.uk/115888/
        'supinfo.pdf',
        # http://orca.cf.ac.uk/619/
        '/Appendix',
        # https://digitalcommons.fairfield.edu/business-facultypubs/31/
        'content_policy.pdf',
        # http://cds.cern.ch/record/1338672
        'BookTOC.pdf',
        'BookBackMatter.pdf',
        # https://www.goodfellowpublishers.com/academic-publishing.php?content=doi&doi=10.23912/9781911396512-3599
        'publishers-catalogue',
        # https://orbi.uliege.be/handle/2268/212705
        "_toc_",
        # https://pubs.usgs.gov/of/2004/1004/
        "adobe.com/products/acrobat",
        # https://physics.aps.org/articles/v13/31
        "featured-article-pdf",
        # http://www.jstor.org.libezproxy.open.ac.uk/stable/1446650
        "modern-slavery-act-statement.pdf",
        # https://pearl.plymouth.ac.uk/handle/10026.1/15597
        "Deposit_Agreement",
        # https://www.e-elgar.com/shop/gbp/the-elgar-companion-to-social-economics-second-edition-9781783478538.html
        '/product_flyer/',
        # https://journals.lww.com/jbjsjournal/FullText/2020/05200/Better_Late_Than_Never,_but_Is_Early_Best__.15.aspx
        'links.lww.com/JBJS/F791',
        # https://ctr.utpjournals.press/doi/10.3138/ctr.171.005
        'ctr_media_kit',
        'ctr_advertising_rates',
    ]

    href_whitelist = [
        # https://zenodo.org/record/3831263
        '190317_MainText_Figures_JNNP.pdf',
    ]

    for good_word in href_whitelist:
        if good_word.lower() in href.lower():
            return False

    for bad_word in href_blacklist:
        if bad_word.lower() in href.lower():
            return True

    return False


def has_bad_anchor_word(anchor_text):
    anchor_blacklist = [
        # = closed repo https://works.bepress.com/ethan_white/27/
        "user",
        "guide",
        # = closed 10.1038/ncb3399
        "checklist",
        # wrong link
        "abstracts",
        # http://orbit.dtu.dk/en/publications/autonomous-multisensor-microsystem-for-measurement-of-ocean-water-salinity(1dea807b-c309-40fd-a623-b6c28999f74f).html
        "downloaded publications",
        # https://hal.archives-ouvertes.fr/hal-00085700
        "metadata from the pdf file",
        u"récupérer les métadonnées à partir d'un fichier pdf",
        # = closed http://europepmc.org/abstract/med/18998885
        "bulk downloads",
        # http://www.utpjournals.press/doi/pdf/10.3138/utq.35.1.47
        "license agreement",
        # = closed 10.1021/acs.jafc.6b02480
        "masthead",
        # closed http://eprints.soton.ac.uk/342694/
        "download statistics",
        # no examples for these yet
        "supplement",
        "figure",
        "faq",
        # https://www.biodiversitylibrary.org/bibliography/829
        "download MODS",
        "BibTeX citations",
        "RIS citations",
        'ACS ActiveView PDF',
        # https://doi.org/10.11607/jomi.4336
        'Submission Form',
        # https://doi.org/10.1117/3.651915
        'Sample Pages',
        # https://babel.hathitrust.org/cgi/pt?id=uc1.e0000431916&view=1up&seq=24
        'Download this page',
        'Download left page',
        'Download right page',
        # https://touroscholar.touro.edu/nymc_fac_pubs/622/
        'author agreement',
        # https://www.longwoods.com/content/25849
        'map to our office',
        # https://www.e-elgar.com/shop/the-art-of-mooting
        'download flyer',
        # https://www.nowpublishers.com/article/Details/ENT-062
        'download extract',
        # https://utpjournals.press/doi/full/10.3138/jsp.48.3.137
        'Call for Papers',
        # https://brill.com/view/title/14711
        'View PDF Flyer',
    ]
    for bad_word in anchor_blacklist:
        if bad_word.lower() in anchor_text.lower():
            return True

    return False


def get_pdf_in_meta(page):
    if "citation_pdf_url" in page:
        if DEBUG_SCRAPING:
            logger.info(u"citation_pdf_url in page")

        tree = get_tree(page)
        if tree is not None:
            metas = tree.xpath("//meta")
            for meta in metas:
                if "name" in meta.attrib:
                    if meta.attrib["name"] == "citation_pdf_url":
                        if "content" in meta.attrib:
                            link = DuckLink(href=meta.attrib["content"], anchor="<meta citation_pdf_url>")
                            return _transform_meta_pdf(link, page)
        else:
            # backup if tree fails
            regex = r'<meta name="citation_pdf_url" content="(.*?)">'
            matches = re.findall(regex, page)
            if matches:
                link = DuckLink(href=matches[0], anchor="<meta citation_pdf_url>")
                return _transform_meta_pdf(link, page)
    return None


def _transform_meta_pdf(link, page):
    if link and link.href:
        link.href = re.sub('(https?://[\w\.]*onlinelibrary.wiley.com/doi/)pdf(/.+)', r'\1pdfdirect\2', link.href)
        link.href = re.sub('(^https?://drops\.dagstuhl\.de/.*\.pdf)/$', r'\1', link.href)
        # avoid the preview PDF if a full "_reference" PDF exists
        nature_pdf = re.match(ur'^https?://www\.nature\.com(/articles/[a-z0-9-]*.pdf)', link.href)
        if nature_pdf:
            reference_pdf = re.sub(ur'\.pdf$', '_reference.pdf', nature_pdf.group(1))
            if reference_pdf in page:
                link.href = reference_pdf

    return link


def get_pdf_from_javascript(page):
    matches = re.findall('"pdfUrl":"(.*?)"', page)
    if matches:
        link = DuckLink(href=matches[0], anchor="pdfUrl")
        return link

    return None


def _discard_pdf_url(url):
    # count the landing page as an OA location but don't use the PDF URL
    parsed_url = urlparse(url)

    # these PDF URLs work but aren't stable
    if parsed_url.hostname and parsed_url.hostname.endswith('exlibrisgroup.com') \
            and parsed_url.query and 'Expires=' in parsed_url.query:
        return True

    return False
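

if __name__ == '__main__':
    # A minimal sketch of how these classes are typically driven, with a
    # hypothetical repository URL (assumes the app config, logger, and
    # http_cache plumbing are set up; performs a live HTTP request):
    with PmhRepoWebpage(url=u"https://example.edu/handle/123/456") as webpage:
        webpage.scrape_for_fulltext_link()
        if webpage.is_open:
            print webpage.mint_open_location()
        else:
            print u"not found to be open: {}".format(webpage.error)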