# Python source code of PaperScraper

"""
.. module:: paperscraper
   :synopsis: A class to facilitate the management of multiple scrapers for journal aggregators and publishing sites

.. moduleauthor:: Andriy Mulyar <contact@andriymulyar.com>
"""
from paperscraper.aggregators.doi_aggregator import DOIAggregator
from .aggregators.pubmed_aggregator import PubMedAggregator
from .scrapers.science_direct_scraper import ScienceDirect
from .scrapers.acs_scraper import ACS
from .scrapers.pmc_scraper import PMC
from .scrapers.rsc_scraper import RSC
from selenium import webdriver
import pkg_resources


class PaperScraper():
    """This class provides a direct interface to the inner functionality of paperscraper

    A single headless Chrome driver is created on construction and handed to every scraper and aggregator
    this class instantiates. The driver is shut down by :meth:`__exit__`, so the object can be used as a
    context manager (``with PaperScraper() as ps: ...``).

    .. note::

       This is the only class needed to utilize all features of paperscraper.

    """

    def __init__(self, webdriver_path=None):
        """Creates a PaperScraper object

        Initializes a PaperScraper that can scrape text and meta-data from scientific journals. Individual journal
        scrapers and journal link aggregators are implemented in :mod:`scrapers` and :mod:`aggregators`.

        :param webdriver_path: The file path of a custom web driver to utilize, defaults to utilize the chromedriver
            that comes installed with the package.
        :type webdriver_path: str.

        """
        options = webdriver.ChromeOptions()
        options.add_argument('headless')

        # Only fall back to the bundled chromedriver when the caller did not supply a path; previously the
        # argument was overwritten unconditionally, so a custom driver could never be used.
        if webdriver_path is None:
            webdriver_path = pkg_resources.resource_filename('paperscraper', 'webdrivers/chromedriver')

        self.webdriver_path = webdriver_path

        # NOTE(review): the driver path is passed positionally, as in the original code; newer selenium
        # releases may expect a Service object instead -- confirm against the pinned selenium version.
        self.driver = webdriver.Chrome(webdriver_path, options=options)

    def __enter__(self):
        """Enters a ``with`` block.

        :return: this PaperScraper instance
        """
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Leaves a ``with`` block and quits the web driver.

        Returns None, so an exception raised inside the block is not suppressed.
        """
        self.driver.quit()

    def __import_all_scrapers(self):
        """Instantiates every supported journal scraper around the shared web driver.

        :return: a list of freshly created scraper objects
        """
        return [ScienceDirect(self.driver), ACS(self.driver), PMC(self.driver), RSC(self.driver)]

    def get_scrapable_websites(self):
        """
        Retrieves all journal urls that are available to be scraped.

        :return: a flat list built from the ``website`` attribute of every scraper
        """
        return [website for scraper in self.__import_all_scrapers() for website in scraper.website]

    def is_scrapable(self, url):
        """
        Checks if a given url is scrapable by PaperScraper

        :param url: a url containing a full text that needs scraping
        :return: the first Scraper whose ``is_correct_url(url)`` is truthy, or None
        """
        for website_scraper in self.__import_all_scrapers():
            if website_scraper.is_correct_url(url):
                return website_scraper
        return None

    def extract_from_url(self, url):
        """
        Extracts the paper located at 'url'.

        :param url: a url containing a full text that needs scraping
        :return: whatever the matching scraper's ``extract(url)`` returns (the full text and meta data of
            the paper), or None if 'url' cannot be scraped
        """
        website_scraper = self.is_scrapable(url)
        if website_scraper is None:
            return None
        return website_scraper.extract(url)

    def extract_from_pmid(self, pmid):
        """
        Attempts to retrieve a paper given its PMID

        The links PubMed lists for the PMID are tried in order; the first one a scraper accepts is extracted.

        :param pmid: the PubMed identifier of the paper
        :return: the extracted paper as returned by the matching scraper, or None if no listed link is
            scrapable
        """
        pm = PubMedAggregator(self.driver)
        all_sites = pm.extract(pmid)
        for site in all_sites.values():
            # Reuse the scraper found by is_scrapable instead of searching for it a second time.
            website_scraper = self.is_scrapable(site['href'])
            if website_scraper is not None:
                return website_scraper.extract(site['href'])

        return None

    def get_sites_from_pmid(self, pmid):
        """
        Retrieves the links PubMed lists for a PMID.

        :param pmid: the PubMed identifier of the paper
        :return: a list with the 'href' entry of every site returned by the PubMed aggregator
            (queried with ``follow_link=True``)
        """
        pm = PubMedAggregator(self.driver)
        all_sites = pm.extract(pmid, follow_link=True)
        return [site['href'] for site in all_sites.values()]

    def get_sites_from_doi(self, doi_num):
        """
        Retrieves the site a DOI points to.

        :param doi_num: the DOI to look up
        :return: a one-element list holding the result of the DOI aggregator's ``extract(doi_num)``
        """
        dm = DOIAggregator()
        extracted_site = dm.extract(doi_num)
        return [extracted_site]