python source code of scrape

#
# Copyright 2019 Micah Cochran
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# internal libs
import datetime
from pathlib import Path
import sys
# for mypy
from typing import Callable, Dict, IO, List, Optional, Tuple, Union

# external libs
import extruct
import isodate
import requests
import validators


_PACKAGE_PATH = Path(__file__).resolve().parent

# read version from VERSION file
__version__ = (_PACKAGE_PATH / 'VERSION').read_text().strip()


# Follow RFC 7231 sec. 5.5.3
USER_AGENT_STR = 'scrape-schema-recipe/{} requests/{}'.format(
                                            __version__, requests.__version__)


def scrape(location: Union[str, IO[str]],
           python_objects: Union[bool, List, Tuple] = False,
           nonstandard_attrs: bool = False, migrate_old_schema: bool = True,
           user_agent_str: Optional[str] = None) -> List[Dict]:
    """
    Parse data in https://schema.org/Recipe format into a list of dictionaries
    representing the recipe data.

    Parameters
    ----------
    location : string or file-like object
        A url, filename, or text_string of HTML, or a file-like object.

    python_objects : bool, list, tuple  (optional)
        when True it translates certain data types into python objects
          dates into datetime.date, datetimes into datetime.datetimes,
          durations as dateime.timedelta.
        when set to a list or tuple only converts types specified to
          python objects:
          when set to either [dateime.date] or [datetime.datetimes] either will
            convert dates.
          when set to [datetime.timedelta] durations will be converted
        when False no conversion is performed
        (defaults to False)

    nonstandard_attrs : bool, optional
        when True it adds nonstandard (for schema.org/Recipe) attributes to the
        resulting dictionaries, that are outside the specification such as:
            '_format' is either 'json-ld' or 'microdata' (how schema.org/Recipe was encoded into HTML)
            '_source_url' is the source url, when 'url' has already been defined as another value
        (defaults to False)

    migrate_old_schema : bool, optional
        when True it migrates the schema from older version to current version
        (defaults to True)

    user_agent_str : string, optional
        overide the user_agent_string with this value.
        (defaults to None)

    Returns
    -------
    list
        a list of dictionaries in the style of schema.org/Recipe JSON-LD
        no results - an empty list will be returned
    """

    data = {}  # type: Dict[str, List[Dict]]

    if not user_agent_str:
        user_agent_str = USER_AGENT_STR

    # make sure that one and only are defined
    url = None
    if isinstance(location, str):
        # Is this a url?
        if validators.url(location):
            return scrape_url(location, python_objects=python_objects,
                              nonstandard_attrs=nonstandard_attrs,
                              user_agent_str=user_agent_str)

        # Is this is is a very long string? Perhaps it has HTML content.
        elif len(location) > 255:
            data = extruct.extract(location)

        # Maybe it is a filename?
        else:
            with open(location) as f:
                data = extruct.extract(f.read())
    elif hasattr(location, 'read'):
        # Assume this is some kind of file-like object that can be read.
        data = extruct.extract(location.read())
    else:
        raise TypeError(
                  'location type "{}" is not a string for a url, filename, or '
                  'text_string of the HTML, or a file-like object.'.format(
                      type(location)))

    scrapings = _convert_to_scrapings(data, nonstandard_attrs, url=url)

    if migrate_old_schema is True:
        scrapings = _migrate_old_schema(scrapings)

    if python_objects is not False:
        scrapings = _pythonize_objects(scrapings, python_objects)

    return scrapings


def load(fp: Union[str, IO[str], Path],
         python_objects: Union[bool, List, Tuple] = False,
         nonstandard_attrs: bool = False,
         migrate_old_schema: bool = True) -> List[Dict]:
    """load a filename or file object to scrape

    Parameters
    ----------
    fp : string or file-like object
        A file name or a file-like object.

    python_objects : bool, list, tuple  (optional)
        when True it translates certain data types into python objects
          dates into datetime.date, datetimes into datetime.datetimes,
          durations as dateime.timedelta.
        when set to a list or tuple only converts types specified to
          python objects:
          when set to either [dateime.date] or [datetime.datetimes] either will
            convert dates.
          when set to [datetime.timedelta] durations will be converted
        when False no conversion is performed
        (defaults to False)

    nonstandard_attrs : bool, optional
        when True it adds nonstandard (for schema.org/Recipe) attributes to the
        resulting dictionaries, that are outside the specification such as:
            '_format' is either 'json-ld' or 'microdata' (how schema.org/Recipe was encoded into HTML)
            '_source_url' is the source url, when 'url' has already been defined as another value
        (defaults to False)

    migrate_old_schema : bool, optional
        when True it migrates the schema from older version to current version
        (defaults to True)

    Returns
    -------
    list
        a list of dictionaries in the style of schema.org/Recipe JSON-LD
        no results - an empty list will be returned

    """

    data = {}  # type: Dict[str, List[Dict]]

    if isinstance(fp, str):
        with open(fp) as f:
            data = extruct.extract(f.read())
    elif isinstance(fp, Path):
        data = extruct.extract(fp.read_text())
    elif hasattr(fp, 'read'):
        # Assume this is some kind of file-like object that can be read.
        data = extruct.extract(fp.read())
    else:
        err_msg = 'expected, fp to be a filename, pathlib.Path object, ' \
            'or a file-like object, fp is of type {}'.format(type(fp))
        raise TypeError(err_msg)

    scrapings = _convert_to_scrapings(data, nonstandard_attrs)

    if migrate_old_schema is True:
        scrapings = _migrate_old_schema(scrapings)

    if python_objects is not False:
        scrapings = _pythonize_objects(scrapings, python_objects)

    return scrapings


def loads(string: str, python_objects: Union[bool, List, Tuple] = False,
          nonstandard_attrs: bool = False,
          migrate_old_schema: bool = True) -> List[Dict]:
    """scrapes a string

    Parameters
    ----------
    string : string
        A text string of HTML.

    python_objects : bool, list, tuple  (optional)
        when True it translates certain data types into python objects
          dates into datetime.date, datetimes into datetime.datetimes,
          durations as dateime.timedelta.
        when set to a list or tuple only converts types specified to
          python objects:
          when set to either [dateime.date] or [datetime.datetimes] either will
            convert dates.
          when set to [datetime.timedelta] durations will be converted
        when False no conversion is performed
        (defaults to False)

    nonstandard_attrs : bool, optional
        when True it adds nonstandard (for schema.org/Recipe) attributes to the
        resulting dictionaries, that are outside the specification such as:
            '_format' is either 'json-ld' or 'microdata' (how schema.org/Recipe was encoded into HTML)
            '_source_url' is the source url, when 'url' has already been defined as another value
        (defaults to False)

    migrate_old_schema : bool, optional
        when True it migrates the schema from older version to current version
        (defaults to True)

    Returns
    -------
    list
        a list of dictionaries in the style of schema.org/Recipe JSON-LD
        no results - an empty list will be returned

    """

    if not isinstance(string, str):
        raise TypeError('string is type "{}", a string was expected'
                        ''.format(type(string)))

    data = {}  # type: Dict[str, List[Dict]]
    data = extruct.extract(string)
    scrapings = _convert_to_scrapings(data, nonstandard_attrs)

    if migrate_old_schema is True:
        scrapings = _migrate_old_schema(scrapings)

    if python_objects is not False:
        scrapings = _pythonize_objects(scrapings, python_objects)

    return scrapings


def scrape_url(url: str, python_objects: Union[bool, List, Tuple] = False,
               nonstandard_attrs: bool = False,
               migrate_old_schema: bool = True,
               user_agent_str: str = None) -> List[Dict]:
    """scrape from a URL

    Parameters
    ----------
    url : string
        A url to download data from and scrape.

    python_objects : bool, list, tuple  (optional)
        when True it translates certain data types into python objects
          dates into datetime.date, datetimes into datetime.datetimes,
          durations as dateime.timedelta.
        when set to a list or tuple only converts types specified to
          python objects:
          when set to either [dateime.date] or [datetime.datetimes] either will
            convert dates.
          when set to [datetime.timedelta] durations will be converted
        when False no conversion is performed
        (defaults to False)

    nonstandard_attrs : bool, optional
        when True it adds nonstandard (for schema.org/Recipe) attributes to the
        resulting dictionaries, that are outside the specification such as:
            '_format' is either 'json-ld' or 'microdata' (how schema.org/Recipe was encoded into HTML)
            '_source_url' is the source url, when 'url' has already been defined as another value
        (defaults to False)

    migrate_old_schema : bool, optional
        when True it migrates the schema from older version to current version
        (defaults to True)

    user_agent_str : string, optional
        overide the user_agent_string with this value.
        (defaults to None)

    Returns
    -------
    list
        a list of dictionaries in the style of schema.org/Recipe JSON-LD
        no results - an empty list will be returned


    """

    if not isinstance(url, str):
        raise TypeError('url is type "{}", a string was expected'
                        ''.format(type(url)))

    data = {}  # type: Dict[str, List[Dict]]
    if not user_agent_str:
        user_agent_str = USER_AGENT_STR

    r = requests.get(url, headers={'User-Agent': user_agent_str})
    r.raise_for_status()
    data = extruct.extract(r.text, r.url)
    url = r.url

    scrapings = _convert_to_scrapings(data, nonstandard_attrs, url=url)

    if migrate_old_schema is True:
        scrapings = _migrate_old_schema(scrapings)

    if python_objects is not False:
        scrapings = _pythonize_objects(scrapings, python_objects)

    return scrapings


def _convert_json_ld_recipe(rec: Dict,
                            nonstandard_attrs: bool = False,
                            url: str = None) -> Dict:
    """Helper function for _convert_to_scraping
    for a json-ld record adding extra tags"""
    # not sure if a copy is necessary?
    d = rec.copy()
    if nonstandard_attrs is True:
        d['_format'] = 'json-ld'
    # store the url
    if url:
        if d.get('url') and d.get('url') != url and nonstandard_attrs is True:
            d['_source_url'] = url
        else:
            d['url'] = url
    return d


def _convert_to_scrapings(data: Dict[str, List[Dict]],
                          nonstandard_attrs: bool = False,
                          url: str = None) -> List[Dict]:
    """dectects schema.org/Recipe content and extracts it"""
    out = []
    if data['json-ld'] != []:
        for rec in data['json-ld']:
            if rec.get('@type') == 'Recipe':
                d = _convert_json_ld_recipe(rec, nonstandard_attrs, url)
                out.append(d)

            if rec.get('@context') == 'https://schema.org' and '@graph' in rec.keys():
                # walk the graph
                for subrec in rec['@graph']:
                    if subrec['@type'] == 'Recipe':
                        d = _convert_json_ld_recipe(subrec, nonstandard_attrs, url)
                        out.append(d)

    if data['microdata'] != []:
        for rec in data['microdata']:
            if rec['type'] in ('http://schema.org/Recipe',
                               'https://schema.org/Recipe'):
                d = rec['properties'].copy()
                if nonstandard_attrs is True:
                    d['_format'] = 'microdata'
                # add @context and @type for conversion to the JSON-LD
                # style format
                if rec['type'][:6] == 'https:':
                    d['@context'] = 'https://schema.org'
                else:
                    d['@context'] = 'http://schema.org'
                d['@type'] = 'Recipe'

                # store the url
                if url:
                    if d.get('url') and nonstandard_attrs is True:
                        d['_source_url'] = url
                    else:
                        d['url'] = url

                for key in d.keys():
                    if isinstance(d[key], dict) and 'type' in d[key]:
                        type_ = d[key].pop('type')
                        d[key]['@type'] = type_.split('/')[3]

                out.append(d)

    return out


# properties that will be passed into datetime objects
DATETIME_PROPERTIES = frozenset(['dateCreated', 'dateModified',
                                 'datePublished', 'expires'])
DURATION_PROPERTIES = frozenset(['cookTime', 'performTime', 'prepTime',
                                 'totalTime', 'timeRequired'])


def _parse_determine_date_datetime(s: str) -> Union[datetime.datetime,
                                                    datetime.date]:
    """Parse function parses a date, if time is included it parses as a
    datetime.
    """
    if sys.version_info >= (3, 7):
        # Check if the date includes time.
        if 'T' in s:
            return datetime.datetime.fromisoformat(s)
        else:
            return datetime.date.fromisoformat(s)
    else:
        # Check if the date includes time.
        if 'T' in s:
            return isodate.parse_datetime(s)
        else:
            return isodate.parse_date(s)


# Test if lists/tuples have contain matching items
def _have_matching_items(lst1: Union[bool, List, Tuple],
                         lst2: Union[bool, List, Tuple]):
    if isinstance(lst1, bool):
        return lst1

    if isinstance(lst2, bool):
        return lst2

    s = set(lst1).intersection(lst2)
    return len(s) > 0


def _pythonize_objects(scrapings: List[Dict], python_objects: Union[bool,
                       List, Tuple]) -> List[Dict]:

    if python_objects is False:
        # this really should not be happening
        return scrapings

    # this should work, mypy gives error, this isn't bulletproof code
    if python_objects is True or datetime.timedelta in python_objects:  # type: ignore
        # convert ISO 8601 date times into timedelta
        scrapings = _convert_properties_scrape(scrapings, DURATION_PROPERTIES,
                                               isodate.parse_duration)

    if python_objects is True or _have_matching_items((datetime.date, datetime.datetime), python_objects):
        # convert ISO 8601 date times into datetimes.datetime objects
        scrapings = _convert_properties_scrape(scrapings, DATETIME_PROPERTIES,
                                               _parse_determine_date_datetime)

    return scrapings


def _convert_properties_scrape(recipes: List[Dict], properties: frozenset,
                               function: Callable[[str], Union[datetime.datetime, datetime.date]]) -> List[Dict]:
    for i in range(len(recipes)):
        key_set = set(recipes[i].keys())
        for p in key_set.intersection(properties):
                try:
                    recipes[i][p] = function(recipes[i][p])
                except (isodate.ISO8601Error, ValueError):
                    # parse error, just leave the value as is
                    pass

    return recipes


def _migrate_old_schema(recipes: List[Dict]) -> List[Dict]:
    """Migrate old schema.org/Recipe version to current schema version."""
    for i in range(len(recipes)):
        # rename 'ingredients' to 'recipeIngredient'
        if 'ingredients' in recipes[i]:
            recipes[i]['recipeIngredient'] = recipes[i].pop('ingredients')

    return recipes