"""
Spider middleware that stores visited pages in a HubStorage collection.
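
To enable the middleware, register it in SPIDER_MIDDLEWARES and switch it on
via settings. A minimal sketch (the middleware path and the numeric values
below are only illustrative, adjust them to your project)::

    SPIDER_MIDDLEWARES = {
        'pagestorage.PageStorageMiddleware': 543,
    }
    PAGE_STORAGE_ENABLED = True             # store pages passed to the spider
    PAGE_STORAGE_LIMIT = 100                # max pages stored per run
    PAGE_STORAGE_ON_ERROR_ENABLED = True    # also store pages whose callbacks failed
    PAGE_STORAGE_ON_ERROR_LIMIT = 10        # max error pages stored per run
    PAGE_STORAGE_MODE = 'VERSIONED_CACHE'   # optional: use the versioned-cache collection
    PAGE_STORAGE_TRIM_HTML = True           # optional: strip surrounding whitespace/NULs

Note that an unset limit reads as 0, so nothing is stored unless the limits
are configured (here or in project defaults).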
"""
import logging
import os

from scrapinghub.hubstorage import ValueTooLarge
from scrapinghub.hubstorage.utils import urlpathjoin
from scrapy.exceptions import NotConfigured, IgnoreRequest
from scrapy.utils.request import request_fingerprint
from scrapy.http import TextResponse
from scrapy import signals
from scrapy.item import DictItem, Field

try:
    from urlparse import parse_qsl  # Python 2
except ImportError:
    from urllib.parse import parse_qsl  # Python 3

logger = logging.getLogger(__name__)
_COLLECTION_NAME = "Pages"


class PageStorageMiddleware:
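    """Spider middleware that saves crawled pages to a HubStorage collection.

    Responses are written through a gzip batch writer, and every item scraped
    from a response is tagged with a ``_cached_page_id`` equal to the request
    fingerprint that the page record is keyed by.
    """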

    @classmethod
    def from_crawler(cls, crawler):
        enabled, on_error_enabled = _get_enabled_status(crawler.settings)
        if enabled or on_error_enabled:
            return cls(crawler)
        raise NotConfigured

    def __init__(self, crawler):
        # FIXME move sh_scrapy.hsref to python-hubstorage and drop it
        try:
            from sh_scrapy.hsref import hsref
            self.hsref = hsref
        except ImportError:
            raise NotConfigured
        settings = crawler.settings
        mode = 'cs'
        if settings.get('PAGE_STORAGE_MODE') == 'VERSIONED_CACHE':
            mode = 'vcs'
        self.trim_html = settings.getbool('PAGE_STORAGE_TRIM_HTML')
        self.enabled, self.on_error_enabled = _get_enabled_status(settings)
        self.limits = {
            'all': settings.getint('PAGE_STORAGE_LIMIT'),
            'error': settings.getint('PAGE_STORAGE_ON_ERROR_LIMIT'),
        }
        self.counters = {
            'all': 0,
            'error': 0,
        }
        self.cookies_seen = set()
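        # Register the target collection in the job metadata and write pages
        # through a gzip-compressed batch writer (20 items per batch).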
        endpoint = urlpathjoin(hsref.project.collections.url,
                               mode, _COLLECTION_NAME)
        logger.info("HubStorage: writing pages to %s", endpoint)
        hsref.job.metadata.apipost('collection',
                                   jl=urlpathjoin(mode, _COLLECTION_NAME))
        self._writer = hsref.client.batchuploader.create_writer(
            endpoint, content_encoding='gzip', size=20)
        crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_closed(self, spider):
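        # Flush any pages still buffered in the batch writer.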
        self._writer.close()

    def process_spider_input(self, response, spider):
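        # Save each response handed to the spider, up to PAGE_STORAGE_LIMIT.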
        if self.enabled and (self.counters['all'] < self.limits['all']):
            self.counters['all'] += 1
            self.save_response(response, spider)

    def process_spider_exception(self, response, exception, spider):
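        # Save the response whose callback raised, up to
        # PAGE_STORAGE_ON_ERROR_LIMIT; IgnoreRequest is not treated as an error.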
        if (self.on_error_enabled and
                not isinstance(exception, IgnoreRequest) and
                self.counters['error'] < self.limits['error']):
            self.counters['error'] += 1
            self.save_response(response, spider)

    def save_response(self, response, spider):
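        # Only text responses are stored; the request fingerprint serves as the
        # collection key for the page record.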
        if isinstance(response, TextResponse):
            fp = request_fingerprint(response.request)
            payload = {
                "_key": fp,
                "_jobid": self.hsref.job.key,
                "_type": "_pageitem",
                "_encoding": response.encoding,
                "url": response.url,
            }
            self._set_cookies(payload, response)

            if response.request.method == 'POST':
                payload["postdata"] = dict(parse_qsl(response.request.body.decode()))

            payload["body"] = response.body_as_unicode()
            if self.trim_html:
                payload['body'] = payload['body'].strip(' \r\n\0')

            if len(payload['body']) > self._writer.maxitemsize:
                spider.logger.warning(
                    "Page not saved, body too large: <%s>", response.url)
                return

            try:
                self._writer.write(payload)
            except ValueTooLarge as exc:
                spider.logger.warning(
                    "Page not saved, %s: <%s>", exc, response.url)

    def process_spider_output(self, response, result, spider):
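        # Tag scraped items with the fingerprint of the originating request,
        # i.e. the key the page was stored under.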
        fp = request_fingerprint(response.request)
        try:
            for r in result:
                if isinstance(r, DictItem):
                    r.fields["_cached_page_id"] = Field()
                    r._values["_cached_page_id"] = fp
                elif isinstance(r, dict):
                    r["_cached_page_id"] = fp
                yield r
        except Exception as exc:
            self.process_spider_exception(response, exc, spider)
            raise

    def _set_cookies(self, payload, response):
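        # Attach only Set-Cookie values not seen earlier in the crawl, so
        # repeated cookies are not stored with every page.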
        cookies = []
        for cookie in [x.split(b';', 1)[0].decode('ISO-8859-1')
                       for x in response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            payload["cookies"] = cookies


def _get_enabled_status(settings):
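    # Page storage is forced on for managed (auto/portia) spiders regardless of
    # the PAGE_STORAGE_ENABLED setting.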
    enabled = settings.getbool('PAGE_STORAGE_ENABLED')
    autospider = (os.environ.get('SHUB_SPIDER_TYPE') in ('auto', 'portia'))
    on_error_enabled = settings.getbool('PAGE_STORAGE_ON_ERROR_ENABLED')
    return (enabled or autospider), on_error_enabled