Python scrapy.exceptions.IgnoreRequest() Examples

The following are 30 code examples of scrapy.exceptions.IgnoreRequest(), taken from open source projects. The original project and source file for each example are noted in the line above it. You may also want to check out all other available functions and classes of the module scrapy.exceptions.
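All of the examples follow the same basic pattern: a middleware (or media pipeline / robots.txt handler) raises IgnoreRequest to tell the Scrapy engine to drop a request. As a point of reference, here is a minimal, self-contained sketch of that pattern in a downloader middleware; the class name, the blocked_domains set, and the settings path mentioned below are illustrative assumptions, not code from any of the projects listed.

from urllib.parse import urlparse

from scrapy.exceptions import IgnoreRequest


class BlocklistDownloaderMiddleware:
    """Illustrative sketch: drop requests whose host is on a blocklist."""

    # Hypothetical blocklist; a real middleware would likely read this from settings.
    blocked_domains = {"example.org"}

    def process_request(self, request, spider):
        host = urlparse(request.url).hostname or ""
        if host in self.blocked_domains:
            # Raising IgnoreRequest aborts the request: downloader middlewares'
            # process_exception() methods and the request's errback get a chance
            # to handle it, and if nothing does, it is silently ignored.
            raise IgnoreRequest("Blocked domain: %s" % host)
        return None  # let the request continue through the download chain

To enable such a middleware you would add its import path to DOWNLOADER_MIDDLEWARES in settings.py with an appropriate priority, e.g. {'myproject.middlewares.BlocklistDownloaderMiddleware': 543} (both path and priority are placeholders).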
Example #1
Source File: middlewares.py    From freshonions-torscraper with GNU Affero General Public License v3.0
def process_request(self, request, spider): 
        parsed_url = urlparse.urlparse(request.url)
        
        if not self.test_mode or parsed_url.path not in ["/", ""]:
            return None

        if not Domain.is_onion_url(request.url):
            return None

        d = Domain.find_by_url(request.url)

        if d is None:
            return None

        now = datetime.now()

        if now > d.next_scheduled_check:
            return None
        else:
            raise IgnoreRequest('FilterNotScheduledMiddleware: %s is not scheduled to check' % d.host) 
Example #2
Source File: middlewares.py    From freshonions-torscraper with GNU Affero General Public License v3.0
def process_request(self, request, spider):

        # don't use this middleware while testing whether a site is up
        if hasattr(spider, "test") and spider.test == "yes":
            #logger = logging.getLogger()
            #logger.info("Testing mode, dead domains disabled")
            return None

        if not Domain.is_onion_url(request.url):
            return None

        domain = Domain.find_by_url(request.url)
        if not domain or domain.is_up:
            return None

        raise IgnoreRequest('Domain %s is dead, skipping' % domain.host) 
Example #3
Source File: redirect.py    From learn_python3_spider with MIT License
def _redirect(self, redirected, request, spider, reason):
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1

        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
                [request.url]
            redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + \
                [reason]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                         {'reason': reason, 'redirected': redirected, 'request': request},
                         extra={'spider': spider})
            return redirected
        else:
            logger.debug("Discarding %(request)s: max redirections reached",
                         {'request': request}, extra={'spider': spider})
            raise IgnoreRequest("max redirections reached") 
Example #4
Source File: middlewares.py    From scrapy-autoextract with BSD 3-Clause "New" or "Revised" License
def process_exception(self, request, exception, spider):
        if isinstance(exception, (IgnoreRequest, DropItem)):
            return
        if not self._is_enabled_for_request(request):
            return

        autoextract = request.meta.pop(AUTOEXTRACT_META_KEY)
        stop_time = time.time()
        latency = time.time() - autoextract['timing']['start_ts']
        autoextract['timing'].update({'end_ts': stop_time, 'latency': latency})

        # Make sure to log all unknown failures
        logger.warning('AutoExtract failure after %.3fs for %s: %s',
                       latency,
                       autoextract['original_url'],
                       repr(exception),
                       extra={'spider': spider})

        request.meta['autoextract'] = autoextract
        ex_class = global_object_name(exception.__class__)
        self.inc_metric('autoextract/errors/total_count', spider=spider)
        self.inc_metric('autoextract/errors/type_count/%s' % ex_class, spider=spider) 
Example #5
Source File: shell.py    From learn_python3_spider with MIT License
def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
        if isinstance(request_or_url, Request):
            request = request_or_url
        else:
            url = any_to_uri(request_or_url)
            request = Request(url, dont_filter=True, **kwargs)
            if redirect:
                request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
            else:
                request.meta['handle_httpstatus_all'] = True
        response = None
        try:
            response, spider = threads.blockingCallFromThread(
                reactor, self._schedule, request, spider)
        except IgnoreRequest:
            pass
        self.populate_vars(response, request, spider) 
Example #6
Source File: scraper.py    From learn_python3_spider with MIT License
def _log_download_errors(self, spider_failure, download_failure, request, spider):
        """Log and silence errors that come from the engine (typically download
        errors that got propagated thru here)
        """
        if (isinstance(download_failure, Failure) and
                not download_failure.check(IgnoreRequest)):
            if download_failure.frames:
                logger.error('Error downloading %(request)s',
                             {'request': request},
                             exc_info=failure_to_exc_info(download_failure),
                             extra={'spider': spider})
            else:
                errmsg = download_failure.getErrorMessage()
                if errmsg:
                    logger.error('Error downloading %(request)s: %(errmsg)s',
                                 {'request': request, 'errmsg': errmsg},
                                 extra={'spider': spider})

        if spider_failure is not download_failure:
            return spider_failure 
Example #7
Source File: redirect.py    From learn_python3_spider with MIT License
def _redirect(self, redirected, request, spider, reason):
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1

        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
                [request.url]
            redirected.meta['redirect_reasons'] = request.meta.get('redirect_reasons', []) + \
                [reason]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                         {'reason': reason, 'redirected': redirected, 'request': request},
                         extra={'spider': spider})
            return redirected
        else:
            logger.debug("Discarding %(request)s: max redirections reached",
                         {'request': request}, extra={'spider': spider})
            raise IgnoreRequest("max redirections reached") 
Example #8
Source File: shell.py    From learn_python3_spider with MIT License
def fetch(self, request_or_url, spider=None, redirect=True, **kwargs):
        if isinstance(request_or_url, Request):
            request = request_or_url
        else:
            url = any_to_uri(request_or_url)
            request = Request(url, dont_filter=True, **kwargs)
            if redirect:
                request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
            else:
                request.meta['handle_httpstatus_all'] = True
        response = None
        try:
            response, spider = threads.blockingCallFromThread(
                reactor, self._schedule, request, spider)
        except IgnoreRequest:
            pass
        self.populate_vars(response, request, spider) 
Example #9
Source File: robotstxt.py    From learn_python3_spider with MIT License
def _robots_error(self, failure, netloc):
        if failure.type is not IgnoreRequest:
            key = 'robotstxt/exception_count/{}'.format(failure.type)
            self.crawler.stats.inc_value(key)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = None
        rp_dfd.callback(None) 
Example #10
Source File: middlewares.py    From freshonions-torscraper with GNU Affero General Public License v3.0
def process_request(self, request, spider):
        
        parsed_url = urlparse.urlparse(request.url)
        host = parsed_url.hostname
        if self.counter[host] < self.max_pages:
            self.counter[host] += 1
            spider.logger.info('Page count is %d for %s' % (self.counter[host], host))
            return None                   
        else:
            raise IgnoreRequest('MAX_PAGES_PER_DOMAIN reached, filtered %s' % request.url) 
Example #11
Source File: middlewares.py    From freshonions-torscraper with GNU Affero General Public License v3.0
def process_request(self, request, spider):

        if not Domain.is_onion_url(request.url):
            return None
        parsed_url = urlparse.urlparse(request.url)
        host = parsed_url.hostname
        subdomains = host.count(".")
        if subdomains > 2:
            raise IgnoreRequest('Too many subdomains (%d > 2)' % subdomains)

        return None 
Example #12
Source File: test_pagestorage.py    From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License
def test_process_spider_exception(self):
        assert self.instance.counters == {'all': 0, 'error': 0}
        self.instance.save_response = mock.Mock()
        # all conditions are true
        self.instance.on_error_enabled = True
        self.instance.process_spider_exception(
            'err-response', Exception(), self.spider)
        assert self.instance.counters == {'all': 0, 'error': 1}
        # on_error flag is disabled, skipping
        self.instance.on_error_enabled = False
        self.instance.process_spider_exception(
            'err-response', Exception(), self.spider)
        assert self.instance.counters == {'all': 0, 'error': 1}
        # exceeded error limit
        self.instance.on_error_enabled = True
        self.instance.counters['error'] = 11
        self.instance.process_spider_exception(
            'err-response', Exception(), self.spider)
        assert self.instance.counters == {'all': 0, 'error': 11}
        # skip IgnoreRequest
        self.instance.limits['error'] = 12
        self.instance.process_spider_exception(
            'err-response', IgnoreRequest(), self.spider)
        assert self.instance.counters == {'all': 0, 'error': 11}
        # all conditions are true again
        self.instance.limits['all'] = 12
        self.instance.process_spider_exception(
            'err-response', Exception(), self.spider)
        assert self.instance.counters == {'all': 0, 'error': 12} 
Example #13
Source File: scrapy_pagestorage.py    From scrapy-pagestorage with BSD 3-Clause "New" or "Revised" License
def process_spider_exception(self, response, exception, spider):
        if (self.on_error_enabled and
                not isinstance(exception, IgnoreRequest) and
                self.counters['error'] < self.limits['error']):
            self.counters['error'] += 1
            self.save_response(response, spider) 
Example #14
Source File: middlewares.py    From scrapy-crawl-once with MIT License
def process_request(self, request, spider):
        if not request.meta.get('crawl_once', self.default):
            return
        if self._get_key(request) in self.db:
            self.stats.inc_value('crawl_once/ignored')
            raise IgnoreRequest() 
Example #15
Source File: middlewares.py    From NewsCrawler with MIT License
def process_request(self, request, spider):
        if request.url not in spider.start_urls and (
                redis_conn.hexists(redis_url_key, request.url)
                or redis_conn.hexists(redis_invalid_url_key, request.url)):
            logger.info("Skip URL: %s, has been crawled" % request.url)
            raise IgnoreRequest("URL %s has been crawled" % request.url) 
Example #16
Source File: files.py    From learn_python3_spider with MIT License
def media_failed(self, failure, request, info):
        if not isinstance(failure.value, IgnoreRequest):
            referer = referer_str(request)
            logger.warning(
                'File (unknown-error): Error downloading %(medianame)s from '
                '%(request)s referred in <%(referer)s>: %(exception)s',
                {'medianame': self.MEDIA_NAME, 'request': request,
                 'referer': referer, 'exception': failure.value},
                extra={'spider': info.spider}
            )

        raise FileException 
Example #17
Source File: httpcache.py    From learn_python3_spider with MIT License
def process_request(self, request, spider):
        if request.meta.get('dont_cache', False):
            return

        # Skip uncacheable requests
        if not self.policy.should_cache_request(request):
            request.meta['_dont_cache'] = True  # flag as uncacheable
            return

        # Look for cached response and check if expired
        cachedresponse = self.storage.retrieve_response(spider, request)
        if cachedresponse is None:
            self.stats.inc_value('httpcache/miss', spider=spider)
            if self.ignore_missing:
                self.stats.inc_value('httpcache/ignore', spider=spider)
                raise IgnoreRequest("Ignored request not in cache: %s" % request)
            return  # first time request

        # Return cached response only if not expired
        cachedresponse.flags.append('cached')
        if self.policy.is_cached_response_fresh(cachedresponse, request):
            self.stats.inc_value('httpcache/hit', spider=spider)
            return cachedresponse

        # Keep a reference to cached response to avoid a second cache lookup on
        # process_response hook
        request.meta['cached_response'] = cachedresponse 
Example #18
Source File: middleware.py    From sozlukcrawler with GNU General Public License v2.0
def process_request(self, request, spider):
        if 'x-ignore-request' in request.url:
            raise IgnoreRequest()
        elif 'x-error-request' in request.url:
            _ = 1 / 0 
Example #19
Source File: robotstxt.py    From learn_python3_spider with MIT License
def _logerror(self, failure, request, spider):
        if failure.type is not IgnoreRequest:
            logger.error("Error downloading %(request)s: %(f_exception)s",
                         {'request': request, 'f_exception': failure.value},
                         exc_info=failure_to_exc_info(failure),
                         extra={'spider': spider})
        return failure 
Example #20
Source File: defer.py    From learn_python3_spider with MIT License
def mustbe_deferred(f, *args, **kw):
    """Same as twisted.internet.defer.maybeDeferred, but delay calling
    callback/errback to next reactor loop
    """
    try:
        result = f(*args, **kw)
    # FIXME: Hack to avoid introspecting tracebacks. This to speed up
    # processing of IgnoreRequest errors which are, by far, the most common
    # exception in Scrapy - see #125
    except IgnoreRequest as e:
        return defer_fail(failure.Failure(e))
    except Exception:
        return defer_fail(failure.Failure())
    else:
        return defer_result(result) 
Example #21
Source File: files.py    From learn_python3_spider with MIT License
def media_failed(self, failure, request, info):
        if not isinstance(failure.value, IgnoreRequest):
            referer = referer_str(request)
            logger.warning(
                'File (unknown-error): Error downloading %(medianame)s from '
                '%(request)s referred in <%(referer)s>: %(exception)s',
                {'medianame': self.MEDIA_NAME, 'request': request,
                 'referer': referer, 'exception': failure.value},
                extra={'spider': info.spider}
            )

        raise FileException 
Example #22
Source File: robotstxt.py    From learn_python3_spider with MIT License
def _robots_error(self, failure, netloc):
        if failure.type is not IgnoreRequest:
            key = 'robotstxt/exception_count/{}'.format(failure.type)
            self.crawler.stats.inc_value(key)
        rp_dfd = self._parsers[netloc]
        self._parsers[netloc] = None
        rp_dfd.callback(None) 
Example #23
Source File: robotstxt.py    From learn_python3_spider with MIT License
def _logerror(self, failure, request, spider):
        if failure.type is not IgnoreRequest:
            logger.error("Error downloading %(request)s: %(f_exception)s",
                         {'request': request, 'f_exception': failure.value},
                         exc_info=failure_to_exc_info(failure),
                         extra={'spider': spider})
        return failure 
Example #24
Source File: robotstxt.py    From learn_python3_spider with MIT License
def process_request_2(self, rp, request, spider):
        if rp is None:
            return
        if not rp.can_fetch(to_native_str(self._useragent), request.url):
            logger.debug("Forbidden by robots.txt: %(request)s",
                         {'request': request}, extra={'spider': spider})
            self.crawler.stats.inc_value('robotstxt/forbidden')
            raise IgnoreRequest("Forbidden by robots.txt") 
Example #25
Source File: defer.py    From learn_python3_spider with MIT License
def mustbe_deferred(f, *args, **kw):
    """Same as twisted.internet.defer.maybeDeferred, but delay calling
    callback/errback to next reactor loop
    """
    try:
        result = f(*args, **kw)
    # FIXME: Hack to avoid introspecting tracebacks. This to speed up
    # processing of IgnoreRequest errors which are, by far, the most common
    # exception in Scrapy - see #125
    except IgnoreRequest as e:
        return defer_fail(failure.Failure(e))
    except Exception:
        return defer_fail(failure.Failure())
    else:
        return defer_result(result) 
Example #26
Source File: anti_spider.py    From news_spider with MIT License
def process_request(self, request, spider):
        # Handle WeChat anti-crawling blocks (anti-spider mechanism 1, Sogou)
        if spider.name in ['weixin'] and 'antispider' in request.url:
            # Get the redirect source URLs
            redirect_urls = request.meta['redirect_urls']

            # Clear the invalid cookies
            cookies_id = request.meta['cookiejar']
            del_cookies(spider.name, cookies_id)

            # spider.log(message='AntiSpider cookies_id: %s; url: %s' % (cookies_id, redirect_urls[0]))
            raise IgnoreRequest(
                'Spider: %s, AntiSpider cookies_id: %s; url: %s' % (spider.name, cookies_id, redirect_urls[0])) 
Example #27
Source File: de_duplication_request.py    From news_spider with MIT License
def process_request(self, request, spider):
        if not request.url:
            return None
        channel_id = request.meta.get('channel_id', 0)
        # Handle detail pages (ignore list pages), in coordination with the pipeline
        if is_dup_detail(request.url, spider.name, channel_id):
            raise IgnoreRequest("Spider: %s, DeDuplicationRequest: %s" % (spider.name, request.url)) 
Example #28
Source File: middleware.py    From sozlukcrawler with GNU General Public License v2.0
def process_response(self, request, response, spider):
        if 'x-ignore-response' in request.url:
            raise IgnoreRequest()
        elif 'x-error-response' in request.url:
            _ = 1 / 0
        else:
            return response 
Example #29
Source File: test_middleware.py    From MaybeDont with MIT License
def test_middleware():
    Rq = lambda path: Request(
        'http://example.com{}'.format(path),
        meta={'avoid_dup_content': True})
    Rs = lambda req, body: HtmlResponse(
        req.url, body=body.encode(), request=req)
    mw = AvoidDupContentMiddleware(
        initial_queue_limit=1, threshold=0.5, exploration=0.00)
    spider = Spider()
    req = Rq('/')
    mw.process_request(req, spider)
    mw.process_response(req, Rs(req, ''), spider)
    assert mw.dupe_predictor
    n_dropped = 0
    for i in range(10):
        req = Rq('/viewtopic.php?topic_id={}'.format(i))
        mw.process_request(req, spider)
        mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
        req = Rq('/viewtopic.php?topic_id={}&start=0'.format(i))
        try:
            mw.process_request(req, spider)
        except IgnoreRequest:
            n_dropped += 1
        else:
            mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
        mw.dupe_predictor.log_dupstats(min_dup=0)
    assert n_dropped == 5
    # one request in different order
    req = Rq('/viewtopic.php?topic_id=100&start=0')
    mw.process_request(req, spider)
    mw.process_response(req, Rs(req, ''), spider)
    mw.process_request(Rq('/viewtopic.php?topic_id=200'), spider)
    with pytest.raises(IgnoreRequest):
        mw.process_request(Rq('/viewtopic.php?topic_id=100'), spider)
    # test exploration
    mw.exploration = 0.5
    n_dropped = 0
    n_requests = 0
    for i in range(150, 170):
        req = Rq('/viewtopic.php?topic_id={}'.format(i))
        mw.process_request(req, spider)
        mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
        req = Rq('/viewtopic.php?topic_id={}&start=0'.format(i))
        n_requests += 1
        try:
            mw.process_request(req, spider)
        except IgnoreRequest:
            n_dropped += 1
        else:
            mw.process_response(req, Rs(req, 'Topic {}'.format(i)), spider)
    assert n_dropped > 0
    assert n_dropped < n_requests 
Example #30
Source File: test_middleware.py    From scrapy-crawl-once with MIT License
def test_crawl(tmpdir):
    settings = {'CRAWL_ONCE_PATH': str(tmpdir)}
    crawler = get_crawler(settings_dict=settings)
    req1 = scrapy.Request('http://example.com/1', meta={'crawl_once': True})
    req2 = scrapy.Request('http://example.com/2')
    req3 = scrapy.Request('http://example.com/3', meta={'crawl_once': True})

    resp1 = Response(req1.url, request=req1)
    resp2 = Response(req2.url, request=req2)

    with opened_middleware(crawler) as mw:

        # 1. check spider middleware interface
        assert len(mw.db) == 0
        assert crawler.stats.get_value('crawl_once/initial') == 0
        output = [{}, scrapy.Request('http://example.com')]

        # crawl_once is False
        res = list(mw.process_spider_output(resp2, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 0

        # crawl_once is True
        res = list(mw.process_spider_output(resp1, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0
        assert crawler.stats.get_value('crawl_once/stored') == 1

        # 2. check downloader middleware interface
        assert mw.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0

        with pytest.raises(IgnoreRequest):
            mw.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1

        assert mw.process_request(req3, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0

    crawler = get_crawler(settings_dict=settings)
    with opened_middleware(crawler) as mw2:
        # it reuses the same file, so there are records
        assert len(mw2.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 1
        assert mw2.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0
        with pytest.raises(IgnoreRequest):
            mw2.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert mw2.process_request(req3, crawler.spider) is None