Python scrapy.crawler.CrawlerRunner() Examples

The following are 11 code examples of scrapy.crawler.CrawlerRunner(), taken from open-source projects. The source file, project, and license are noted above each example.
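For orientation, here is a minimal, self-contained sketch of the usual CrawlerRunner pattern as described in the Scrapy documentation; the spider below is a placeholder written for this page, not taken from any of the projects:

import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['https://example.com']

    def parse(self, response):
        yield {'title': response.css('title::text').get()}


configure_logging()
runner = CrawlerRunner()                 # a settings dict or Settings object could be passed here
d = runner.crawl(ExampleSpider)          # returns a Deferred that fires when the crawl ends
d.addBoth(lambda _: reactor.stop())      # stop the reactor once the crawl finishes
reactor.run()                            # block until then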
Example #1
Source File: spiders.py    From autologin with Apache License 2.0
def crawl_runner(extra_settings=None):
    settings = base_settings.copy()
    if extra_settings is not None:
        settings.update(extra_settings, priority='cmdline')
    if settings.get('SPLASH_URL'):
        settings['DUPEFILTER_CLASS'] = 'scrapy_splash.SplashAwareDupeFilter'
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression'
                '.HttpCompressionMiddleware': 810,
        })
    else:
        settings.setdefault('DOWNLOADER_MIDDLEWARES', {}).update({
            'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
            'autologin.middleware.ExposeCookiesMiddleware': 700,
        })
    return CrawlerRunner(settings) 
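A hypothetical caller (not shown in the original file) might use crawl_runner() roughly like this; SomeSpider and the Splash URL are placeholders:

from twisted.internet import reactor

# passing a Splash URL selects the scrapy_splash middleware configuration above
runner = crawl_runner({'SPLASH_URL': 'http://localhost:8050'})
d = runner.crawl(SomeSpider)             # SomeSpider is a placeholder spider class
d.addBoth(lambda _: reactor.stop())
reactor.run()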
Example #2
Source File: scheduler.py    From haipproxy with MIT License
def crawler_start(usage, tasks):
    """Start specified spiders or validators from cmd with scrapy core api.
    There are four kinds of spiders: common, ajax, gfw, ajax_gfw. If you don't
    assign any tasks, all these spiders will run.
    """
    if usage == 'crawler':
        maps = CRAWLER_TASK_MAPS
        origin_spiders = DEFAULT_CRAWLERS
    else:
        maps = TEMP_TASK_MAPS
        origin_spiders = DEFAULT_VALIDATORS

    if not tasks:
        spiders = origin_spiders
    else:
        spiders = list()
        cases = list(map(BaseCase, origin_spiders))
        for task in tasks:
            for case in cases:
                if case.check(task, maps):
                    spiders.append(case.spider)
                    break
            else:
                # crawler_logger.warning('spider task {} is an invalid task, the allowed tasks are {}'.format(
                #     task, list(maps.keys())))
                pass
    if not spiders:
        #crawler_logger.warning('no spider starts up, please check your task input')
        return

    settings = get_project_settings()
    configure_logging(settings)
    runner = CrawlerRunner(settings)
    for spider in spiders:
        runner.crawl(spider)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run() 
Example #3
Source File: online.py    From scrapy-cluster with MIT License
def test_crawler_process(self):
        runner = CrawlerRunner(self.settings)
        d = runner.crawl(CustomSpider)
        d.addBoth(lambda _: reactor.stop())
        # add crawl to redis
        key = "test-spider:dmoztools.net:queue"
        self.redis_conn.zadd(key, self.example_feed, -99)

        # run the spider, give 20 seconds to see the url, crawl it,
        # and send to kafka. Then we kill the reactor
        def thread_func():
            time.sleep(20)
            reactor.stop()

        thread = threading.Thread(target=thread_func)
        thread.start()
        reactor.run()

        message_count = 0
        m = next(self.consumer)

        if m is None:
            pass
        else:
            the_dict = json.loads(m.value)
            if the_dict is not None and the_dict['appid'] == 'test' \
                    and the_dict['crawlid'] == 'abc12345':
                message_count += 1

        self.assertEquals(message_count, 1) 
Example #4
Source File: full_analysis.py    From jd_analysis with GNU Lesser General Public License v3.0
def runspider(self):
        configure_logging(install_root_handler = False)
        s = get_project_settings()
        runner = CrawlerRunner(settings = s)

        @defer.inlineCallbacks
        def crawl(**spargs):
            yield runner.crawl(JDItemInfoSpider, **spargs)
            yield runner.crawl(JDCommentSpider, **spargs)
            reactor.stop()

        crawl(**self.spargs)
        reactor.run()  # the script will block here until the last crawl call is finished

    # schedule analysis 
Example #5
Source File: run.py    From PythonScrapyBasicSetup with MIT License
def run():
    configure_logging()
    # importing project settings for further usage
    # mainly because of the middlewares
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # running spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run() # block until the last call 
Example #6
Source File: test.py    From learn_python3_spider with MIT License
def get_crawler(spidercls=None, settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used to populate the crawler settings with a project level
    priority.
    """
    from scrapy.crawler import CrawlerRunner
    from scrapy.spiders import Spider

    runner = CrawlerRunner(settings_dict)
    return runner.create_crawler(spidercls or Spider) 
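For illustration only (not part of the original test helpers), get_crawler() could back a small unit test such as the following; the setting value is an assumption:

def test_log_level_setting():
    crawler = get_crawler(settings_dict={'LOG_LEVEL': 'WARNING'})
    assert crawler.settings.get('LOG_LEVEL') == 'WARNING'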
Example #7
Source File: pipelines.py    From company2vec with MIT License
def return_spider_output(output):
    """
    Turns scrapy output into a list of dictionaries
    :param output: items scraped by CrawlerRunner
    :type output: list

    :return: list of item dicts (JSON-serializable)
    """
    # this just turns items into dictionaries
    return [dict(item) for item in output] 
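For illustration only, the snippet below is a hedged sketch of how such a helper could be fed: items are collected through Scrapy's item_scraped signal and then converted with return_spider_output(). The spider class and the collection mechanics are assumptions, not taken from the company2vec project:

from twisted.internet import reactor
from scrapy import signals
from scrapy.crawler import CrawlerRunner

collected = []

def _collect(item, response, spider):
    collected.append(item)

runner = CrawlerRunner()
crawler = runner.create_crawler(MySpider)      # MySpider is a placeholder
crawler.signals.connect(_collect, signal=signals.item_scraped)
d = runner.crawl(crawler)
d.addCallback(lambda _: return_spider_output(collected))
d.addBoth(lambda _: reactor.stop())
reactor.run()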
Example #8
Source File: parser.py    From Gerapy with MIT License
def __init__(self, settings, spider, args):
        """
        init parser
        :param settings:
        :param spider:
        :param args:
        """
        self.args = args
        self.spider = spider
        self.crawler_process = CrawlerRunner(settings)
        self.spider_loader = self.crawler_process.spider_loader
        self.spidercls = self.spider_loader.load(self.spider) 
Example #9
Source File: parser.py    From Gerapy with MIT License
def get_start_requests(project_path, spider_name):
    """
    get start requests
    :param project_path: project path
    :param spider_name: spider name
    :return:
    """
    work_cwd = os.getcwd()
    try:
        # change work dir
        os.chdir(project_path)
        # load settings
        settings = get_project_settings()
        check_deprecated_settings(settings)
        runner = CrawlerRunner(settings=settings)
        # add crawler
        spider_cls = runner.spider_loader.load(spider_name)
        runner.crawl(spider_cls)
        # get crawler
        crawler = list(runner.crawlers)[0]
        # get spider by crawler
        spider = crawler.spider
        # get start requests
        requests = list(spider.start_requests())
        if not requests and hasattr(spider, 'start'):
            requests = list(spider.start())
        requests = list(map(lambda r: process_request(r), requests))
        return {'finished': True, 'requests': requests}
    finally:
        os.chdir(work_cwd) 
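A hypothetical call (the project path and spider name are placeholders) would look like:

result = get_start_requests('/path/to/scrapy/project', 'example_spider')
print(result['requests'])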
Example #10
Source File: crawler.py    From fp-server with MIT License
def init_crawler_runner():
    crochet.setup()
    init_scrapy_env()
    settings = get_project_settings()
    global CRAWLER_RUNNER
    CRAWLER_RUNNER = CrawlerRunner(settings)
    logger.info('Initialized crawler runner: %s' % CRAWLER_RUNNER)


# TODO: move these to config file? 
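A hedged sketch of how the global CRAWLER_RUNNER might then be driven from synchronous code through crochet; the timeout and spider class are assumptions, not taken from fp-server:

import crochet

@crochet.wait_for(timeout=60.0)        # block the caller for at most 60 seconds (assumed value)
def schedule_crawl(spider_cls, **kwargs):
    # runs inside the reactor thread started by crochet.setup()
    return CRAWLER_RUNNER.crawl(spider_cls, **kwargs)

# schedule_crawl(SomeSpider)           # SomeSpider is a placeholder spider class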
Example #11
Source File: run.py    From openslack-crawler with Apache License 2.0
def run_spider2(spider, *args):
    configure_logging()
    runner = CrawlerRunner(get_project_settings())
    runner.crawl(spider, *args)
    runner.crawl(spider, *args)
    d = runner.join()
    # d = runner.crawl(spider, *args)
    d.addBoth(lambda _: reactor.stop())

    reactor.run()  # the script will block here until all crawling jobs are finished