import json
import os
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.settings.deprecated import check_deprecated_settings
from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.utils.spider import iterate_spider_output
from gerapy import get_logger
from gerapy.server.core.utils import process_request, process_response, process_item

logger = get_logger(__name__)


class SpiderParser(object):
    """
    Spider parser for debugging a single crawl step
    """
    default_callback = 'parse'

    def __init__(self, settings, spider, args):
        """
        init parser
        :param settings: project settings
        :param spider: spider name
        :param args: request args (url, callback, meta, method, body, headers, cookies)
        """
        # collected results, kept per instance to avoid sharing state between runs
        self.items = []
        self.requests = []
        self.response = None
        self.args = args
        self.spider = spider
        self.crawler_process = CrawlerRunner(settings)
        self.spider_loader = self.crawler_process.spider_loader
        self.spidercls = self.spider_loader.load(self.spider)

    def get_callback(self, request):
        """
        get callback from CrawlSpider rules, falling back to the default callback
        :param request:
        :return: callback name or callable
        """
        if getattr(self.spidercls, 'rules', None):
            rules = self.spidercls.rules
            for rule in rules:
                if rule.link_extractor.matches(request.url):
                    return rule.callback
        return self.default_callback

    def run_callback(self, response, cb):
        """
        run callback and collect the items and requests it yields
        :param response:
        :param cb: callback to run
        :return: items, requests
        """
        items, requests = [], []
        for x in iterate_spider_output(cb(response)):
            if isinstance(x, (BaseItem, dict)):
                items.append(x)
            elif isinstance(x, Request):
                requests.append(x)
        return items, requests

    def prepare_request(self, spider, request, args):
        """
        build the request for this debug step
        :param spider:
        :param request:
        :param args:
        :return: prepared request
        """

        def callback(response):
            """
            wrap the real callback of the request to collect follow-up items and requests
            :param response:
            :return:
            """
            # if no callback is given, use the default parse callback
            cb = self.args.callback or self.default_callback
            # resolve a callback name to the spider method of the same name
            if not callable(cb):
                cb_method = getattr(spider, cb, None)
                if callable(cb_method):
                    cb = cb_method
            # run the real callback to get items and follow-up requests
            items, requests = self.run_callback(response, cb)
            # attach callbacks to the follow-up requests
            for request in requests:
                request.callback = self.get_callback(request)
                request.meta['callback'] = request.callback
            # process items, requests and response into serializable results
            self.items += [process_item(item) for item in items]
            self.requests += [process_request(request) for request in requests]
            self.response = process_response(response)

        # update meta
        if args.meta:
            request.meta.update(args.meta)
        # update method
        request.method = args.method if args.method else request.method
        # update request body for POST or other non-GET methods
        if request.method.lower() != 'get':
            # serialize a dict body as JSON, otherwise pass the body through as-is
            if isinstance(args.body, dict):
                request = request.replace(body=json.dumps(args.body))
            else:
                request = request.replace(body=args.body)
        # update headers
        request.headers = args.headers if args.headers else request.headers
        # update cookies
        request.cookies = args.cookies if args.cookies else request.cookies
        # update dont_filter
        request.dont_filter = args.filter if hasattr(args, 'filter') else request.dont_filter
        # update priority
        request.priority = int(args.priority) if hasattr(args, 'priority') else request.priority
        # update callback
        request.callback = callback
        return request
    def run(self):
        """
        run the crawl for the single prepared request and collect results
        :return: dict containing items, requests, response and status flag
        """
        request = Request(self.args.url, callback=None)
        # replace start_requests so the spider only issues the single debug request
        start_requests = lambda spider: [self.prepare_request(spider, request, self.args)]
        self.spidercls.start_requests = start_requests
        self.crawler_process.crawl(self.spidercls)
        if not self.crawler_process.crawlers:
            return {'ok': False}
        # keep a reference to the crawler that was just scheduled
        self.pcrawler = list(self.crawler_process.crawlers)[0]
        d = self.crawler_process.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()
        return {
            'items': self.items,
            'requests': self.requests,
            'response': self.response,
            'ok': True
        }


def get_follow_requests_and_items(project_path, spider_name, args):
    """
    get follow-up requests and items for one crawl step
    :param project_path: project path
    :param spider_name: spider name
    :param args: request args
    :return: dict of items, requests and response
    """
    work_cwd = os.getcwd()
    try:
        # change to the project dir so its settings and spiders can be loaded
        os.chdir(project_path)
        settings = get_project_settings()
        check_deprecated_settings(settings)
        sp = SpiderParser(settings, spider_name, args)
        results = sp.run()
        return results
    finally:
        os.chdir(work_cwd)


def get_start_requests(project_path, spider_name):
    """
    get start requests
    :param project_path: project path
    :param spider_name: spider name
    :return: dict of start requests
    """
    work_cwd = os.getcwd()
    try:
        # change work dir
        os.chdir(project_path)
        # load settings
        settings = get_project_settings()
        check_deprecated_settings(settings)
        runner = CrawlerRunner(settings=settings)
        # add crawler
        spider_cls = runner.spider_loader.load(spider_name)
        runner.crawl(spider_cls)
        # get crawler
        crawler = list(runner.crawlers)[0]
        # get spider by crawler
        spider = crawler.spider
        # get start requests
        requests = list(spider.start_requests())
        if not requests and hasattr(spider, 'start'):
            requests = list(spider.start())
        requests = [process_request(r) for r in requests]
        return {'finished': True, 'requests': requests}
    finally:
        os.chdir(work_cwd)
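

# Example usage (a minimal sketch): the project path, spider name and URL below
# are hypothetical placeholders, and ``demo_args`` only mirrors the attributes
# that SpiderParser reads (url, callback, meta, method, body, headers, cookies).
# A real Scrapy project must exist at the given path for either helper to work.
if __name__ == '__main__':
    from types import SimpleNamespace

    demo_args = SimpleNamespace(url='http://quotes.toscrape.com/', callback='parse',
                                meta=None, method=None, body=None, headers=None,
                                cookies=None)
    # list the spider's start requests without running the reactor
    print(get_start_requests('/path/to/scrapy/project', 'quotes'))
    # run a single step and print the items, follow-up requests and response
    print(get_follow_requests_and_items('/path/to/scrapy/project', 'quotes', demo_args))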