import json
import os
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.settings.deprecated import check_deprecated_settings
from scrapy.http import Request
from scrapy.item import BaseItem
from scrapy.utils.spider import iterate_spider_output
from gerapy import get_logger
from gerapy.server.core.utils import process_request, process_response, process_item

logger = get_logger(__name__)


class SpiderParser(object):
    """
    Spider parser for debugging a single crawl step
    """
    default_callback = 'parse'

    def __init__(self, settings, spider, args):
        """
        init parser
        :param settings: project settings
        :param spider: spider name
        :param args: request args (url, callback, meta, method, body, headers, cookies)
        """
        # collected results, kept per instance to avoid sharing state between runs
        self.items = []
        self.requests = []
        self.response = None
        self.args = args
        self.spider = spider
        self.crawler_process = CrawlerRunner(settings)
        self.spider_loader = self.crawler_process.spider_loader
        self.spidercls = self.spider_loader.load(self.spider)

    def get_callback(self, request):
        """
        get callback from CrawlSpider rules, falling back to the default callback
        :param request:
        :return: callback name or callable
        """
        if getattr(self.spidercls, 'rules', None):
            rules = self.spidercls.rules
            for rule in rules:
                if rule.link_extractor.matches(request.url):
                    return rule.callback
        return self.default_callback

    def run_callback(self, response, cb):
        """
        run callback and collect the items and requests it yields
        :param response:
        :param cb: callback to run
        :return: items, requests
        """
        items, requests = [], []
        for x in iterate_spider_output(cb(response)):
            if isinstance(x, (BaseItem, dict)):
                items.append(x)
            elif isinstance(x, Request):
                requests.append(x)
        return items, requests

    def prepare_request(self, spider, request, args):
        """
        build the request for this debug step
        :param spider:
        :param request:
        :param args:
        :return: prepared request
        """

        def callback(response):
            """
            wrap the real callback of the request to collect follow-up items and requests
            :param response:
            :return:
            """
            # if no callback is given, use the default parse callback
            cb = self.args.callback or self.default_callback
            # resolve a callback name to the spider method of the same name
            if not callable(cb):
                cb_method = getattr(spider, cb, None)
                if callable(cb_method):
                    cb = cb_method
            # run the real callback to get items and follow-up requests
            items, requests = self.run_callback(response, cb)
            # attach callbacks to the follow-up requests
            for request in requests:
                request.callback = self.get_callback(request)
                request.meta['callback'] = request.callback
            # process items, requests and response into serializable results
            self.items += [process_item(item) for item in items]
            self.requests += [process_request(request) for request in requests]
            self.response = process_response(response)

        # update meta
        if args.meta:
            request.meta.update(args.meta)
        # update method
        request.method = args.method if args.method else request.method
        # update request body for POST or other non-GET methods
        if request.method.lower() != 'get':
            # serialize a dict body as JSON, otherwise pass the body through as-is
            if isinstance(args.body, dict):
                request = request.replace(body=json.dumps(args.body))
            else:
                request = request.replace(body=args.body)
        # update headers
        request.headers = args.headers if args.headers else request.headers
        # update cookies
        request.cookies = args.cookies if args.cookies else request.cookies
        # update dont_filter
        request.dont_filter = args.filter if hasattr(args, 'filter') else request.dont_filter
        # update priority
        request.priority = int(args.priority) if hasattr(args, 'priority') else request.priority
        # update callback
        request.callback = callback
        return request
    def run(self):
        """
        run the crawl for the single prepared request and collect results
        :return: dict containing items, requests, response and status flag
        """
        request = Request(self.args.url, callback=None)
        # replace start_requests so the spider only issues the single debug request
        start_requests = lambda spider: [self.prepare_request(spider, request, self.args)]
        self.spidercls.start_requests = start_requests
        self.crawler_process.crawl(self.spidercls)
        if not self.crawler_process.crawlers:
            return {'ok': False}
        # keep a reference to the crawler that was just scheduled
        self.pcrawler = list(self.crawler_process.crawlers)[0]
        d = self.crawler_process.join()
        d.addBoth(lambda _: reactor.stop())
        reactor.run()
        return {
            'items': self.items,
            'requests': self.requests,
            'response': self.response,
            'ok': True
        }


def get_follow_requests_and_items(project_path, spider_name, args):
    """
    get follow-up requests and items for one crawl step
    :param project_path: project path
    :param spider_name: spider name
    :param args: request args
    :return: dict of items, requests and response
    """
    work_cwd = os.getcwd()
    try:
        # change to the project dir so its settings and spiders can be loaded
        os.chdir(project_path)
        settings = get_project_settings()
        check_deprecated_settings(settings)
        sp = SpiderParser(settings, spider_name, args)
        results = sp.run()
        return results
    finally:
        os.chdir(work_cwd)


def get_start_requests(project_path, spider_name):
    """
    get start requests
    :param project_path: project path
    :param spider_name: spider name
    :return: dict of start requests
    """
    work_cwd = os.getcwd()
    try:
        # change work dir
        os.chdir(project_path)
        # load settings
        settings = get_project_settings()
        check_deprecated_settings(settings)
        runner = CrawlerRunner(settings=settings)
        # add crawler
        spider_cls = runner.spider_loader.load(spider_name)
        runner.crawl(spider_cls)
        # get crawler
        crawler = list(runner.crawlers)[0]
        # get spider by crawler
        spider = crawler.spider
        # get start requests
        requests = list(spider.start_requests())
        if not requests and hasattr(spider, 'start'):
            requests = list(spider.start())
        requests = [process_request(r) for r in requests]
        return {'finished': True, 'requests': requests}
    finally:
        os.chdir(work_cwd)
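

# Example usage (a minimal sketch): the project path, spider name and URL below
# are hypothetical placeholders, and ``demo_args`` only mirrors the attributes
# that SpiderParser reads (url, callback, meta, method, body, headers, cookies).
# A real Scrapy project must exist at the given path for either helper to work.
if __name__ == '__main__':
    from types import SimpleNamespace

    demo_args = SimpleNamespace(url='http://quotes.toscrape.com/', callback='parse',
                                meta=None, method=None, body=None, headers=None,
                                cookies=None)
    # list the spider's start requests without running the reactor
    print(get_start_requests('/path/to/scrapy/project', 'quotes'))
    # run a single step and print the items, follow-up requests and response
    print(get_follow_requests_and_items('/path/to/scrapy/project', 'quotes', demo_args))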