import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError, TooManyRedirects
from raccoon_src.utils.web_server_validator import WebServerValidator
from raccoon_src.lib.storage_explorer import StorageExplorer
from raccoon_src.utils.request_handler import RequestHandler
from raccoon_src.utils.help_utils import HelpUtilities
from raccoon_src.utils.coloring import COLOR, COLORED_COMBOS
from raccoon_src.utils.exceptions import WebAppScannerException, WebServerValidatorException
from raccoon_src.utils.logger import Logger


class WebApplicationScanner:
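    """Collects application-level data from the target's web server: server headers,
    cookie flags, CMS, robots.txt/sitemap.xml, HTML forms, fuzzable URLs and emails."""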

    def __init__(self, host):
        self.host = host
        self.request_handler = RequestHandler()
        self.web_server_validator = WebServerValidator()
        self.headers = None
        self.robots = None
        self.forms = None
        self.fuzzable_urls = set()
        self.emails = set()
        log_file = HelpUtilities.get_output_path("{}/web_scan.txt".format(self.host.target))
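        # Derive the per-target output directory from the log file's path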
        self.target_dir = "/".join(log_file.split("/")[:-1])
        self.logger = Logger(log_file)
        self.storage_explorer = StorageExplorer(host, self.logger)

    def _detect_cms(self, tries=0):
        """
        Detect CMS using whatcms.org.
        Includes a retry mechanism, since whatcms.org occasionally returns false negatives
        :param tries: Number of detection attempts made so far
        """
        # WhatCMS is behind CloudFlare, which detects and blocks proxied/Tor traffic, hence a plain request
        page = requests.get(url="https://whatcms.org/?s={}".format(self.host.target))
        soup = BeautifulSoup(page.text, "lxml")
        found = soup.select(".panel.panel-success")
        if found:
            try:
                # Guard against anchors without an href attribute, which would raise a TypeError
                cms = [a for a in soup.select("a") if a.get("href") and "/c/" in a.get("href")][0]
                self.logger.info("{} CMS detected: target is using {}{}{}".format(
                    COLORED_COMBOS.GOOD, COLOR.GREEN, cms.get("title"), COLOR.RESET))
                return
            except IndexError:
                pass
        # Retry on a false negative or an unexpected page structure, up to 5 attempts in total
        if tries < 4:
            self._detect_cms(tries=tries + 1)

    def _cookie_info(self, jar):
        # Report cookies scoped to the target that are missing the secure and/or HttpOnly flags
        for cookie in jar:
            key = cookie.name
            domain = cookie.domain
            secure = cookie.secure
            http_only = cookie.has_nonstandard_attr("HttpOnly")
            try:
                if domain in self.host.target or self.host.target in domain:
                    if not secure or not http_only:
                        current = "%s Cookie: {%s} -" % (COLORED_COMBOS.GOOD, key)
                        if not secure and not http_only:
                            current += " both secure and HttpOnly flags are not set"
                        elif not secure:
                            current += " secure flag not set"
                        else:
                            current += " HttpOnly flag not set"
                        self.logger.info(current)

            except TypeError:
                # Cookie domain may be None
                continue

    def _server_info(self):
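        # Header lookups are case-insensitive in requests, so "server" also matches "Server"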
        if self.headers.get("server"):
            self.logger.info("{} Web server detected: {}{}{}".format(
                COLORED_COMBOS.GOOD, COLOR.GREEN, self.headers.get("server"), COLOR.RESET))

    def _x_powered_by(self):
        if self.headers.get("X-Powered-By"):
            self.logger.info("{} X-Powered-By header detected: {}{}{}".format(
                COLORED_COMBOS.GOOD, COLOR.GREEN, self.headers.get("X-Powered-By"), COLOR.RESET))

    def _anti_clickjacking(self):
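        # A missing X-Frame-Options header means browsers may allow the page to be framed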
        if not self.headers.get("X-Frame-Options"):
            self.logger.info(
                "{} X-Frame-Options header not detected - target might be vulnerable to clickjacking".format(
                    COLORED_COMBOS.GOOD)
            )

    def _xss_protection(self):
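        # An enabled browser XSS filter works against the scan's operator, hence the BAD indicator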
        xss_header = self.headers.get("X-XSS-PROTECTION")
        if xss_header and "1" in xss_header:
            self.logger.info("{} Found X-XSS-PROTECTION header".format(COLORED_COMBOS.BAD))

    def _cors_wildcard(self):
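        # A wildcard Access-Control-Allow-Origin lets any origin read responses cross-origin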
        if self.headers.get("Access-Control-Allow-Origin") == "*":
            self.logger.info("{} CORS wildcard detected".format(COLORED_COMBOS.GOOD))

    def _fetch_and_save_file(self, filename):
        # robots.txt and sitemap.xml share the same fetch-and-save logic.
        # The "<!DOCTYPE html>" check filters out catch-all HTML pages returned with a non-404 status
        res = self.request_handler.send(
            "GET",
            url="{}://{}:{}/{}".format(
                self.host.protocol,
                self.host.target,
                self.host.port,
                filename
            )
        )
        if res.status_code != 404 and res.text and "<!DOCTYPE html>" not in res.text:
            self.logger.info("{} Found {}".format(COLORED_COMBOS.GOOD, filename))
            with open("{}/{}".format(self.target_dir, filename), "w") as file:
                file.write(res.text)

    def _robots(self):
        self._fetch_and_save_file("robots.txt")

    def _sitemap(self):
        self._fetch_and_save_file("sitemap.xml")

    def _analyze_hrefs(self, href):
        # A link is considered fuzzable if it carries query parameters and points
        # at the target, either by domain or as a relative path
        if "?" in href and "=" in href and not href.startswith("mailto:"):
            if (self.host.naked and self.host.naked in href) or self.host.target in href or href.startswith("/"):
                self.fuzzable_urls.add(href)
        elif href.startswith("mailto:"):
            self._add_to_emails(href)

    def _log_fuzzable_urls(self):
        base_target = "{}://{}:{}".format(self.host.protocol, self.host.target, self.host.port)
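        # Relative URLs are logged with the base target prepended for readability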
        for url in self.fuzzable_urls:
            if url.startswith("/"):
                self.logger.debug("\t{}{}".format(base_target, url))
            else:
                self.logger.debug("\t{}".format(url))

    def _log_emails(self):
        for email in self.emails:
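            # Strip the leading "mailto:" scheme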
            self.logger.debug("\t{}".format(email[7:]))

    def _find_urls(self, soup):
        urls = soup.select("a")
        if urls:
            for url in urls:
                href = url.get("href")
                if href:
                    self._analyze_hrefs(href)

            if self.fuzzable_urls:
                self.logger.info("{} {} fuzzable URLs discovered".format(
                    COLORED_COMBOS.NOTIFY, len(self.fuzzable_urls)))
                self._log_fuzzable_urls()

            if self.emails:
                self.logger.info("{} {} email addresses discovered".format(
                    COLORED_COMBOS.NOTIFY, len(self.emails)))
                self._log_emails()

    def _find_forms(self, soup):
        # TODO: Analyze interesting input names/ids/params
        self.forms = soup.select("form")
        if self.forms:
            self.logger.info("{} {} HTML forms discovered".format(COLORED_COMBOS.NOTIFY, len(self.forms)))
            for form in self.forms:
                form_action = form.get("action")
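                # Skip forms whose action is the current page ("#")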
                if form_action == "#":
                    continue
                form_id = form.get("id")
                form_class = form.get("class")
                form_method = form.get("method")
                self.logger.debug("\tForm details: ID: {}, Class: {}, Method: {}, action: {}".format(
                    form_id, form_class, form_method, form_action
                ))

    def _add_to_emails(self, href):
        self.emails.add(href)

    async def get_web_application_info(self):
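        # Declared async so run_scan can await it; the underlying requests session itself is synchronous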
        session = self.request_handler.get_new_session()
        try:
            with session:
                # Test if target is serving HTTP requests
                response = session.get(
                    timeout=20,
                    url="{}://{}:{}".format(
                        self.host.protocol,
                        self.host.target,
                        self.host.port
                    )
                )
                self.headers = response.headers
                self._detect_cms()
                self._robots()
                self._sitemap()
                self._server_info()
                self._x_powered_by()
                self._cors_wildcard()
                self._xss_protection()
                self._anti_clickjacking()
                self._cookie_info(session.cookies)

                soup = BeautifulSoup(response.text, "lxml")
                self._find_urls(soup)
                self._find_forms(soup)
                self.storage_explorer.run(soup)

        except (ConnectionError, TooManyRedirects) as e:
            raise WebAppScannerException("Couldn't get response from server.\n"
                                         "Caused due to exception: {}".format(str(e)))

    async def run_scan(self):
        self.logger.info("{} Trying to collect {} web application data".format(COLORED_COMBOS.INFO, self.host))
        try:
            self.web_server_validator.validate_target_webserver(self.host)
            await self.get_web_application_info()
        except WebServerValidatorException:
            self.logger.info(
                "{} Target does not seem to have an active web server on port {}. "
                "No web application data will be gathered.".format(COLORED_COMBOS.NOTIFY, self.host.port))
            return