# coding:utf-8
# author: al0ne
# https://github.com/al0ne

import asyncio
import concurrent.futures
import glob
import itertools
import logging
import platform
import random
import re
import time

import aiohttp
import chardet

from lib.Requests import Requests
from lib.cli_output import console
from lib.random_header import get_ua
from lib.settings import *
from lib.sqldb import Sqldb
from lib.verify import verify_ext

# uvloop is not available on Windows; use a higher concurrency limit where it is
if platform.system() != 'Windows':
    import uvloop
    
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
    LIMIT = 800
else:
    LIMIT = 200


class Cartesian(object):
    def __init__(self):
        self._data_list = []
    
    # Add a list of values used to build the Cartesian product
    def add_data(self, data=None):
        # Avoid the shared mutable default argument
        self._data_list.append(data if data is not None else [])
    
    # Compute the Cartesian product and join each combination into a single path
    def build(self):
        return [''.join(item) for item in itertools.product(*self._data_list)]
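
# Illustrative sketch (not executed at runtime): Cartesian simply concatenates every
# combination of the lists passed to add_data(), e.g.
#     c = Cartesian()
#     c.add_data(['/backup', '/www'])
#     c.add_data(['.zip', '.rar'])
#     c.build()  # -> ['/backup.zip', '/backup.rar', '/www.zip', '/www.rar']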


class DirScan:
    def __init__(self, dbname, apps, host, title):
        self.dbname = dbname
        self.apps = apps
        self.title = title
        self.headers = get_ua()
        self.outjson = []
        self.wordlist = []
        self.host = host
        self.urls = self.get_urls(self.host)
        self.req = Requests()
        
        # Request URLs in random order to make the scan harder to trace
        random.shuffle(self.urls)
    
    def get_urls(self, domain):
        wordlist = []
        domain = domain.replace('http://', '').replace('https://', '').rstrip('/')
        ext = verify_ext(self.apps)
        ext = list(map(lambda x: '.' + x, ext))
        path = []
        for txt in glob.glob(r'data/path/*.txt'):
            with open(txt, 'r', encoding='utf-8') as f:
                for i in f.readlines():
                    path.append(i.strip())
        domain2 = re.sub(r'\.', '_', domain)
        # str.strip('www.') strips characters, not a prefix; remove a leading 'www.' explicitly
        domain3 = re.sub(r'^www\.', '', domain)
        leaks = Cartesian()
        leaks.add_data([
            '/www', '/1', '/2016', '/2017', '/2018', '/2019', '/wwwroot', '/backup', '/index', '/web', '/test', '/tmp',
            '/default', '/temp', '/data', '/dump', '/database', '/web', '/ftp', '/sql', '/data', '/website', '/upload',
            '/bin', '/bbs', '/www1', '/www2', '/log', '/site', '/2', '/htdocs', '/w', '/back', '/admin', '/export',
            '/extra', '/file', '/qq', '/up', '/config', '/' + domain, '/userlist', '/dev', '/a', '/123', '/sysadmin',
            '/localhost', '/shop', '/sys', '/root', '/install', '/webserver', '/users', '/111', '/access', '/old', '/i',
            '/vip', '/index.php', '/global', '/key', '/webroot', '/out', '/server', '/db', '/备份', '/新建文件夹', '/网站',
            '/uc_server', '/beifen', '/joomla', '/login', '/crack', '/wangzhan', '/' + domain2, '/' + domain3, '/list'
        ])
        leaks.add_data([
            '.tar.gz', '.zip', '.rar', '.sql', '.7z', '.bak', '.tar', '.txt', '.tgz', '.swp', '~', '.old', '.tar.bz2',
            '.data', '.csv', '.log', '.tmp', '.gz', '.bak~', '.sh', '.rar', '.war', '.bk', '.tmp', '.arj', '.xz',
            '.bz2', '.apk'
        ])
        path.extend(leaks.build())
        index = Cartesian()
        index.add_data([
            '/1', '/l', '/info', '/index', '/admin', '/login', '/qq', '/q', '/search', '/install', '/default', '/cmd',
            '/upload', '/test', '/shell', '/p', '/a', '/userinfo', '/api', '/common', '/web', '/manage', '/loading',
            '/left', '/zzzz', '/welcome', '/ma', '/66', '/c', '/2', '/fuck', '/11', '/error', '/403', '/123', '/3',
            '/css', '/x', '/md5', '/xx', '/out', '/config', '/asd', '/result', '/conn', '/password', '/cmdshell', '/k',
            '/s', '/test1', '/up', '/xxxx', '/exp', '/shell1', '/shell2', '/i', '/aa', '/2011', '/2012', '/2013',
            '/2016', '/2017', '/2018', '/2019', '/dama', '/list', '/list2', '/caidao', '/anonymous', '/xianf'
        ])
        index.add_data(ext)
        path.extend(index.build())
        path.extend(wordlist)
        return list(set(path))
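
    # Illustrative output (assuming host 'www.baidu.com' and apps ['php']): the list mixes
    # backup-style guesses such as '/www.zip', '/www_baidu_com.rar' and '/baidu.com.tar.gz'
    # with page guesses such as '/admin.php' and '/login.php'; set() removes duplicates.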
    
    def _verify(self, url, code, contype, title, length, goto, text):
        # Filter out 404-style pages and other false positives
        try:
            result = True
            
            if code in BLOCK_CODE:
                result = False
            
            if contype in BLOCK_CONTYPE:
                result = False
            
            # Scanning too fast can trigger anti-bot or WAF pages; filter them by title
            if re.search(r'Error|antispam|IP限制|访问禁止|小伙伴出错了|文档已移动|活动暂时关闭|Object moved|网站防火墙|访问被拦截|系统发生错误|404', title):
                result = False
            
            # The title of a scanned URL should not contain a bare domain name
            if re.search(
                r'((?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+(?:biz|cc|club|cn|com|co|edu|fun|group|info|ink|kim|link|live|ltd|mobi|net|online|org|pro|pub|red|ren|shop|site|store|tech|top|tv|vip|wang|wiki|work|xin|xyz|me))',
                title):
                result = False
            
            # Pages blocked by a WAF
            if re.search(r'Blocked by[\w\s]+waf|^false$|^post2$', text):
                result = False
            
            # The title must differ from the site's own title
            if self.title == title:
                result = False
            
            if title == 'None' and code == 0 and contype == 'None':
                result = False
            
            if re.sub('http://', 'https://', url) == goto:
                result = False
            
            for i in PAGE_404:
                if i in text:
                    result = False
                    break
            
            # Some redirects carry keywords like 'error' or '404' in the Location header
            if re.search(r'forbidden|error|404', goto):
                result = False
            
            if code in (301, 302):
                result = False
            
            # Keep redirects that point to a directory path
            if re.search(r'http://.*/\w+/$', goto):
                result = True
            
            # Keep error pages that may leak information (database errors, directory listings)
            if re.search(r'系统发生错误|PHP Error|PHP Parse error|database error|Error Message|Index of|mysql error', text):
                result = True
            
            # A file-like URL (backup, archive, dump) answered with HTML content is a mismatch
            if re.search(
                r'\.bak$|\.zip$|\.rar$|\.7z$|\.old$|\.htaccess$|\.csv$|\.txt$|\.sql$|\.tar$|\.tar\.gz$|\.tgz$|\.log$|\.gz$|\.data$|\.bz2$|\.sh$|\w+~$|\.bzr|\.DS_Store|\.xz$|\.db$',
                url) and contype == 'html':
                result = False
            
            return result
        
        except Exception as e:
            logging.exception(e)
            return False
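
    # Illustrative trace (assuming 200 is not in BLOCK_CODE, 'zip' is not in BLOCK_CONTYPE
    # and no PAGE_404 marker matches an empty body): a call such as
    #     self._verify('/backup.zip', 200, 'zip', 'Index of /', 1024, 'test', '')
    # survives every filter above, while the same path answered with contype 'html'
    # is rejected by the extension/content-type mismatch check.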
    
    def save(self, urls):
        Sqldb(self.dbname).get_urls(urls)
    
    async def scan(self, host, url, session):
        try:
            async with session.get(
                host + url,
                headers=self.headers,
                allow_redirects=False,
            ) as resp:
                # Capture the redirect target, if any ('test' is a placeholder when absent)
                if resp.headers.get('Location'):
                    goto = resp.headers.get('Location')
                else:
                    goto = 'test'
                # Normalize the Content-Type, e.g. 'text/html; charset=utf-8' -> 'html'
                if resp.headers.get('Content-Type'):
                    contype = re.sub(r'\w+/', '', str(resp.headers.get('Content-Type')))
                    contype = re.sub(r';.*', '', contype)
                else:
                    contype = 'None'
                
                # Only read the body for HTML responses; other content types are skipped
                ishtml = False
                content = b''
                
                try:
                    if contype == 'html':
                        ishtml = True
                        content = await resp.content.read(20000)
                except aiohttp.client_exceptions.ClientPayloadError:
                    pass
                
                # Detect the HTML encoding and decode the body
                if ishtml:
                    try:
                        coding = chardet.detect(content).get('encoding')
                        if coding:
                            # If an encoding was detected, decode the body and extract the <title>
                            text = content.decode(coding)
                            title = re.search('(?<=<title>).*(?=</title>)', text)
                            # If no <title> is found, fall back to the first 35 characters of the body
                            if title:
                                title = title.group()
                            else:
                                title = text[:35]
                        else:
                            text = 'Other'
                            title = None
                    except Exception as e:
                        text = 'Other'
                        title = None
                        logging.exception(e)
                else:
                    text = 'Other'
                    title = None
                
                if title is None:
                    title = 'None'
                
                title = re.sub(r'\n|\t', '', title)
                
                # Response length: prefer the Content-Length header, else the bytes actually read
                rsp_len = resp.headers.get('Content-Length')
                if not rsp_len:
                    rsp_len = len(content)
                
                host2 = host.replace('http://', '').replace('https://', '').rstrip('/')
                
                if self._verify(url, resp.status, contype, title, rsp_len, goto, text):
                    console('URLS', host2, url + '\n')
                    data = {
                        host2: {
                            "rsp_code": resp.status,
                            "rsp_len": rsp_len,
                            "title": title,
                            "contype": contype,
                            "url": host + url
                        }
                    }
                    self.outjson.append(data)
        
        except (aiohttp.client_exceptions.ServerTimeoutError, ConnectionResetError,
                aiohttp.client_exceptions.ClientConnectorError, UnicodeDecodeError,
                aiohttp.client_exceptions.ClientOSError, aiohttp.client_exceptions.ServerDisconnectedError,
                concurrent.futures._base.TimeoutError, aiohttp.client_exceptions.ClientPayloadError):
            pass
        
        except Exception as e:
            logging.exception(e)
        
        return 'OK'
    
    async def run(self, host):
        tasks = []
        # aiohttp defaults to limit=100; enable_cleanup_closed=True avoids leaking closed SSL transports, and ttl_dns_cache keeps DNS lookups cached longer
        conn = aiohttp.TCPConnector(
            limit=LIMIT,
            enable_cleanup_closed=True,
            ttl_dns_cache=100,
            ssl=False,
        )
        timeout = aiohttp.ClientTimeout(total=60, connect=2)
        async with aiohttp.ClientSession(connector=conn, timeout=timeout) as session:
            for url in self.urls:
                task = asyncio.ensure_future(self.scan(host, url, session))
                tasks.append(task)
            # gather() only returns after every request has finished
            _ = await asyncio.gather(*tasks)
            # for i in asyncio.as_completed(tasks):  # yields results as they complete, like a thread pool
            #     answer = await i
    
    # Create the event loop, run the scan, then persist the results
    def pool(self):
        loop = asyncio.get_event_loop()
        future = asyncio.ensure_future(self.run(self.host))
        loop.run_until_complete(future)
        
        self.save(self.outjson)


if __name__ == "__main__":
    start_time = time.time()
    scan = DirScan('result', ['php'], 'http://127.0.0.1', '')
    print(len(scan.get_urls('www.baidu.com')))
    end_time = time.time()
    print('\nrunning {0:.3f} seconds...'.format(end_time - start_time))
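
    # Note: this __main__ block only times wordlist generation. A full scan would look
    # roughly like the following (hypothetical local target):
    #     scan = DirScan('result', ['php'], 'http://127.0.0.1', 'Example Title')
    #     scan.pool()  # runs the async scan and saves hits via Sqldb('result')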