# Python source code of dirsc

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from multiprocessing.dummy import Pool
from multiprocessing import TimeoutError
from lib.parse.args import parse_port_list
from lib.utils.brute import generate_bflist
from lib.utils.subtasks import generate_compbrute_subtask
from lib.utils.subtasks import generate_dictbrute_subtask
from lib.core.logger import LOG
from lib.core.exception import SwarmUseException
from lib.core.exception import SwarmParseException


def add_cli_args(cli_parser):
    """
    Register the command line options of the directory scan module.

    All options are added to a dedicated 'Directory Scan' argument group of
    cli_parser. Nothing is returned; cli_parser is modified in place.

    Args:
        cli_parser: parser object providing add_argument_group(), e.g. an
            argparse.ArgumentParser.
    """
    group=cli_parser.add_argument_group('Directory Scan',
        'These option can be used to customize swarm action of directory scan')
    port_help="Separated by comma if you need multiple ports"
    # (flag, keyword arguments) pairs, kept in the order they show up in the help output
    option_specs=(
        ('--dir-http-port',{'dest':'dir_http_port','metavar':'PORT','help':port_help}),
        ('--dir-https-port',{'dest':'dir_https_port','metavar':'PORT','help':port_help}),
        ('--dir-compbrute',{'dest':'dir_compbrute','action':'store_true',
            'help':'Use complete brute force without dictionary on target'}),
        ('--dir-charset',{'dest':'dir_charset','metavar':'SET',
            'help':'Charset used for complete brute foce'}),
        ('--dir-len',{'dest':'dir_len','metavar':'LEN',
            'help':'Length interval of directory name or file name'}),
        ('--dir-dict',{'dest':'dir_dict','metavar':'PATH',
            'help':'Path to dictionary used for directory scan'}),
        ('--dir-maxdepth',{'dest':'dir_maxdepth','metavar':'NUM','type':int,
            'help':'Max depth in directory and file scan'}),
        ('--dir-timeout',{'dest':'dir_timeout','metavar':'TIME','type':float,
            'help':'Timeout option for directory scan'}),
        ('--dir-not-exist',{'dest':'dir_not_exist','metavar':'FLAG',
            'help':"Separated by double comma if you need multiple flags"}),
        ('--dir-quick-scan',{'dest':'dir_quick_scan','action':'store_true',
            'help':'Use HEAD method instead of GET in scan'}),
    )
    for flag,options in option_specs:
        group.add_argument(flag,**options)

def parse_conf(args,conf_parser):
    """
    Copy the directory scan options from the configuration file into args.

    Every option is read from the 'Directory Scan' section and stored on args
    under the same name as the option.

    Args:
        args: object that receives the option values as attributes.
        conf_parser: configuration parser providing get(), getboolean(),
            getint() and getfloat().
    """
    section='Directory Scan'
    # (option name, reader) pairs; the reader decides the type of the stored value
    readers=(
        ('dir_http_port',conf_parser.get),
        ('dir_https_port',conf_parser.get),
        ('dir_compbrute',conf_parser.getboolean),
        ('dir_charset',conf_parser.get),
        ('dir_len',conf_parser.get),
        ('dir_dict',conf_parser.get),
        ('dir_maxdepth',conf_parser.getint),
        ('dir_timeout',conf_parser.getfloat),
        ('dir_not_exist',conf_parser.get),
        ('dir_quick_scan',conf_parser.getboolean),
    )
    for option,read in readers:
        setattr(args,option,read(section,option))

class Master(object):
    """
    Directories and files scan task distributor and task result handler.

    Every handled result is stored through args.coll.insert() as one document
    whose 'domain' is the scheme, host and port of the scanned target and whose
    'content' lists the paths found below it.
    For example:

    {'domain':'https://github.com:443','content':['/something/','/something/file']}

    (args.coll is presumably a MongoDB collection; only insert() and find()
    are used here.)

    Task Format:
        Task:
        target|dict|dict_path|start_line|scan_lines
        target|comp|charset|begin_str|end_str
        Result:
        URL;URL;URL
        no dir or file
    Example:
        Put task:
        https://github.com:443/|dict|./dictionary/dir.dict|1000|900
        https://github.com:443/|comp|ABCabc|A|cccc
        Get result:
        no dir or file
        https://github.com:443/something/;https://github.com:443/something/file
    """
    def __init__(self, args):
        """
        Validate the directory scan options and prepare the initial target list.

        Args:
            args: option object; reads dir_http_port, dir_https_port,
                target_list and dir_maxdepth (a dir_maxdepth of 0 is replaced
                by 9999).
        Raises:
            SwarmUseException: if no port is given, a port list is invalid or
                the max depth is negative.
        """
        super(Master, self).__init__()
        self._args = args

        # an unset port option may be '' or None, both mean "not specified"
        http_ports=self._args.dir_http_port
        https_ports=self._args.dir_https_port
        if not http_ports and not https_ports:
            raise SwarmUseException('at least one http or https port should be specified')

        http_portl=[]
        https_portl=[]
        # parse the port options, they are used to build the target URLs below
        try:
            if http_ports:
                http_portl=parse_port_list(http_ports)
            if https_ports:
                https_portl=parse_port_list(https_ports)
        except SwarmParseException as e:
            raise SwarmUseException('invalid http or https port argument in directory scan')

        # initial _url_list_orig.
        # it should prepare a list contains valid targets like:
        #
        # ['http://github.com:81','http://XX.com:80','https://github.com:9090']
        self._url_list_orig=[]
        for curhost in self._args.target_list:
            for curport in http_portl:
                self._url_list_orig.append('http://'+curhost+':'+str(curport))
            for curport in https_portl:
                self._url_list_orig.append('https://'+curhost+':'+str(curport))

        if self._args.dir_maxdepth<0:
            raise SwarmUseException('directory max depth can not be negative')
        # 0 is replaced by a large depth limit
        if self._args.dir_maxdepth==0:
            self._args.dir_maxdepth=9999

        # directories to scan in the next round, start with the root of every target
        self._url_list=[x+'/' for x in self._url_list_orig]
        # record current dir depth
        self._curdepth=0

    def generate_subtasks(self):
        """
        Decompose the scan of the directories found so far into subtasks.

        Each call handles one more directory level. The pending directory list
        is cleared afterwards and filled again by handle_result().

        Returns:
            The subtask list built by generate_compbrute_subtask() or
            generate_dictbrute_subtask(), or an empty list when the max depth
            is exceeded or there is no directory left to scan.
        Raises:
            SwarmUseException: if the brute force options are invalid, no
                dictionary is given or the dictionary can not be opened.
        """
        self._curdepth+=1
        # if it is out of range, end this task
        if self._curdepth>self._args.dir_maxdepth:
            return []
        # if there is no dir can be scan, end this task
        if not self._url_list:
            return []
        # begin decomposition
        if self._args.dir_compbrute:
            try:
                # generate list of subtasks
                subtasklist=generate_compbrute_subtask(self._url_list,self._args.dir_len,
                    self._args.dir_charset,self._args.task_granularity)
            except SwarmParseException as e:
                raise SwarmUseException('invalid directory '+str(e)+', or format error')
        else:
            # check the configured dictionary path, not the builtin 'dict' type
            if not self._args.dir_dict:
                raise SwarmUseException('directory dictionary need to be provided')
            try:
                # generate list of subtasks
                subtasklist=generate_dictbrute_subtask(self._url_list,self._args.dir_dict,
                    self._args.task_granularity)
            except IOError as e:
                raise SwarmUseException('can not open dictionary for directory scan, path:%s'
                    %(self._args.dir_dict))
        # clear url_list for next iteration
        self._url_list=[]
        return subtasklist

    def handle_result(self,result):
        """
        Record the result of one subtask.

        URLs ending with '/' are queued for the next scan round, and all URLs
        are stored through args.coll.insert(). The 'domain' key is taken from
        the first URL only.
        NOTE(review): this assumes all URLs of one result share the same
        scheme, host and port -- confirm against the subtask generators.

        Args:
            result: 'no dir or file', or URLs separated by ';'.
        """
        if result=='no dir or file':
            return
        # drop empty segments so that an empty result can not raise IndexError
        result_list=[x for x in result.split(';') if x]
        if not result_list:
            return
        # store directories into url_list for next scan
        for x in result_list:
            if x.endswith('/'):
                self._url_list.append(x)
        # store result into final result, key looks like 'http://host:port'
        key='/'.join(result_list[0].split('/')[:3])
        self._args.coll.insert({'domain':key,'content':[x[len(key):] for x in result_list]})


    def report(self):
        """
        Print everything stored for each original target to standard output.
        """
        for curdomain in self._url_list_orig:
            all_content=[]
            for cur in self._args.coll.find({'domain':curdomain}):
                all_content.extend(cur['content'])
            # single-argument print() behaves the same on Python 2 and Python 3
            print('============= scan result of '+curdomain+' ==============')
            print('\n'.join(all_content))
        print('=================================================================')

class Slave(object):
    """
    Directory and file scanner providing functions of complete brute force and
    dictionary brute force.

    Every candidate URL is built by appending a name to the task target, so the
    target is expected to end with '/'.

    Task Format:
        Task:
        target|dict|dict_path|start_line|scan_lines
        target|comp|charset|begin_str|end_str
        Result:
        URL;URL;URL
        no dir or file
    Example:
        Put task:
        https://github.com:443/|dict|./dictionary/dir.dict|1000|900
        https://github.com:443/|comp|ABCabc|A|cccc
        Get result:
        no dir or file
        https://github.com:443/something/;https://github.com:443/something/file

    """
    def __init__(self, args):
        """
        Args:
            args: option object; reads thread_num, dir_timeout, dir_not_exist
                (flags separated by ',,') and dir_quick_scan.
        """
        super(Slave, self).__init__()
        self._pool=Pool(args.thread_num)
        self._timeout=args.dir_timeout
        # an empty flag is contained in every response body and would mark every
        # target as not existing, so empty flags are dropped
        self._not_exist_flag=[x for x in (args.dir_not_exist or '').split(',,') if x]
        self._quick_mode=args.dir_quick_scan

    def do_task(self,task):
        """
        Run one task string (see class docstring) and return its result string.
        """
        task=task.split('|')
        if task[1]=='dict':
            return self.dict_brute(task[0],task[2],task[3],task[4])
        return self.complete_brute(task[0],task[2],task[3],task[4])

    def complete_brute(self,target,charset,begin_str,end_str):
        """
        Scan target plus every name returned by
        generate_bflist(charset,begin_str,end_str).

        Returns:
            Result string as described in the class docstring.
        """
        resultl=[self._pool.apply_async(self._scan_target,args=(target+cur,))
                 for cur in generate_bflist(charset,begin_str,end_str)]
        return self._get_result(resultl)

    def dict_brute(self,target,dict_path,start_line,lines):
        """
        Scan target plus each entry of a slice of the dictionary file.

        Args:
            target: base URL the dictionary entries are appended to.
            dict_path: path to the dictionary file, one entry per line.
            start_line: decimal string, first line of the slice.
            lines: decimal string, number of lines in the slice.
        Returns:
            Result string as described in the class docstring.
        """
        resultl=[]
        lines=int(lines,10)
        start_line=int(start_line,10)
        with open(dict_path) as fp:
            fcontent=fp.readlines()
        # NOTE(review): start_line looks 1-based; a start_line of 0 wraps around to
        # the last line of the file -- confirm against generate_dictbrute_subtask
        for cur_index in range(lines):
            try:
                entry=fcontent[start_line+cur_index-1].strip()
            except IndexError:
                # the slice runs past the end of the dictionary
                break
            # a blank line would scan the base directory itself, which would then
            # be reported and queued for scanning again
            if not entry:
                continue
            resultl.append(self._pool.apply_async(self._scan_target,args=(target+entry,)))
        return self._get_result(resultl)

    def _get_result(self,resultl):
        """
        Collect the results of the asynchronous scans.

        Args:
            resultl: list of async results, each yielding '' or a string
                ending with ';'.
        Returns:
            The found URLs separated by ';', or 'no dir or file' if nothing
            was found. Scans that do not finish within the timeout are skipped.
        """
        found=[]
        for cur in resultl:
            try:
                found.append(cur.get(timeout=self._timeout))
            except TimeoutError as e:
                continue
        result=''.join(found)
        if result=='':
            return 'no dir or file'
        # remove the ';' behind the last URL
        return result[:-1]

    def _scan_target(self,target):
        """
        Scan one URL with the configured mode.

        Returns:
            target+';' for a file, target+'/;' for a directory, '' otherwise.
        """
        LOG.debug('scan target: %s'%target)
        if self._quick_mode:
            return self._scan_target_quick(target)
        return self._scan_target_normal(target)

    def _scan_target_normal(self,target):
        """
        Check target with a HEAD request, then confirm it with a GET request
        whose body must not contain any of the 'not exist' flags.
        """
        try:
            # pass the timeout, otherwise a silent server blocks the worker forever
            r=requests.head(target,timeout=self._timeout)
            #it maybe a directory
            if self._check_eponymous_dir(r,target):
                return target+'/;'
            if not self._check_exist_code(r.status_code):
                return ''
            # check content
            r=requests.get(target,timeout=self._timeout)
            if not self._check_exist_code(r.status_code):
                return ''
            for cur in self._not_exist_flag:
                if cur in r.text:
                    return ''
            return target+';'
        except Exception as e:
            # best effort: a failed request means "not found"
            LOG.debug('scan target failed: %s (%r)'%(target,e))
            return ''

    def _scan_target_quick(self,target):
        """
        Check target with a single HEAD request.
        """
        try:
            # pass the timeout, otherwise a silent server blocks the worker forever
            r=requests.head(target,timeout=self._timeout)
            #it maybe a directory
            if self._check_eponymous_dir(r,target):
                return target+'/;'
            if self._check_exist_code(r.status_code):
                return target+';'
            return ''
        except Exception as e:
            # best effort: a failed request means "not found"
            LOG.debug('scan target failed: %s (%r)'%(target,e))
            return ''

    def _check_exist_code(self,code):
        """
        Return True if the status code is below 300 or is 401 or 403.
        """
        return code<300 or code in (401,403)

    def _check_eponymous_dir(self,r,target):
        """
        Args:
            r: response of request to URL 'target'
            target: URL of one file which wait to be check wether exist eponymous directory
        Returns:
            True if the response means there is a eponymous directory with target file.
            Otherwise return False.
        """
        if r.status_code!=301:
            return False
        # a 301 without Location header is not treated as a directory
        location=r.headers.get('Location')
        if location==target+'/':
            return True
        # the server may leave out the port in the redirect URL
        targetl=target.split('/')
        targetl[2]=targetl[2].split(':')[0]
        return location=='/'.join(targetl)+'/'